initial commit v1.0

This commit is contained in:
2014-11-28 18:05:12 +01:00
commit ecd3d5214d
21 changed files with 1709 additions and 0 deletions

BIN
R-Code/backup_cl.RData Normal file

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,46 @@
# Build a country code/name lookup table from the German Wikipedia
# ISO 3166-1 code list page.
require(stringr)
require(dplyr)
require(XML)
# Project working directory; all relative output paths below depend on it
setwd(dir="~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus")
clist_url <- "http://de.wikipedia.org/wiki/ISO-3166-1-Kodierliste"
# The first table on the page holds the code list
clist_tbl <- readHTMLTable(clist_url, encoding = "UTF-8")[[1]]
names(clist_tbl) <- c("name", "code", "code-3", "code-num", "tld", "ioc", "code-2", "un")
clist <- data.frame(clist_tbl, stringsAsFactors = FALSE)
# Keep only the two-letter code and the (German) country name
clist <- clist[c("code", "name")]
# Use the generic as.character() instead of calling the method
# as.character.factor() directly: dispatch picks the right method and it
# also works if the column is already character.
clist$code <- as.character(clist$code)
clist$name <- as.character(clist$name)
# Cleaning table from headlines, annotations and brackets:
# drop every row whose name contains "!!" (Wikipedia sort-key/headline rows).
# NOTE: the original removed rows one at a time inside a
# `for (i in 1:nrow(clist))` loop. Deleting a row shifts all following
# indices, so the row after each deletion was skipped, and `i` could run
# past the end of the shrunken table. A single vectorised filter avoids both.
clist <- clist[!str_detect(clist$name, "!!"), ]
# Strip bracketed footnotes "[...]", parenthesised remarks "(...)" and
# Wikipedia sort-key prefixes up to "!" from a string, then trim whitespace.
# str_replace only touches the first occurrence of each pattern, exactly as
# before. Works element-wise on character vectors.
cleanBrackets <- function(string) {
  cleaned <- str_replace(string, "\\[.*\\]", "")  # [footnote] annotations
  cleaned <- str_replace(cleaned, "\\(.*\\)", "") # (remarks)
  cleaned <- str_replace(cleaned, ".+!", "")      # "sortkey!name" prefixes
  str_trim(cleaned)
}
# cleanBrackets() is built from vectorised stringr functions, so entire
# columns can be cleaned in one call each; the original looped row by row
# and called rm() on the loop variables every iteration.
clist$name <- cleanBrackets(clist$name)
clist$code <- cleanBrackets(clist$code)
# Semicolon-separated export without header (read back by the scraper script)
write.table(clist, "newsfokus-countrylist.txt", sep=";", row.names=FALSE, col.names=FALSE)
# This exported country list is edited manually afterwards, see "newsfokus-countrylist-manualchanges.txt"

View File

@@ -0,0 +1,185 @@
# Scraper setup: HTML parsing (XML), HTTP (RCurl) and string helpers.
require(XML)
require(RCurl)
require(stringr)
# Project working directory; all relative paths below depend on it
setwd("~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus/")
# Import manually edited country list (code;name, no header)
cl <- read.csv("newsfokus-countrylist.txt", sep = ";", header = FALSE)
names(cl) <- c("code", "name")
# FUNCTIONS ---------------------------------------------------------------
# Helper functions (dlParsePage, getDays, getCode, ...) live in a separate file
source("newsfokus-functions.R")
# PREPARATIONS ------------------------------------------------------------
# Generate all month.year combinations since January 2000
# ("1.2000", "2.2000", ..., "12.2014"). Vectorised replacement for the
# original nested year/month loops that grew the vector with c();
# rep() reproduces the identical order (all 12 months of each year in turn).
tspans <- paste0(rep(1:12, times = length(2000:2014)), ".", rep(2000:2014, each = 12))
# # All combinations mmmYYYY (not used yet)
# years <- 2000:2014 # 15
# months <- month.abb
# no <- 0
# comb <- NULL
# for (m in 1:12) {
#   for (y in 1:15) {
#     no <- no + 1
#     comb[no] <- str_c(months[m],years[y])
#   }
# }
# rm(years, months, m, y, no)
# Remove future/incomplete months (the scrape ran in autumn 2014).
# NOTE: these must be character strings to match the character tspans; the
# original numeric vector c(9.2014, ...) only worked because %in% happened
# to coerce both sides to character.
removemonths <- c("9.2014", "10.2014", "11.2014", "12.2014")
tspans <- tspans[! tspans %in% removemonths]
rm(removemonths)
# Create a zero-initialised count column for every month.
# (The original wrapped tspans in sprintf("%s", ...), which is a no-op:
# tspans is already a character vector.)
cl[tspans] <- 0
# Copy data frame for headlines (same countries/months, but will hold text)
headlines <- cl
# Set curl handle for friendly scraping: identify ourselves with From and
# User-Agent headers so the site operator can see who is crawling.
handle <- getCurlHandle(httpheader = list(from = "max.mehl@uni.kn",
                                          'user-agent' = str_c(R.version$version.string)
                                          )
                        )
# SCRAPING ALL THE NEWS \o/ ----------------------------------------------
# The procedure is as follows:
# 1. Build the search URL per timespan and repeat for every existing result
#    page (usually more than 100)
#    - the search page shows an error message once no further results exist
# 2. Download the page and parse it
#    - skipped if the file is already cached locally, to save bandwidth
# 3. Search for occurrences of country names
# 3a. On a match, increment that country's counter in the data frame
#    - a country is only counted once per article
#      (several spellings can exist for a single country code)
stat_news_all <- 0  # total number of teasers analysed
stat_news_pos <- 0  # number of positive country matches
stat_pages <- 0     # number of result pages processed
# This loop does the scraping, searching and indexing of each month and country
# !!! Beware: It takes around 24 hours to finish for 1.2000 - 8.2014! Load backup_cl.RData (and optionally backup_headlines.RData) to skip this step
for (i in 3:ncol(cl)) {
  tspan <- names(cl)[i]
  # Create folder for downloaded HTML files
  dir.create(str_c("materials/",tspan), showWarnings = FALSE, recursive = TRUE)
  days <- getDays(i, cl)
  # Generate the month's base url (toDate capped at the month's last day)
  baseurl <- str_c("http://www.spiegel.de/suche/index.html?suchbegriff=+&quellenGroup=SPOX&suchbereich=kopftext&fromDate=1.", tspan, "&toDate=", days, ".", tspan, "&offsets=999999&pageNumber=")
  # In every loop we start with page 1 again
  page <- 1
  # Now expand the URL by the page number UNTIL there's an error page
  repeat {
    cat("\n--> Processing page", page, "of timespan", tspan, "\n")
    url <- str_c(baseurl, page)
    dest <- str_c("materials/",tspan,"/",tspan,"-",page,".html")
    url_parsed <- dlParsePage(url, dest)
    # "Ihre Suche ergab keinen Treffer" = "your search returned no results"
    status <- unlist(xpathSApply(url_parsed, "//h3[contains(text(), 'Ihre Suche ergab keinen Treffer')]"))
    if (! is.null(status)) {
      # If there's an error page, there're no further articles. Skip this loop and begin with next month
      cat("Letzte Seite erreicht:", url, "\n")
      break
    }
    # Page is valid, now split in single articles and search for countries in each title and teaser
    headline <- xpathSApply(url_parsed, "//div[@class='search-teaser']//span[@class='headline']", xmlValue)
    teaser <- xpathSApply(url_parsed, "//div[@class='search-teaser']/p", xmlValue)
    url_arts <- xpathSApply(url_parsed, "//div[@class='search-teaser']/a", xmlGetAttr, "href")
    url_short <- ""
    # Combine headline and teaser to make it easier to search
    teaser <- str_c(headline, teaser, sep=" ")
    if (length(teaser) == 0) {
      # No teasers although this was not the "no results" page: probably a
      # server error. Log it and continue with the next page.
      errormsg <- str_c("Probably 500 error at: ", tspan,"-",page)
      write(errormsg, "scraping-errors.log", append = TRUE)
      rm(errormsg)
    }
    else {
      # Analyse every single teaser/headline combination
      for (t in 1:length(teaser)) {
        yet <- "" # Did the country already appear in the article? Empty with each loop
        string <- teaser[t]
        for (c in 1:nrow(cl)) {
          name <- as.character(cl$name[c]) # Name of the country to detect in the teaser
          status <- str_detect(tolower(string), tolower(name)) # Does the country's name appear in the teaser?
          if (status) { # yes
            code <- getCode(c, cl$code)
            cat("The string contains news from:", code, "\n")
            # We only want to count a country once even if it appears multiple times in an article
            already <- str_detect(yet, code) # Did the country already appear?
            if (!already) { # no
              yet <- str_c(yet, code, sep=" ")
              cl[c , tspan] <- cl[c , tspan] + 1 # Count +1 to the number of appearances in the data frame
              # Save headlines + links to a different data frame
              url_short[t] <- str_extract(url_arts[t], ".+/")
              url_short[t] <- str_c(url_short[t], str_extract(url_arts[t], "a\\-\\d+\\.html"))
              new_headline_entry <- str_c(headline[t], " (", url_short[t], ")")
              if (headlines[c , tspan] == 0) {
                headlines[c , tspan] <- new_headline_entry
              }
              else {
                headlines[c , tspan] <- str_c(headlines[c , tspan], "\n", new_headline_entry)
              }
            }
            rm(code, already)
            stat_news_pos <- stat_news_pos + 1
          }
          rm(c, name, status)
        }
        rm(t, yet, string)
        stat_news_all <- stat_news_all +1
      }
    }
    # Go to the next page
    page <- page + 1
    stat_pages <- stat_pages + 1
  }
  # NOTE(review): new_headline_entry only exists if at least one country was
  # matched this month; rm() emits a warning otherwise (harmless but noisy).
  rm(i, tspan, days, baseurl, page, url, url_parsed, status, teaser, headline, dest, url_arts, url_short, new_headline_entry)
  # Backup all data after each month
  write.csv(cl, "backup_cl.csv")
  write.csv(headlines, "backup_headlines.csv")
  save(cl, file="backup_cl.RData")
  save(headlines, file="backup_headlines.RData")
}
# End of huge for-loop
# Final Backup
write.csv(cl, "backup_cl.csv")
write.csv(headlines, "backup_headlines.csv")
save(cl, file="backup_cl.RData")
save(headlines, file="backup_headlines.RData")

View File

@@ -0,0 +1,264 @@
require(plyr)
require(dplyr)
require(stringr)
# PREPARATIONS ------------------------------------------------------------
# Sanity check: total number of counted country mentions across all month
# columns (columns 1-2 are code/name). Vectorised replacement for the
# original O(rows x cols) scalar double loop.
no <- sum(vapply(cl[3:ncol(cl)], function(col) sum(as.numeric(col)), numeric(1)))
cat("Sum of country news entries:", no)
rm(no)
# Save old cl-dataframe for next steps
cl_bak <- cl
# Merge appearances for same country code (but different name):
# ddply sums every numeric (month) column per code and drops "name".
cl <- ddply(cl,"code",numcolwise(sum))
# Re-insert an empty "name" column directly after "code"
target <- which(names(cl) == 'code')[1]
cl <- cbind(cl[,1:target,drop=F], data.frame(name="PLACEHOLDER"), cl[,(target+1):length(cl),drop=F])
rm(target)
cl$name <- "bla"
# Now choose names for the according codes (first fits)
for (r in 1:nrow(cl)) {
  code <- as.character(cl$code[r])
  code <- namibiaBug(code) # Namibia's code "NA" parses as missing; map to the string "NA"
  r2 <- 1
  # Linear scan through the backup for the first row with the same code.
  # NOTE(review): assumes every code in cl also exists in cl_bak - otherwise
  # this repeat loop would index past the end of cl_bak.
  repeat {
    code2 <- as.character(cl_bak$code[r2])
    code2 <- namibiaBug(code2)
    if (code2 == code) {
      name <- as.character(cl_bak$name[r2])
      cl$name[r] <- name
      break
    }
    else {
      r2 <- r2 + 1
    }
  }
  rm(r, r2, code, code2, name)
}
# Summarize all counts for each country: keep code/name and add an "overall"
# column holding the row sum over every month column. rowSums over the
# numeric-coerced month columns replaces the original nested
# per-row/per-column accumulation loops.
cl_stats <- cl[,1:2]
cl_stats["overall"] <- rowSums(as.data.frame(lapply(cl[3:ncol(cl)], as.numeric)))
# # If we would want the overall-counter in cl
# cl["overall"] <- 0
# cl$overall <- cl_stats$overall
# # Get all names for one country code (if there are multiple)
# for (r in 1:nrow(cl)) {
#   name <- NULL
#   no <- 0
#   code <- as.character(cl$code[r])
#   code <- namibiaBug(code)
#   r2 <- 1
#   repeat {
#     if (r2 > nrow(cl_bak)) {
#       break # only end if no next row
#     }
#     code2 <- as.character(cl_bak$code[r2])
#     code2 <- namibiaBug(code2)
#     if (code2 == code) {
#       no <- no + 1
#       name[no] <- as.character(cl_bak$name[r2])
#       r2 <- r2 + 1
#     }
#     else {
#       r2 <- r2 + 1
#     }
#   }
#   if (length(name) > 1) {
#     cat("For",code,"there are", no, "names:", name, "\n")
#   }
#   rm(r, name, no, code, r2, code2)
# }
# Old dataframe not needed anymore
rm(cl_bak)
# Calculate the total and average news entries for each year
years <- 2000:2014
# Search pattern per year: matches month-column names "<1-2 digit month>.<year>"
year_str <- sprintf("^\\d{1,2}\\.%s", years)
for (r in 1:nrow(cl)) {
  total <- 0
  average <- 0
  for (y in 1:length(years)) {
    months <- 0 # number of month columns found for this year (8 for 2014)
    for (c in 1:ncol(cl)) {
      # code/name columns never match the pattern, so they are skipped
      if (str_detect(names(cl)[c], year_str[y])) {
        total <- total + as.numeric(cl[r,c])
        months <- months + 1
      }
    }
    # Write "<year>-total" and "<year>-averg" columns into cl_stats
    colnametotal <- str_c(years[y],"-total")
    colnameaverg <- str_c(years[y],"-averg")
    average <- round(total / months, 4)
    cl_stats[r, colnametotal] <- total
    cl_stats[r, colnameaverg] <- average
    # Reset accumulators for the next year
    total <- 0
    average <- 0
  }
  rm(r, total, average, y, months, c, colnametotal, colnameaverg)
}
rm(years, year_str)
# IDENTIFY SURPRISING NEWSFOCUS -------------------------------------------
# Method 1: country was mentioned 3x more often than in the previous month,
# and more than 50 times in that month
no <- 0
for (c in 4:ncol(cl)) { # starting 1 month later (column c-1 must be a month)
  for (r in 1:nrow(cl)) {
    month <- names(cl)[c]
    # Conditions to fulfill
    status1 <- cl[r,c] > 3 * cl[r,c-1]
    status2 <- cl[r,c] > 50
    if (status1 && status2) {
      no <- no + 1
      cat("[",no,"] ",as.character(cl$code[r]),": 3x m-1 && >50 in: ", month,"\n", sep = "")
    }
  }
  rm(r,c,month,status1,status2)
}
rm(no)
# Method 2: country was mentioned 3x more often in one month than its yearly
# average (read from the "<year>-averg" columns computed into cl_stats above)
no <- 0
for (c in 3:ncol(cl)) {
  for (r in 1:nrow(cl)) {
    month <- names(cl)[c]
    year <- str_extract(month, "\\d{4}")
    averg <- str_c(year,"-averg")
    averg <- cl_stats[r,averg]
    # Conditions to fulfill
    status1 <- cl[r,c] > 3 * averg
    status2 <- cl[r,c] > 50
    if (status1 && status2) {
      no <- no + 1
      cat("[",no,"] ",as.character(cl$code[r]),": 3x year average && >50 ", month,"\n", sep = "") }
  }
  rm(r,c,month,year,averg,status1,status2)
}
rm(no)
# Final method: country mentioned more often in one month than in the three
# previous months combined (and more than 50 times)
cl_supfoc_mon <- data.frame(code=NA,name=NA)
cl_supfoc_mon[sprintf("%s", tspans)] <- 0
no <- 0
for (c in 6:ncol(cl)) { # columns 3-5 are the first three months (no predecessors)
  for (r in 1:nrow(cl)) {
    month <- names(cl)[c]
    code <- as.character(cl$code[r])
    name <- as.character(cl$name[r])
    # Conditions to fulfill
    status1 <- cl[r,c] > cl[r,c-1]+cl[r,c-2]+cl[r,c-3]
    status2 <- cl[r,c] > 50
    if (status1 && status2) {
      no <- no + 1
      # One result row per hit; duplicates per country are merged later
      cl_supfoc_mon[no, "code"] <- code
      cl_supfoc_mon$name[no] <- name
      #if (is.null(cl_supfoc_mon[no, month])) { cl_supfoc_mon[no, month] <- 0}
      #cl_supfoc_mon[no, month] <- cl_supfoc_mon[no, month] + 1
      cl_supfoc_mon[no, month] <- 1
      cat("[",no,"] ",as.character(cl$code[r]),": >m-(1:3) && >50 ", month,"\n", sep = "")
    }
  }
  rm(r,c,month,status1,status2)
}
rm(no, code, name)
# Clean cl_supfoc_mon: Replace NAs by 0, and sum up multiple appeared countries
cl_supfoc_mon[is.na(cl_supfoc_mon)] <- 0
cl_supfoc_mon <- ddply(cl_supfoc_mon,c("code", "name"),numcolwise(sum))
# Delete all month-columns with 0 surprising events
cl_supfoc_only_mon <- removeZeroMonths(cl_supfoc_mon, 3, ncol(cl_supfoc_mon))
# Total surprising newsfocuses per country: rowSums over the month columns
# replaces the original per-row accumulation loop.
cl_supfoc_total <- data.frame(
  code = as.character(cl_supfoc_mon$code),
  name = as.character(cl_supfoc_mon$name),
  total = rowSums(as.data.frame(lapply(cl_supfoc_mon[3:ncol(cl_supfoc_mon)], as.numeric))),
  stringsAsFactors = FALSE
)
# Total highlights per month + turn around table for graphs
cl_supfoc_mon["highlight"] <- 1
cl_supfoc_turn_mon <- ddply(cl_supfoc_mon,"highlight", numcolwise(sum))
cl_supfoc_mon$highlight <- NULL
cl_supfoc_turn_mon$highlight <- NULL
cl_supfoc_turn_mon <- data.frame(month = names(cl_supfoc_turn_mon), highs = as.numeric(cl_supfoc_turn_mon[1,]))
# Convert the "m.yyyy" month labels to proper Dates anchored mid-month.
# (The original comment said "%d.%y"; the format actually used is
# "%d.%m.%Y".) str_c is vectorised, so the original building loop is
# unnecessary.
cl_supfoc_turn_mon$month <- as.Date(str_c("15.", tspans), format = "%d.%m.%Y")
# Delete all month-rows with 0 surprising events
cl_supfoc_turn_only_mon <- cl_supfoc_turn_mon[!cl_supfoc_turn_mon$highs == 0,]
rownames(cl_supfoc_turn_only_mon) <- NULL
# # Replace 0s by NAs
# for (r in 1:180) {
#   if (! is.na(cl_total2$highs[r])) {
#     if (cl_total2$highs[r] == 0) {
#       cl_total2$highs[r] <- NA
#     }
#   }
# }

View File

@@ -0,0 +1,80 @@
require(rworldmap)
require(ggplot2)
# Minimal rworldmap demo, kept as a reference for the real plots below
theCountries <- c("DE", "US", "BR")
# These are the ISO3 names of the countries you'd like to plot in red
# NOTE(review): the codes used here are actually ISO2 two-letter codes,
# matching joinCode = "ISO2" below - the ISO3 comment looks wrong.
malDF <- data.frame(country = c("DE", "US", "BR", "ZA"), malaria = c(2000, 2001, 2002, 2002), news = c(2, 3, 0, 1))
# malDF is a data.frame with the ISO3 country names plus a variable to
# merge to the map data
malMap <- joinCountryData2Map(malDF, joinCode = "ISO2", nameJoinColumn = "country")
# This will join your malDF data.frame to the country map data
mapCountryData(malMap, nameColumnToPlot="malaria", catMethod = "categorical", missingCountryCol = gray(.8))
# And this will plot it, with the trick that the color palette's first
# color is red
# Absolute Frequ Newsfocus Map --------------------------------------------
# # Absolute frequencies of the highlights drawn as bubbles (unused variant)
# malMap <- joinCountryData2Map(cl_supfoc_total, joinCode = "ISO2", nameJoinColumn = "code")
# mapBubbles( dF=malMap, nameZSize="total",nameZColour="GEO3major",
#             colourPalette=c("black", "yellow", "blue", "orange", "red", "white", "green"),
#             oceanCol="lightblue",
#             landCol="wheat",
#             fill=TRUE,
#             symbolSize=0.5,
#             pch=21)
# Absolute news focus frequencies, world map:
absMap <- joinCountryData2Map(cl_supfoc_total, joinCode = "ISO2", nameJoinColumn = "code", verbose=TRUE)
mapCountryData(absMap, nameColumnToPlot="total", catMethod="fixedWidth",
               numCats=5,
               mapTitle="Anzahl Überraschungsfokusse weltweit",
               oceanCol="lightblue",
               missingCountryCol=gray(.9)
)
# Absolute news focus frequencies, North Africa / Asia map section
# (xlim/ylim crop the world map to roughly 10-140 E, 30-70 N):
mapCountryData(absMap, nameColumnToPlot="total", catMethod="fixedWidth",
               numCats=5,
               mapTitle="Anzahl Überraschungsfokusse Nordafrika und Asien",
               oceanCol="lightblue",
               missingCountryCol=gray(.9),
               xlim=c(10,140),
               ylim=c(30,70)
)
# Development Newsfocus over time -----------------------------------------
# Development of the surprising media focuses over the years: one bar per
# month plus a loess smoother.
cc <- ggplot(cl_supfoc_turn_mon, aes(month,highs))
# NOTE(review): geom_histogram(stat="identity") draws pre-counted bars; newer
# ggplot2 versions prefer geom_bar(stat="identity")/geom_col() - confirm
# before upgrading ggplot2.
cc <- cc + geom_histogram(fill="steelblue", stat="identity")
cc <- cc + stat_smooth(size=1,colour="red",method="loess", se=FALSE)
cc <- cc + ggtitle("Zeitliche Entwicklung von plötzlichen Medienfokussen") + xlab("Einzelne Monate") + ylab("Plötzliche Medienfokusse")
cc
# Yearly average news counts for a single country as a line chart.
# Helper to avoid the duplicated Syria/Israel plotting code; `label` is the
# (German) country name used in the chart title. Called at top level, the
# returned ggplot object auto-prints, exactly like the original code.
plotCountryAverage <- function(countrycode, label, yearspan = 2000:2014) {
  avergdf <- getAverages(df = cl_stats, codecol = "code", code = countrycode, yearspan = yearspan)
  ggplot(data = avergdf, aes(x = year, y = averg)) +
    geom_line() +
    ggtitle(paste0("Durchschnittliche Nachrichten pro Jahr über ", label)) +
    xlab("Jahre") +
    ylab("Durchschnittliche Nachrichten")
}
# Example 1: yearly average number of news items about Syria, 2000-2014
plotCountryAverage("SY", "Syrien")
# Example 2: yearly average number of news items about Israel, 2000-2014
plotCountryAverage("IL", "Israel")

View File

@@ -0,0 +1,38 @@
"Nordkorea";"KP"
"Südkorea";"KR"
"Zaire";"CD"
"Großbritannien";"GB"
"England";"GB"
"Russland";"RU"
"Palästina";"PS"
"Gazastreifen";"PS"
"Burma";"MM"
"Moldau";"MD"
"Laos";"LA"
"Kosovo";"RS"
"Kongo";"CD"
"Kongo-Brazzaville";"CG"
"Iran";"IR"
"Isle of Man";"IM"
"DDR";"DD"
"Elfenbeinküste";"CI"
"China";"CN"
"Weißrussland";"BY"
"Antarktis";"AQ"
"Krim";"UA"
"UdSSR";"RU"
"USA";"US"
"BRD";"DE"
"Nordirland";"IE"
"Nordzypern";"CY"
"Syrien";"SY"
"VA";"Vatikan"
"TZ";"Tansania"
"EA";"Ceuta"
"EA";"Melilla"
"ENTFERNT Neutrale Zone";"NT"
"ENTFERNT Europäische Gemeinschaft";"CE"
"ENTFERNT Europäische Union";"EU"
"ENTFERNT Burma";"BU"

View File

@@ -0,0 +1,295 @@
"AC";"Ascension"
"AD";"Andorra"
"AE";"Vereinigte Arabische Emirate"
"AF";"Afghanistan"
"AG";"Antigua und Barbuda"
"AI";"Anguilla"
"AL";"Albanien"
"AM";"Armenien"
"AN";"Niederländische Antillen"
"AO";"Angola"
"AQ";"Antarktis"
"AQ";"Antarktika"
"AR";"Argentinien"
"AS";"Amerikanisch-Samoa"
"AT";"Österreich"
"AU";"Australien"
"AW";"Aruba"
"AX";"Åland"
"AZ";"Aserbaidschan"
"BA";"Bosnien und Herzegowina"
"BB";"Barbados"
"BD";"Bangladesch"
"BE";"Belgien"
"BF";"Burkina Faso"
"BG";"Bulgarien"
"BH";"Bahrain"
"BI";"Burundi"
"BJ";"Benin"
"BL";"Saint-Barthélemy"
"BM";"Bermuda"
"BN";"Brunei Darussalam"
"BO";"Bolivien"
"BQ";"Bonaire, Sint Eustatius und Saba"
"BR";"Brasilien"
"BS";"Bahamas"
"BT";"Bhutan"
"BV";"Bouvetinsel"
"BW";"Botswana"
"BY";"Belarus"
"BY";"Weißrussland"
"BZ";"Belize"
"CA";"Kanada"
"CC";"Kokosinseln"
"CD";"Demokratische Republik Kongo"
"CD";"Kongo, Demokratische Republik"
"CD";"Kongo"
"CD";"Zaire"
"CF";"Zentralafrikanische Republik"
"CG";"Republik Kongo"
"CG";"Kongo-Brazzaville"
"CH";"Schweiz"
"CI";"Elfenbeinküste"
"CI";"Côte dIvoire"
"CK";"Cookinseln"
"CL";"Chile"
"CM";"Kamerun"
"CN";"China"
"CN";"China, Volksrepublik"
"CO";"Kolumbien"
"CP";"Clipperton"
"CR";"Costa Rica"
"CS";"Tschechoslowakei"
"CS";"Serbien und Montenegro"
"CU";"Kuba"
"CV";"Kap Verde"
"CW";"Curaçao"
"CX";"Weihnachtsinsel"
"CY";"Zypern"
"CY";"Nordzypern"
"CZ";"Tschechische Republik"
"DD";"DDR"
"DD";"Deutsche Demokratische Republik"
"DE";"Deutschland"
"DE";"BRD"
"DG";"Diego Garcia"
"DJ";"Dschibuti"
"DK";"Dänemark"
"DM";"Dominica"
"DO";"Dominikanische Republik"
"DZ";"Algerien"
"EA";"Ceuta, Melilla"
"EA";"Ceuta"
"EA";"Melilla"
"EC";"Ecuador"
"EE";"Estland"
"EG";"Ägypten"
"EH";"Westsahara"
"ER";"Eritrea"
"ES";"Spanien"
"ET";"Äthiopien"
"FI";"Finnland"
"FJ";"Fidschi"
"FK";"Falklandinseln"
"FM";"Mikronesien"
"FO";"Färöer"
"FR";"Frankreich"
"FX";"Frankreich, France métropolitaine"
"GA";"Gabun"
"GB";"Großbritannien"
"GB";"England"
"GB";"Vereinigtes Königreich Großbritannien und Nordirland"
"GD";"Grenada"
"GE";"Georgien"
"GF";"Französisch-Guayana"
"GG";"Guernsey"
"GH";"Ghana"
"GI";"Gibraltar"
"GL";"Grönland"
"GM";"Gambia"
"GN";"Guinea"
"GP";"Guadeloupe"
"GQ";"Äquatorialguinea"
"GR";"Griechenland"
"GS";"Südgeorgien und die Südlichen Sandwichinseln"
"GT";"Guatemala"
"GU";"Guam"
"GW";"Guinea-Bissau"
"GY";"Guyana"
"HK";"Hongkong"
"HM";"Heard und McDonaldinseln"
"HN";"Honduras"
"HR";"Kroatien"
"HT";"Haiti"
"HU";"Ungarn"
"IC";"Kanarische Inseln"
"ID";"Indonesien"
"IE";"Irland"
"IE";"Nordirland"
"IL";"Israel"
"IM";"Isle of Man"
"IM";"Insel Man"
"IN";"Indien"
"IO";"Britisches Territorium im Indischen Ozean"
"IQ";"Irak"
"IR";"Iran"
"IR";"Iran, Islamische Republik"
"IS";"Island"
"IT";"Italien"
"JE";"Jersey"
"JM";"Jamaika"
"JO";"Jordanien"
"JP";"Japan"
"KE";"Kenia"
"KG";"Kirgisistan"
"KH";"Kambodscha"
"KI";"Kiribati"
"KM";"Komoren"
"KN";"St. Kitts und Nevis"
"KP";"Nordkorea"
"KP";"Korea, Demokratische Volksrepublik"
"KR";"Südkorea"
"KR";"Korea, Republik"
"KW";"Kuwait"
"KY";"Kaimaninseln"
"KZ";"Kasachstan"
"LA";"Laos"
"LA";"Laos, Demokratische Volksrepublik"
"LB";"Libanon"
"LC";"St. Lucia"
"LI";"Liechtenstein"
"LK";"Sri Lanka"
"LR";"Liberia"
"LS";"Lesotho"
"LT";"Litauen"
"LU";"Luxemburg"
"LV";"Lettland"
"LY";"Libyen"
"MA";"Marokko"
"MC";"Monaco"
"MD";"Moldawien"
"ME";"Montenegro"
"MF";"Saint-Martin"
"MG";"Madagaskar"
"MH";"Marshallinseln"
"MK";"Mazedonien"
"ML";"Mali"
"MM";"Myanmar"
"MM";"Burma"
"MN";"Mongolei"
"MO";"Macao"
"MP";"Nördliche Marianen"
"MQ";"Martinique"
"MR";"Mauretanien"
"MS";"Montserrat"
"MT";"Malta"
"MU";"Mauritius"
"MV";"Malediven"
"MW";"Malawi"
"MD";"Moldau"
"MX";"Mexiko"
"MY";"Malaysia"
"MZ";"Mosambik"
"NA";"Namibia"
"NC";"Neukaledonien"
"NE";"Niger"
"NF";"Norfolkinsel"
"NG";"Nigeria"
"NI";"Nicaragua"
"NL";"Niederlande"
"NO";"Norwegen"
"NP";"Nepal"
"NR";"Nauru"
"NU";"Niue"
"NZ";"Neuseeland"
"OM";"Oman"
"PA";"Panama"
"PE";"Peru"
"PF";"Französisch-Polynesien"
"PG";"Papua-Neuguinea"
"PH";"Philippinen"
"PK";"Pakistan"
"PL";"Polen"
"PM";"Saint-Pierre und Miquelon"
"PN";"Pitcairninseln"
"PR";"Puerto Rico"
"PS";"Staat Palästina"
"PS";"Gazastreifen"
"PS";"Palästina"
"PT";"Portugal"
"PW";"Palau"
"PY";"Paraguay"
"QA";"Katar"
"RE";"Réunion"
"RO";"Rumänien"
"RS";"Serbien"
"RS";"Kosovo"
"RU";"Russland"
"RU";"Russische Föderation"
"RU";"UdSSR"
"RW";"Ruanda"
"SA";"Saudi-Arabien"
"SB";"Salomonen"
"SC";"Seychellen"
"SD";"Sudan"
"SE";"Schweden"
"SG";"Singapur"
"SH";"St. Helena"
"SI";"Slowenien"
"SJ";"Svalbard und Jan Mayen"
"SK";"Slowakei"
"SL";"Sierra Leone"
"SM";"San Marino"
"SN";"Senegal"
"SO";"Somalia"
"SR";"Suriname"
"SS";"Südsudan"
"ST";"São Tomé und Príncipe"
"SV";"El Salvador"
"SX";"Sint Maarten"
"SY";"Syrien"
"SY";"Syrien, Arabische Republik"
"SZ";"Swasiland"
"TA";"Tristan da Cunha"
"TC";"Turks- und Caicosinseln"
"TD";"Tschad"
"TF";"Französische Süd- und Antarktisgebiete"
"TG";"Togo"
"TH";"Thailand"
"TJ";"Tadschikistan"
"TK";"Tokelau"
"TL";"Osttimor"
"TM";"Turkmenistan"
"TN";"Tunesien"
"TO";"Tonga"
"TR";"Türkei"
"TT";"Trinidad und Tobago"
"TV";"Tuvalu"
"TW";"Republik China"
"TZ";"Tansania"
"TZ";"Tansania, Vereinigte Republik"
"UA";"Ukraine"
"UA";"Krim"
"UG";"Uganda"
"UM";"United States Minor Outlying Islands"
"US";"USA"
"US";"Vereinigte Staaten von Amerika"
"UY";"Uruguay"
"UZ";"Usbekistan"
"VA";"Vatikanstadt"
"VA";"Vatikan"
"VC";"St. Vincent und die Grenadinen"
"VE";"Venezuela"
"VG";"Britische Jungferninseln"
"VI";"Amerikanische Jungferninseln"
"VN";"Vietnam"
"VU";"Vanuatu"
"WF";"Wallis und Futuna"
"WS";"Samoa"
"YE";"Jemen"
"YT";"Mayotte"
"YU";"Jugoslawien"
"ZA";"Südafrika"
"ZM";"Sambia"
"ZR";"Zaire"
"ZW";"Simbabwe"

View File

@@ -0,0 +1,127 @@
require(stringr)
require(RCurl)
# Download a URL to `dest` (unless a cached copy already exists on disk) and
# return the parsed HTML document.
#
# url         - character, the page to fetch
# dest        - character, local cache file path
# curl_handle - RCurl handle used for the request. Defaults to the global
#               `handle` configured in the scraper script, so existing
#               two-argument calls behave exactly as before; the explicit
#               parameter removes the hidden global dependency.
dlParsePage <- function(url, dest, curl_handle = handle) {
  if (!file.exists(dest)) {
    url_curl <- getURL(url, curl = curl_handle)
    write(url_curl, dest)
  }
  # htmlParse() comes from the XML package, loaded by the calling script
  htmlParse(dest, encoding = "UTF-8")
}
# Calculate the number of days in a specific month.
# (Important for manipulating the search URL because otherwise there would be
# articles of the NEXT month in the search results if the time span is too
# large.)
#
# i  - column index into df
# df - data frame whose column i is named "m.yyyy" (e.g. "2.2000")
# Returns the number of days in that month as a numeric scalar.
getDays <- function(i, df) {
  cur_tspan <- names(df)[i]
  # Base-R sub() instead of str_extract: the original pattern "^\\d+\\>"
  # relied on the TRE-only word-boundary escape \> which is not portable
  # (stringi/ICU-based stringr versions reject it). Splitting on the dot
  # gives the same month/year values.
  cur_m <- as.numeric(sub("\\..*$", "", cur_tspan))
  cur_y <- as.numeric(sub("^.*\\.", "", cur_tspan))
  # First day of the following month (roll over December into January)
  if (cur_m == 12) {
    nex_y <- cur_y + 1
    nex_m <- 1
  }
  else {
    nex_y <- cur_y
    nex_m <- cur_m + 1
  }
  cur_date <- paste(cur_y, cur_m, "01", sep = "-")
  nex_date <- paste(nex_y, nex_m, "01", sep = "-")
  # Day difference between the two month starts = days in the current month
  as.numeric(difftime(as.Date(nex_date), as.Date(cur_date)))
}
# Return the country code at position i of the vector df as character.
# Works around the Namibia quirk: the ISO code "NA" is read back as a
# missing value, so a missing code is mapped to the literal string "NA".
getCode <- function(i, df) {
  code <- as.character(df[i])
  if (is.na(code)) {
    code <- "NA"
  }
  code
}
# Map a missing value to the literal string "NA" (Namibia's ISO code, which
# parsers turn into a real NA); any other scalar passes through unchanged.
namibiaBug <- function(x) {
  if (is.na(x)) {
    return("NA")
  }
  x
}
# Print all headline entries stored for country `descountry` in column
# `month` of `df` (typically the `headlines` data frame).
#
# df         - data frame with one row per country
# countrycol - name/index of the column holding country codes
# descountry - code of the desired country ("NA"-safe via namibiaBug)
# month      - name of the month column to read
# Prints the collected entries with cat() and invisibly returns them
# (the original returned cat()'s NULL; the invisible vector is strictly
# more useful and does not change the printed output).
getHeadlines <- function(df, countrycol, descountry, month) {
  # Loop-invariant: normalise the requested code once, not once per row
  descountry <- namibiaBug(descountry)
  no <- 0
  news <- NULL
  for (r in seq_len(nrow(df))) {
    curcountry <- namibiaBug(as.character(df[r,countrycol]))
    if (curcountry == descountry) {
      no <- no + 1
      news[no] <- as.character(df[r,month])
    }
  }
  cat(news)
  invisible(news)
}
# Remove month columns whose counts sum to zero (no surprising newsfocuses).
#
# df     - data frame with code/name columns followed by month columns
# mincol - index of the first month column to inspect
# maxcol - index of the last month column to inspect
# Returns df without the all-zero month columns. Column sums replace the
# original per-cell double loop.
removeZeroMonths <- function(df, mincol, maxcol) {
  totals <- vapply(mincol:maxcol,
                   function(ci) sum(as.numeric(df[[ci]])),
                   numeric(1))
  delmonth <- names(df)[mincol:maxcol][totals == 0]
  df[! names(df) %in% delmonth]
}
# Get the names of countries which produced surprising newsfocuses (count > 0)
# in a given month column of df.
#
# df    - data frame with a "name" column and month columns
# month - name of the month column to inspect
# Returns a character vector of country names, NULL if no country had a
# focus, or prints a message (returning NULL) if the column does not exist.
getFocusCountries <- function(df, month) {
  # Exact membership test. The original checked existence via
  # str_detect(month, names(df)[c]), i.e. substring *regex* matching, where
  # the "." in a column name matched any character - so "1.2000" also
  # "existed" inside "11.2000".
  if (! month %in% names(df)) {
    return(cat(month, "isn't a valid column in given dataframe"))
  }
  hits <- df[[month]] > 0
  if (!any(hits)) {
    # Match the original: with no hits the result vector stayed NULL
    return(NULL)
  }
  as.character(df$name[hits])
}
# Get the average development of news over a span of years for one country.
#
# df       - stats data frame with "<year>-averg" columns (e.g. cl_stats)
# codecol  - name of the column holding country codes
# code     - country code to look up ("NA"-safe via namibiaBug)
# yearspan - numeric vector of years to extract
# Returns data.frame(year, averg) with one row per year.
# NOTE(review): if `code` matches more than one row of df, `averg` grows
# beyond length(yearspan) and the final data.frame() call fails - codes are
# assumed unique, as they are in cl_stats.
getAverages <- function(df, codecol, code, yearspan) {
  # Loop-invariant: normalise the requested code once, not once per row
  code <- namibiaBug(code)
  no <- 0
  averg <- NULL
  for (r in seq_len(nrow(df))) {
    curcode <- namibiaBug(df[r,codecol])
    if (curcode == code) {
      for (y in yearspan) {
        no <- no + 1
        curcol <- str_c(y,"-averg")
        averg[no] <- as.numeric(df[r,curcol])
      }
    }
  }
  data.frame(year=yearspan, averg=averg)
}