initial commit v1.0

This commit is contained in:
2014-11-28 18:05:12 +01:00
commit ecd3d5214d
21 changed files with 1709 additions and 0 deletions

BIN
R-Code/backup_cl.RData Normal file

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,46 @@
# Build a country code/name lookup table from the German Wikipedia
# ISO 3166-1 code list page.
require(stringr)
require(dplyr)
require(XML)
# Project working directory; all relative output paths below depend on it
setwd(dir="~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus")
clist_url <- "http://de.wikipedia.org/wiki/ISO-3166-1-Kodierliste"
# The first table on the page holds the code list
clist_tbl <- readHTMLTable(clist_url, encoding = "UTF-8")[[1]]
names(clist_tbl) <- c("name", "code", "code-3", "code-num", "tld", "ioc", "code-2", "un")
clist <- data.frame(clist_tbl, stringsAsFactors = FALSE)
# Keep only the two-letter code and the (German) country name
clist <- clist[c("code", "name")]
# Use the generic as.character() instead of calling the method
# as.character.factor() directly: dispatch picks the right method and it
# also works if the column is already character.
clist$code <- as.character(clist$code)
clist$name <- as.character(clist$name)
# Cleaning table from headlines, annotations and brackets:
# drop every row whose name contains "!!" (Wikipedia sort-key/headline rows).
# NOTE: the original removed rows one at a time inside a
# `for (i in 1:nrow(clist))` loop. Deleting a row shifts all following
# indices, so the row after each deletion was skipped, and `i` could run
# past the end of the shrunken table. A single vectorised filter avoids both.
clist <- clist[!str_detect(clist$name, "!!"), ]
# Strip bracketed footnotes "[...]", parenthesised remarks "(...)" and
# Wikipedia sort-key prefixes up to "!" from a string, then trim whitespace.
# str_replace only touches the first occurrence of each pattern, exactly as
# before. Works element-wise on character vectors.
cleanBrackets <- function(string) {
  cleaned <- str_replace(string, "\\[.*\\]", "")  # [footnote] annotations
  cleaned <- str_replace(cleaned, "\\(.*\\)", "") # (remarks)
  cleaned <- str_replace(cleaned, ".+!", "")      # "sortkey!name" prefixes
  str_trim(cleaned)
}
# cleanBrackets() is built from vectorised stringr functions, so entire
# columns can be cleaned in one call each; the original looped row by row
# and called rm() on the loop variables every iteration.
clist$name <- cleanBrackets(clist$name)
clist$code <- cleanBrackets(clist$code)
# Semicolon-separated export without header (read back by the scraper script)
write.table(clist, "newsfokus-countrylist.txt", sep=";", row.names=FALSE, col.names=FALSE)
# This exported country list is edited manually afterwards, see "newsfokus-countrylist-manualchanges.txt"

View File

@@ -0,0 +1,185 @@
# Scraper setup: HTML parsing (XML), HTTP (RCurl) and string helpers.
require(XML)
require(RCurl)
require(stringr)
# Project working directory; all relative paths below depend on it
setwd("~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus/")
# Import manually edited country list (code;name, no header)
cl <- read.csv("newsfokus-countrylist.txt", sep = ";", header = FALSE)
names(cl) <- c("code", "name")
# FUNCTIONS ---------------------------------------------------------------
# Helper functions (dlParsePage, getDays, getCode, ...) live in a separate file
source("newsfokus-functions.R")
# PREPARATIONS ------------------------------------------------------------
# Generate all month.year combinations since January 2000
# ("1.2000", "2.2000", ..., "12.2014"). Vectorised replacement for the
# original nested year/month loops that grew the vector with c();
# rep() reproduces the identical order (all 12 months of each year in turn).
tspans <- paste0(rep(1:12, times = length(2000:2014)), ".", rep(2000:2014, each = 12))
# # All combinations mmmYYYY (not used yet)
# years <- 2000:2014 # 15
# months <- month.abb
# no <- 0
# comb <- NULL
# for (m in 1:12) {
#   for (y in 1:15) {
#     no <- no + 1
#     comb[no] <- str_c(months[m],years[y])
#   }
# }
# rm(years, months, m, y, no)
# Remove future/incomplete months (the scrape ran in autumn 2014).
# NOTE: these must be character strings to match the character tspans; the
# original numeric vector c(9.2014, ...) only worked because %in% happened
# to coerce both sides to character.
removemonths <- c("9.2014", "10.2014", "11.2014", "12.2014")
tspans <- tspans[! tspans %in% removemonths]
rm(removemonths)
# Create a zero-initialised count column for every month.
# (The original wrapped tspans in sprintf("%s", ...), which is a no-op:
# tspans is already a character vector.)
cl[tspans] <- 0
# Copy data frame for headlines (same countries/months, but will hold text)
headlines <- cl
# Set curl handle for friendly scraping: identify ourselves with From and
# User-Agent headers so the site operator can see who is crawling.
handle <- getCurlHandle(httpheader = list(from = "max.mehl@uni.kn",
                                          'user-agent' = str_c(R.version$version.string)
                                          )
                        )
# SCRAPING ALL THE NEWS \o/ ----------------------------------------------
# The procedure is as follows:
# 1. Build the search URL per timespan and repeat for every existing result
#    page (usually more than 100)
#    - the search page shows an error message once no further results exist
# 2. Download the page and parse it
#    - skipped if the file is already cached locally, to save bandwidth
# 3. Search for occurrences of country names
# 3a. On a match, increment that country's counter in the data frame
#    - a country is only counted once per article
#      (several spellings can exist for a single country code)
stat_news_all <- 0  # total number of teasers analysed
stat_news_pos <- 0  # number of positive country matches
stat_pages <- 0     # number of result pages processed
# This loop does the scraping, searching and indexing of each month and country
# !!! Beware: It takes around 24 hours to finish for 1.2000 - 8.2014! Load backup_cl.RData (and optionally backup_headlines.RData) to skip this step
for (i in 3:ncol(cl)) {
  tspan <- names(cl)[i]
  # Create folder for downloaded HTML files
  dir.create(str_c("materials/",tspan), showWarnings = FALSE, recursive = TRUE)
  days <- getDays(i, cl)
  # Generate the month's base url (toDate capped at the month's last day)
  baseurl <- str_c("http://www.spiegel.de/suche/index.html?suchbegriff=+&quellenGroup=SPOX&suchbereich=kopftext&fromDate=1.", tspan, "&toDate=", days, ".", tspan, "&offsets=999999&pageNumber=")
  # In every loop we start with page 1 again
  page <- 1
  # Now expand the URL by the page number UNTIL there's an error page
  repeat {
    cat("\n--> Processing page", page, "of timespan", tspan, "\n")
    url <- str_c(baseurl, page)
    dest <- str_c("materials/",tspan,"/",tspan,"-",page,".html")
    url_parsed <- dlParsePage(url, dest)
    # "Ihre Suche ergab keinen Treffer" = "your search returned no results"
    status <- unlist(xpathSApply(url_parsed, "//h3[contains(text(), 'Ihre Suche ergab keinen Treffer')]"))
    if (! is.null(status)) {
      # If there's an error page, there're no further articles. Skip this loop and begin with next month
      cat("Letzte Seite erreicht:", url, "\n")
      break
    }
    # Page is valid, now split in single articles and search for countries in each title and teaser
    headline <- xpathSApply(url_parsed, "//div[@class='search-teaser']//span[@class='headline']", xmlValue)
    teaser <- xpathSApply(url_parsed, "//div[@class='search-teaser']/p", xmlValue)
    url_arts <- xpathSApply(url_parsed, "//div[@class='search-teaser']/a", xmlGetAttr, "href")
    url_short <- ""
    # Combine headline and teaser to make it easier to search
    teaser <- str_c(headline, teaser, sep=" ")
    if (length(teaser) == 0) {
      # No teasers although this was not the "no results" page: probably a
      # server error. Log it and continue with the next page.
      errormsg <- str_c("Probably 500 error at: ", tspan,"-",page)
      write(errormsg, "scraping-errors.log", append = TRUE)
      rm(errormsg)
    }
    else {
      # Analyse every single teaser/headline combination
      for (t in 1:length(teaser)) {
        yet <- "" # Did the country already appear in the article? Empty with each loop
        string <- teaser[t]
        for (c in 1:nrow(cl)) {
          name <- as.character(cl$name[c]) # Name of the country to detect in the teaser
          status <- str_detect(tolower(string), tolower(name)) # Does the country's name appear in the teaser?
          if (status) { # yes
            code <- getCode(c, cl$code)
            cat("The string contains news from:", code, "\n")
            # We only want to count a country once even if it appears multiple times in an article
            already <- str_detect(yet, code) # Did the country already appear?
            if (!already) { # no
              yet <- str_c(yet, code, sep=" ")
              cl[c , tspan] <- cl[c , tspan] + 1 # Count +1 to the number of appearances in the data frame
              # Save headlines + links to a different data frame
              url_short[t] <- str_extract(url_arts[t], ".+/")
              url_short[t] <- str_c(url_short[t], str_extract(url_arts[t], "a\\-\\d+\\.html"))
              new_headline_entry <- str_c(headline[t], " (", url_short[t], ")")
              if (headlines[c , tspan] == 0) {
                headlines[c , tspan] <- new_headline_entry
              }
              else {
                headlines[c , tspan] <- str_c(headlines[c , tspan], "\n", new_headline_entry)
              }
            }
            rm(code, already)
            stat_news_pos <- stat_news_pos + 1
          }
          rm(c, name, status)
        }
        rm(t, yet, string)
        stat_news_all <- stat_news_all +1
      }
    }
    # Go to the next page
    page <- page + 1
    stat_pages <- stat_pages + 1
  }
  # NOTE(review): new_headline_entry only exists if at least one country was
  # matched this month; rm() emits a warning otherwise (harmless but noisy).
  rm(i, tspan, days, baseurl, page, url, url_parsed, status, teaser, headline, dest, url_arts, url_short, new_headline_entry)
  # Backup all data after each month
  write.csv(cl, "backup_cl.csv")
  write.csv(headlines, "backup_headlines.csv")
  save(cl, file="backup_cl.RData")
  save(headlines, file="backup_headlines.RData")
}
# End of huge for-loop
# Final Backup
write.csv(cl, "backup_cl.csv")
write.csv(headlines, "backup_headlines.csv")
save(cl, file="backup_cl.RData")
save(headlines, file="backup_headlines.RData")

View File

@@ -0,0 +1,264 @@
require(plyr)
require(dplyr)
require(stringr)
# PREPARATIONS ------------------------------------------------------------
# Sanity check: total number of counted country mentions across all month
# columns (columns 1-2 are code/name). Vectorised replacement for the
# original O(rows x cols) scalar double loop.
no <- sum(vapply(cl[3:ncol(cl)], function(col) sum(as.numeric(col)), numeric(1)))
cat("Sum of country news entries:", no)
rm(no)
# Save old cl-dataframe for next steps
cl_bak <- cl
# Merge appearances for same country code (but different name):
# ddply sums every numeric (month) column per code and drops "name".
cl <- ddply(cl,"code",numcolwise(sum))
# Re-insert an empty "name" column directly after "code"
target <- which(names(cl) == 'code')[1]
cl <- cbind(cl[,1:target,drop=F], data.frame(name="PLACEHOLDER"), cl[,(target+1):length(cl),drop=F])
rm(target)
cl$name <- "bla"
# Now choose names for the according codes (first fits)
for (r in 1:nrow(cl)) {
  code <- as.character(cl$code[r])
  code <- namibiaBug(code) # Namibia's code "NA" parses as missing; map to the string "NA"
  r2 <- 1
  # Linear scan through the backup for the first row with the same code.
  # NOTE(review): assumes every code in cl also exists in cl_bak - otherwise
  # this repeat loop would index past the end of cl_bak.
  repeat {
    code2 <- as.character(cl_bak$code[r2])
    code2 <- namibiaBug(code2)
    if (code2 == code) {
      name <- as.character(cl_bak$name[r2])
      cl$name[r] <- name
      break
    }
    else {
      r2 <- r2 + 1
    }
  }
  rm(r, r2, code, code2, name)
}
# Summarize all counts for each country: keep code/name and add an "overall"
# column holding the row sum over every month column. rowSums over the
# numeric-coerced month columns replaces the original nested
# per-row/per-column accumulation loops.
cl_stats <- cl[,1:2]
cl_stats["overall"] <- rowSums(as.data.frame(lapply(cl[3:ncol(cl)], as.numeric)))
# # If we would want the overall-counter in cl
# cl["overall"] <- 0
# cl$overall <- cl_stats$overall
# # Get all names for one country code (if there are multiple)
# for (r in 1:nrow(cl)) {
#   name <- NULL
#   no <- 0
#   code <- as.character(cl$code[r])
#   code <- namibiaBug(code)
#   r2 <- 1
#   repeat {
#     if (r2 > nrow(cl_bak)) {
#       break # only end if no next row
#     }
#     code2 <- as.character(cl_bak$code[r2])
#     code2 <- namibiaBug(code2)
#     if (code2 == code) {
#       no <- no + 1
#       name[no] <- as.character(cl_bak$name[r2])
#       r2 <- r2 + 1
#     }
#     else {
#       r2 <- r2 + 1
#     }
#   }
#   if (length(name) > 1) {
#     cat("For",code,"there are", no, "names:", name, "\n")
#   }
#   rm(r, name, no, code, r2, code2)
# }
# Old dataframe not needed anymore
rm(cl_bak)
# Calculate the total and average news entries for each year
years <- 2000:2014
# Search pattern per year: matches month-column names "<1-2 digit month>.<year>"
year_str <- sprintf("^\\d{1,2}\\.%s", years)
for (r in 1:nrow(cl)) {
  total <- 0
  average <- 0
  for (y in 1:length(years)) {
    months <- 0 # number of month columns found for this year (8 for 2014)
    for (c in 1:ncol(cl)) {
      # code/name columns never match the pattern, so they are skipped
      if (str_detect(names(cl)[c], year_str[y])) {
        total <- total + as.numeric(cl[r,c])
        months <- months + 1
      }
    }
    # Write "<year>-total" and "<year>-averg" columns into cl_stats
    colnametotal <- str_c(years[y],"-total")
    colnameaverg <- str_c(years[y],"-averg")
    average <- round(total / months, 4)
    cl_stats[r, colnametotal] <- total
    cl_stats[r, colnameaverg] <- average
    # Reset accumulators for the next year
    total <- 0
    average <- 0
  }
  rm(r, total, average, y, months, c, colnametotal, colnameaverg)
}
rm(years, year_str)
# IDENTIFY SURPRISING NEWSFOCUS -------------------------------------------
# Method 1: country was mentioned 3x more often than in the previous month,
# and more than 50 times in that month
no <- 0
for (c in 4:ncol(cl)) { # starting 1 month later (column c-1 must be a month)
  for (r in 1:nrow(cl)) {
    month <- names(cl)[c]
    # Conditions to fulfill
    status1 <- cl[r,c] > 3 * cl[r,c-1]
    status2 <- cl[r,c] > 50
    if (status1 && status2) {
      no <- no + 1
      cat("[",no,"] ",as.character(cl$code[r]),": 3x m-1 && >50 in: ", month,"\n", sep = "")
    }
  }
  rm(r,c,month,status1,status2)
}
rm(no)
# Method 2: country was mentioned 3x more often in one month than its yearly
# average (read from the "<year>-averg" columns computed into cl_stats above)
no <- 0
for (c in 3:ncol(cl)) {
  for (r in 1:nrow(cl)) {
    month <- names(cl)[c]
    year <- str_extract(month, "\\d{4}")
    averg <- str_c(year,"-averg")
    averg <- cl_stats[r,averg]
    # Conditions to fulfill
    status1 <- cl[r,c] > 3 * averg
    status2 <- cl[r,c] > 50
    if (status1 && status2) {
      no <- no + 1
      cat("[",no,"] ",as.character(cl$code[r]),": 3x year average && >50 ", month,"\n", sep = "") }
  }
  rm(r,c,month,year,averg,status1,status2)
}
rm(no)
# Final method: country mentioned more often in one month than in the three
# previous months combined (and more than 50 times)
cl_supfoc_mon <- data.frame(code=NA,name=NA)
cl_supfoc_mon[sprintf("%s", tspans)] <- 0
no <- 0
for (c in 6:ncol(cl)) { # columns 3-5 are the first three months (no predecessors)
  for (r in 1:nrow(cl)) {
    month <- names(cl)[c]
    code <- as.character(cl$code[r])
    name <- as.character(cl$name[r])
    # Conditions to fulfill
    status1 <- cl[r,c] > cl[r,c-1]+cl[r,c-2]+cl[r,c-3]
    status2 <- cl[r,c] > 50
    if (status1 && status2) {
      no <- no + 1
      # One result row per hit; duplicates per country are merged later
      cl_supfoc_mon[no, "code"] <- code
      cl_supfoc_mon$name[no] <- name
      #if (is.null(cl_supfoc_mon[no, month])) { cl_supfoc_mon[no, month] <- 0}
      #cl_supfoc_mon[no, month] <- cl_supfoc_mon[no, month] + 1
      cl_supfoc_mon[no, month] <- 1
      cat("[",no,"] ",as.character(cl$code[r]),": >m-(1:3) && >50 ", month,"\n", sep = "")
    }
  }
  rm(r,c,month,status1,status2)
}
rm(no, code, name)
# Clean cl_supfoc_mon: Replace NAs by 0, and sum up multiple appeared countries
cl_supfoc_mon[is.na(cl_supfoc_mon)] <- 0
cl_supfoc_mon <- ddply(cl_supfoc_mon,c("code", "name"),numcolwise(sum))
# Delete all month-columns with 0 surprising events
cl_supfoc_only_mon <- removeZeroMonths(cl_supfoc_mon, 3, ncol(cl_supfoc_mon))
# Total surprising newsfocuses per country: rowSums over the month columns
# replaces the original per-row accumulation loop.
cl_supfoc_total <- data.frame(
  code = as.character(cl_supfoc_mon$code),
  name = as.character(cl_supfoc_mon$name),
  total = rowSums(as.data.frame(lapply(cl_supfoc_mon[3:ncol(cl_supfoc_mon)], as.numeric))),
  stringsAsFactors = FALSE
)
# Total highlights per month + turn around table for graphs
cl_supfoc_mon["highlight"] <- 1
cl_supfoc_turn_mon <- ddply(cl_supfoc_mon,"highlight", numcolwise(sum))
cl_supfoc_mon$highlight <- NULL
cl_supfoc_turn_mon$highlight <- NULL
cl_supfoc_turn_mon <- data.frame(month = names(cl_supfoc_turn_mon), highs = as.numeric(cl_supfoc_turn_mon[1,]))
# Convert the "m.yyyy" month labels to proper Dates anchored mid-month.
# (The original comment said "%d.%y"; the format actually used is
# "%d.%m.%Y".) str_c is vectorised, so the original building loop is
# unnecessary.
cl_supfoc_turn_mon$month <- as.Date(str_c("15.", tspans), format = "%d.%m.%Y")
# Delete all month-rows with 0 surprising events
cl_supfoc_turn_only_mon <- cl_supfoc_turn_mon[!cl_supfoc_turn_mon$highs == 0,]
rownames(cl_supfoc_turn_only_mon) <- NULL
# # Replace 0s by NAs
# for (r in 1:180) {
#   if (! is.na(cl_total2$highs[r])) {
#     if (cl_total2$highs[r] == 0) {
#       cl_total2$highs[r] <- NA
#     }
#   }
# }

View File

@@ -0,0 +1,80 @@
require(rworldmap)
require(ggplot2)
# Minimal rworldmap demo, kept as a reference for the real plots below
theCountries <- c("DE", "US", "BR")
# These are the ISO3 names of the countries you'd like to plot in red
# NOTE(review): the codes used here are actually ISO2 two-letter codes,
# matching joinCode = "ISO2" below - the ISO3 comment looks wrong.
malDF <- data.frame(country = c("DE", "US", "BR", "ZA"), malaria = c(2000, 2001, 2002, 2002), news = c(2, 3, 0, 1))
# malDF is a data.frame with the ISO3 country names plus a variable to
# merge to the map data
malMap <- joinCountryData2Map(malDF, joinCode = "ISO2", nameJoinColumn = "country")
# This will join your malDF data.frame to the country map data
mapCountryData(malMap, nameColumnToPlot="malaria", catMethod = "categorical", missingCountryCol = gray(.8))
# And this will plot it, with the trick that the color palette's first
# color is red
# Absolute Frequ Newsfocus Map --------------------------------------------
# # Absolute frequencies of the highlights drawn as bubbles (unused variant)
# malMap <- joinCountryData2Map(cl_supfoc_total, joinCode = "ISO2", nameJoinColumn = "code")
# mapBubbles( dF=malMap, nameZSize="total",nameZColour="GEO3major",
#             colourPalette=c("black", "yellow", "blue", "orange", "red", "white", "green"),
#             oceanCol="lightblue",
#             landCol="wheat",
#             fill=TRUE,
#             symbolSize=0.5,
#             pch=21)
# Absolute news focus frequencies, world map:
absMap <- joinCountryData2Map(cl_supfoc_total, joinCode = "ISO2", nameJoinColumn = "code", verbose=TRUE)
mapCountryData(absMap, nameColumnToPlot="total", catMethod="fixedWidth",
               numCats=5,
               mapTitle="Anzahl Überraschungsfokusse weltweit",
               oceanCol="lightblue",
               missingCountryCol=gray(.9)
)
# Absolute news focus frequencies, North Africa / Asia map section
# (xlim/ylim crop the world map to roughly 10-140 E, 30-70 N):
mapCountryData(absMap, nameColumnToPlot="total", catMethod="fixedWidth",
               numCats=5,
               mapTitle="Anzahl Überraschungsfokusse Nordafrika und Asien",
               oceanCol="lightblue",
               missingCountryCol=gray(.9),
               xlim=c(10,140),
               ylim=c(30,70)
)
# Development Newsfocus over time -----------------------------------------
# Development of the surprising media focuses over the years: one bar per
# month plus a loess smoother.
cc <- ggplot(cl_supfoc_turn_mon, aes(month,highs))
# NOTE(review): geom_histogram(stat="identity") draws pre-counted bars; newer
# ggplot2 versions prefer geom_bar(stat="identity")/geom_col() - confirm
# before upgrading ggplot2.
cc <- cc + geom_histogram(fill="steelblue", stat="identity")
cc <- cc + stat_smooth(size=1,colour="red",method="loess", se=FALSE)
cc <- cc + ggtitle("Zeitliche Entwicklung von plötzlichen Medienfokussen") + xlab("Einzelne Monate") + ylab("Plötzliche Medienfokusse")
cc
# Yearly average news counts for a single country as a line chart.
# Helper to avoid the duplicated Syria/Israel plotting code; `label` is the
# (German) country name used in the chart title. Called at top level, the
# returned ggplot object auto-prints, exactly like the original code.
plotCountryAverage <- function(countrycode, label, yearspan = 2000:2014) {
  avergdf <- getAverages(df = cl_stats, codecol = "code", code = countrycode, yearspan = yearspan)
  ggplot(data = avergdf, aes(x = year, y = averg)) +
    geom_line() +
    ggtitle(paste0("Durchschnittliche Nachrichten pro Jahr über ", label)) +
    xlab("Jahre") +
    ylab("Durchschnittliche Nachrichten")
}
# Example 1: yearly average number of news items about Syria, 2000-2014
plotCountryAverage("SY", "Syrien")
# Example 2: yearly average number of news items about Israel, 2000-2014
plotCountryAverage("IL", "Israel")

View File

@@ -0,0 +1,38 @@
"Nordkorea";"KP"
"Südkorea";"KR"
"Zaire";"CD"
"Großbritannien";"GB"
"England";"GB"
"Russland";"RU"
"Palästina";"PS"
"Gazastreifen";"PS"
"Burma";"MM"
"Moldau";"MD"
"Laos";"LA"
"Kosovo";"RS"
"Kongo";"CD"
"Kongo-Brazzaville";"CG"
"Iran";"IR"
"Isle of Man";"IM"
"DDR";"DD"
"Elfenbeinküste";"CI"
"China";"CN"
"Weißrussland";"BY"
"Antarktis";"AQ"
"Krim";"UA"
"UdSSR";"RU"
"USA";"US"
"BRD";"DE"
"Nordirland";"IE"
"Nordzypern";"CY"
"Syrien";"SY"
"VA";"Vatikan"
"TZ";"Tansania"
"EA";"Ceuta"
"EA";"Melilla"
"ENTFERNT Neutrale Zone";"NT"
"ENTFERNT Europäische Gemeinschaft";"CE"
"ENTFERNT Europäische Union";"EU"
"ENTFERNT Burma";"BU"

View File

@@ -0,0 +1,295 @@
"AC";"Ascension"
"AD";"Andorra"
"AE";"Vereinigte Arabische Emirate"
"AF";"Afghanistan"
"AG";"Antigua und Barbuda"
"AI";"Anguilla"
"AL";"Albanien"
"AM";"Armenien"
"AN";"Niederländische Antillen"
"AO";"Angola"
"AQ";"Antarktis"
"AQ";"Antarktika"
"AR";"Argentinien"
"AS";"Amerikanisch-Samoa"
"AT";"Österreich"
"AU";"Australien"
"AW";"Aruba"
"AX";"Åland"
"AZ";"Aserbaidschan"
"BA";"Bosnien und Herzegowina"
"BB";"Barbados"
"BD";"Bangladesch"
"BE";"Belgien"
"BF";"Burkina Faso"
"BG";"Bulgarien"
"BH";"Bahrain"
"BI";"Burundi"
"BJ";"Benin"
"BL";"Saint-Barthélemy"
"BM";"Bermuda"
"BN";"Brunei Darussalam"
"BO";"Bolivien"
"BQ";"Bonaire, Sint Eustatius und Saba"
"BR";"Brasilien"
"BS";"Bahamas"
"BT";"Bhutan"
"BV";"Bouvetinsel"
"BW";"Botswana"
"BY";"Belarus"
"BY";"Weißrussland"
"BZ";"Belize"
"CA";"Kanada"
"CC";"Kokosinseln"
"CD";"Demokratische Republik Kongo"
"CD";"Kongo, Demokratische Republik"
"CD";"Kongo"
"CD";"Zaire"
"CF";"Zentralafrikanische Republik"
"CG";"Republik Kongo"
"CG";"Kongo-Brazzaville"
"CH";"Schweiz"
"CI";"Elfenbeinküste"
"CI";"Côte dIvoire"
"CK";"Cookinseln"
"CL";"Chile"
"CM";"Kamerun"
"CN";"China"
"CN";"China, Volksrepublik"
"CO";"Kolumbien"
"CP";"Clipperton"
"CR";"Costa Rica"
"CS";"Tschechoslowakei"
"CS";"Serbien und Montenegro"
"CU";"Kuba"
"CV";"Kap Verde"
"CW";"Curaçao"
"CX";"Weihnachtsinsel"
"CY";"Zypern"
"CY";"Nordzypern"
"CZ";"Tschechische Republik"
"DD";"DDR"
"DD";"Deutsche Demokratische Republik"
"DE";"Deutschland"
"DE";"BRD"
"DG";"Diego Garcia"
"DJ";"Dschibuti"
"DK";"Dänemark"
"DM";"Dominica"
"DO";"Dominikanische Republik"
"DZ";"Algerien"
"EA";"Ceuta, Melilla"
"EA";"Ceuta"
"EA";"Melilla"
"EC";"Ecuador"
"EE";"Estland"
"EG";"Ägypten"
"EH";"Westsahara"
"ER";"Eritrea"
"ES";"Spanien"
"ET";"Äthiopien"
"FI";"Finnland"
"FJ";"Fidschi"
"FK";"Falklandinseln"
"FM";"Mikronesien"
"FO";"Färöer"
"FR";"Frankreich"
"FX";"Frankreich, France métropolitaine"
"GA";"Gabun"
"GB";"Großbritannien"
"GB";"England"
"GB";"Vereinigtes Königreich Großbritannien und Nordirland"
"GD";"Grenada"
"GE";"Georgien"
"GF";"Französisch-Guayana"
"GG";"Guernsey"
"GH";"Ghana"
"GI";"Gibraltar"
"GL";"Grönland"
"GM";"Gambia"
"GN";"Guinea"
"GP";"Guadeloupe"
"GQ";"Äquatorialguinea"
"GR";"Griechenland"
"GS";"Südgeorgien und die Südlichen Sandwichinseln"
"GT";"Guatemala"
"GU";"Guam"
"GW";"Guinea-Bissau"
"GY";"Guyana"
"HK";"Hongkong"
"HM";"Heard und McDonaldinseln"
"HN";"Honduras"
"HR";"Kroatien"
"HT";"Haiti"
"HU";"Ungarn"
"IC";"Kanarische Inseln"
"ID";"Indonesien"
"IE";"Irland"
"IE";"Nordirland"
"IL";"Israel"
"IM";"Isle of Man"
"IM";"Insel Man"
"IN";"Indien"
"IO";"Britisches Territorium im Indischen Ozean"
"IQ";"Irak"
"IR";"Iran"
"IR";"Iran, Islamische Republik"
"IS";"Island"
"IT";"Italien"
"JE";"Jersey"
"JM";"Jamaika"
"JO";"Jordanien"
"JP";"Japan"
"KE";"Kenia"
"KG";"Kirgisistan"
"KH";"Kambodscha"
"KI";"Kiribati"
"KM";"Komoren"
"KN";"St. Kitts und Nevis"
"KP";"Nordkorea"
"KP";"Korea, Demokratische Volksrepublik"
"KR";"Südkorea"
"KR";"Korea, Republik"
"KW";"Kuwait"
"KY";"Kaimaninseln"
"KZ";"Kasachstan"
"LA";"Laos"
"LA";"Laos, Demokratische Volksrepublik"
"LB";"Libanon"
"LC";"St. Lucia"
"LI";"Liechtenstein"
"LK";"Sri Lanka"
"LR";"Liberia"
"LS";"Lesotho"
"LT";"Litauen"
"LU";"Luxemburg"
"LV";"Lettland"
"LY";"Libyen"
"MA";"Marokko"
"MC";"Monaco"
"MD";"Moldawien"
"ME";"Montenegro"
"MF";"Saint-Martin"
"MG";"Madagaskar"
"MH";"Marshallinseln"
"MK";"Mazedonien"
"ML";"Mali"
"MM";"Myanmar"
"MM";"Burma"
"MN";"Mongolei"
"MO";"Macao"
"MP";"Nördliche Marianen"
"MQ";"Martinique"
"MR";"Mauretanien"
"MS";"Montserrat"
"MT";"Malta"
"MU";"Mauritius"
"MV";"Malediven"
"MW";"Malawi"
"MD";"Moldau"
"MX";"Mexiko"
"MY";"Malaysia"
"MZ";"Mosambik"
"NA";"Namibia"
"NC";"Neukaledonien"
"NE";"Niger"
"NF";"Norfolkinsel"
"NG";"Nigeria"
"NI";"Nicaragua"
"NL";"Niederlande"
"NO";"Norwegen"
"NP";"Nepal"
"NR";"Nauru"
"NU";"Niue"
"NZ";"Neuseeland"
"OM";"Oman"
"PA";"Panama"
"PE";"Peru"
"PF";"Französisch-Polynesien"
"PG";"Papua-Neuguinea"
"PH";"Philippinen"
"PK";"Pakistan"
"PL";"Polen"
"PM";"Saint-Pierre und Miquelon"
"PN";"Pitcairninseln"
"PR";"Puerto Rico"
"PS";"Staat Palästina"
"PS";"Gazastreifen"
"PS";"Palästina"
"PT";"Portugal"
"PW";"Palau"
"PY";"Paraguay"
"QA";"Katar"
"RE";"Réunion"
"RO";"Rumänien"
"RS";"Serbien"
"RS";"Kosovo"
"RU";"Russland"
"RU";"Russische Föderation"
"RU";"UdSSR"
"RW";"Ruanda"
"SA";"Saudi-Arabien"
"SB";"Salomonen"
"SC";"Seychellen"
"SD";"Sudan"
"SE";"Schweden"
"SG";"Singapur"
"SH";"St. Helena"
"SI";"Slowenien"
"SJ";"Svalbard und Jan Mayen"
"SK";"Slowakei"
"SL";"Sierra Leone"
"SM";"San Marino"
"SN";"Senegal"
"SO";"Somalia"
"SR";"Suriname"
"SS";"Südsudan"
"ST";"São Tomé und Príncipe"
"SV";"El Salvador"
"SX";"Sint Maarten"
"SY";"Syrien"
"SY";"Syrien, Arabische Republik"
"SZ";"Swasiland"
"TA";"Tristan da Cunha"
"TC";"Turks- und Caicosinseln"
"TD";"Tschad"
"TF";"Französische Süd- und Antarktisgebiete"
"TG";"Togo"
"TH";"Thailand"
"TJ";"Tadschikistan"
"TK";"Tokelau"
"TL";"Osttimor"
"TM";"Turkmenistan"
"TN";"Tunesien"
"TO";"Tonga"
"TR";"Türkei"
"TT";"Trinidad und Tobago"
"TV";"Tuvalu"
"TW";"Republik China"
"TZ";"Tansania"
"TZ";"Tansania, Vereinigte Republik"
"UA";"Ukraine"
"UA";"Krim"
"UG";"Uganda"
"UM";"United States Minor Outlying Islands"
"US";"USA"
"US";"Vereinigte Staaten von Amerika"
"UY";"Uruguay"
"UZ";"Usbekistan"
"VA";"Vatikanstadt"
"VA";"Vatikan"
"VC";"St. Vincent und die Grenadinen"
"VE";"Venezuela"
"VG";"Britische Jungferninseln"
"VI";"Amerikanische Jungferninseln"
"VN";"Vietnam"
"VU";"Vanuatu"
"WF";"Wallis und Futuna"
"WS";"Samoa"
"YE";"Jemen"
"YT";"Mayotte"
"YU";"Jugoslawien"
"ZA";"Südafrika"
"ZM";"Sambia"
"ZR";"Zaire"
"ZW";"Simbabwe"

View File

@@ -0,0 +1,127 @@
require(stringr)
require(RCurl)
# Download a URL to `dest` (unless a cached copy already exists on disk) and
# return the parsed HTML document.
#
# url         - character, the page to fetch
# dest        - character, local cache file path
# curl_handle - RCurl handle used for the request. Defaults to the global
#               `handle` configured in the scraper script, so existing
#               two-argument calls behave exactly as before; the explicit
#               parameter removes the hidden global dependency.
dlParsePage <- function(url, dest, curl_handle = handle) {
  if (!file.exists(dest)) {
    url_curl <- getURL(url, curl = curl_handle)
    write(url_curl, dest)
  }
  # htmlParse() comes from the XML package, loaded by the calling script
  htmlParse(dest, encoding = "UTF-8")
}
# Calculate the number of days in a specific month.
# (Important for manipulating the search URL because otherwise there would be
# articles of the NEXT month in the search results if the time span is too
# large.)
#
# i  - column index into df
# df - data frame whose column i is named "m.yyyy" (e.g. "2.2000")
# Returns the number of days in that month as a numeric scalar.
getDays <- function(i, df) {
  cur_tspan <- names(df)[i]
  # Base-R sub() instead of str_extract: the original pattern "^\\d+\\>"
  # relied on the TRE-only word-boundary escape \> which is not portable
  # (stringi/ICU-based stringr versions reject it). Splitting on the dot
  # gives the same month/year values.
  cur_m <- as.numeric(sub("\\..*$", "", cur_tspan))
  cur_y <- as.numeric(sub("^.*\\.", "", cur_tspan))
  # First day of the following month (roll over December into January)
  if (cur_m == 12) {
    nex_y <- cur_y + 1
    nex_m <- 1
  }
  else {
    nex_y <- cur_y
    nex_m <- cur_m + 1
  }
  cur_date <- paste(cur_y, cur_m, "01", sep = "-")
  nex_date <- paste(nex_y, nex_m, "01", sep = "-")
  # Day difference between the two month starts = days in the current month
  as.numeric(difftime(as.Date(nex_date), as.Date(cur_date)))
}
# Return the country code at position i of the vector df as character.
# Works around the Namibia quirk: the ISO code "NA" is read back as a
# missing value, so a missing code is mapped to the literal string "NA".
getCode <- function(i, df) {
  code <- as.character(df[i])
  if (is.na(code)) {
    code <- "NA"
  }
  code
}
# Map a missing value to the literal string "NA" (Namibia's ISO code, which
# parsers turn into a real NA); any other scalar passes through unchanged.
namibiaBug <- function(x) {
  if (is.na(x)) {
    return("NA")
  }
  x
}
# Print all headline entries stored for country `descountry` in column
# `month` of `df` (typically the `headlines` data frame).
#
# df         - data frame with one row per country
# countrycol - name/index of the column holding country codes
# descountry - code of the desired country ("NA"-safe via namibiaBug)
# month      - name of the month column to read
# Prints the collected entries with cat() and invisibly returns them
# (the original returned cat()'s NULL; the invisible vector is strictly
# more useful and does not change the printed output).
getHeadlines <- function(df, countrycol, descountry, month) {
  # Loop-invariant: normalise the requested code once, not once per row
  descountry <- namibiaBug(descountry)
  no <- 0
  news <- NULL
  for (r in seq_len(nrow(df))) {
    curcountry <- namibiaBug(as.character(df[r,countrycol]))
    if (curcountry == descountry) {
      no <- no + 1
      news[no] <- as.character(df[r,month])
    }
  }
  cat(news)
  invisible(news)
}
# Remove month columns whose counts sum to zero (no surprising newsfocuses).
#
# df     - data frame with code/name columns followed by month columns
# mincol - index of the first month column to inspect
# maxcol - index of the last month column to inspect
# Returns df without the all-zero month columns. Column sums replace the
# original per-cell double loop.
removeZeroMonths <- function(df, mincol, maxcol) {
  totals <- vapply(mincol:maxcol,
                   function(ci) sum(as.numeric(df[[ci]])),
                   numeric(1))
  delmonth <- names(df)[mincol:maxcol][totals == 0]
  df[! names(df) %in% delmonth]
}
# Get the names of countries which produced surprising newsfocuses (count > 0)
# in a given month column of df.
#
# df    - data frame with a "name" column and month columns
# month - name of the month column to inspect
# Returns a character vector of country names, NULL if no country had a
# focus, or prints a message (returning NULL) if the column does not exist.
getFocusCountries <- function(df, month) {
  # Exact membership test. The original checked existence via
  # str_detect(month, names(df)[c]), i.e. substring *regex* matching, where
  # the "." in a column name matched any character - so "1.2000" also
  # "existed" inside "11.2000".
  if (! month %in% names(df)) {
    return(cat(month, "isn't a valid column in given dataframe"))
  }
  hits <- df[[month]] > 0
  if (!any(hits)) {
    # Match the original: with no hits the result vector stayed NULL
    return(NULL)
  }
  as.character(df$name[hits])
}
# Get the average development of news over a span of years for one country.
#
# df       - stats data frame with "<year>-averg" columns (e.g. cl_stats)
# codecol  - name of the column holding country codes
# code     - country code to look up ("NA"-safe via namibiaBug)
# yearspan - numeric vector of years to extract
# Returns data.frame(year, averg) with one row per year.
# NOTE(review): if `code` matches more than one row of df, `averg` grows
# beyond length(yearspan) and the final data.frame() call fails - codes are
# assumed unique, as they are in cl_stats.
getAverages <- function(df, codecol, code, yearspan) {
  # Loop-invariant: normalise the requested code once, not once per row
  code <- namibiaBug(code)
  no <- 0
  averg <- NULL
  for (r in seq_len(nrow(df))) {
    curcode <- namibiaBug(df[r,codecol])
    if (curcode == code) {
      for (y in yearspan) {
        no <- no + 1
        curcol <- str_c(y,"-averg")
        averg[no] <- as.numeric(df[r,curcol])
      }
    }
  }
  data.frame(year=yearspan, averg=averg)
}