initial commit v1.0
This commit is contained in:
BIN
R-Code/backup_cl.RData
Normal file
BIN
R-Code/backup_cl.RData
Normal file
Binary file not shown.
BIN
R-Code/backup_headlines.RData
Normal file
BIN
R-Code/backup_headlines.RData
Normal file
Binary file not shown.
46
R-Code/newsfokus-1-countrylist.R
Normal file
46
R-Code/newsfokus-1-countrylist.R
Normal file
@@ -0,0 +1,46 @@
|
||||
require(stringr)
|
||||
require(dplyr)
|
||||
require(XML)
|
||||
|
||||
setwd(dir="~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus")
|
||||
|
||||
clist_url <- "http://de.wikipedia.org/wiki/ISO-3166-1-Kodierliste"
|
||||
clist_tbl <- readHTMLTable(clist_url, encoding = "UTF-8")[[1]]
|
||||
names(clist_tbl) <- c("name", "code", "code-3", "code-num", "tld", "ioc", "code-2", "un")
|
||||
|
||||
clist <- data.frame(clist_tbl, stringsAsFactors = FALSE)
|
||||
keep <- c("code", "name")
|
||||
clist <- clist[keep]
|
||||
clist$code <- as.character.factor(clist$code)
|
||||
clist$name <- as.character.factor(clist$name)
|
||||
rm(keep)
|
||||
|
||||
# Cleaning table from headlines, annotations and brackets
|
||||
# Drop Wikipedia table rows that are headlines / sort groups: their "name"
# cell contains the sortkey marker "!!".
# (The original deleted rows one-by-one inside an index loop over
# 1:nrow(clist); removing a row shifts all later indices, so the row
# directly after each deleted one was skipped and the loop could run past
# the end of the shrunken data frame. A single vectorized filter avoids
# both problems.)
clist <- clist[!str_detect(clist$name, "!!"), ]
|
||||
|
||||
# Clean a country-name string scraped from the Wikipedia table:
# strip [footnote] brackets, (annotations) in parentheses and the
# "sortkey!" prefix, then trim surrounding whitespace.
#
# Fix: the original used str_replace() with greedy ".*" patterns, so a
# name containing several bracketed groups lost everything between the
# first opening and the last closing bracket (e.g. "Foo [1] Bar [2]"
# collapsed to "Foo"). Non-greedy, all-occurrence replacement removes
# each group individually and keeps the text between them.
cleanBrackets <- function(string) {
  string <- gsub("\\[.*?\\]", "", string, perl = TRUE)  # [footnotes]
  string <- gsub("\\(.*?\\)", "", string, perl = TRUE)  # (annotations)
  string <- sub(".+!", "", string)  # drop "sortkey!" prefix (greedy, as before)
  trimws(string)
}
|
||||
|
||||
for (i in 1:nrow(clist)) {
|
||||
name <- as.character(clist$name[i])
|
||||
code <- as.character(clist$code[i])
|
||||
clist$name[i] <- cleanBrackets(name)
|
||||
clist$code[i] <- cleanBrackets(code)
|
||||
rm(i, name, code)
|
||||
}
|
||||
|
||||
write.table(clist, "newsfokus-countrylist.txt", sep=";", row.names=FALSE, col.names=FALSE)
|
||||
|
||||
# Diese exportierte Länderliste wird noch manuell bearbeitet, siehe "newsfokus-countrylist-manualchanges.txt"
|
||||
185
R-Code/newsfokus-2-scraping.R
Normal file
185
R-Code/newsfokus-2-scraping.R
Normal file
@@ -0,0 +1,185 @@
|
||||
require(XML)
|
||||
require(RCurl)
|
||||
require(stringr)
|
||||
|
||||
|
||||
|
||||
setwd("~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus/")
|
||||
# Import manually edited country list
|
||||
cl <- read.csv("newsfokus-countrylist.txt", sep = ";", header = FALSE)
|
||||
names(cl) <- c("code", "name")
|
||||
|
||||
|
||||
# FUNCTIONS ---------------------------------------------------------------
|
||||
|
||||
source("newsfokus-functions.R")
|
||||
|
||||
|
||||
# PREPARATIONS ------------------------------------------------------------
|
||||
|
||||
# Generate all month.year combinations since January 2000
|
||||
tspans <- NULL
|
||||
for (y in 2000:2014) {
|
||||
for (m in 01:12) {
|
||||
if (length(tspans) == 0) {
|
||||
tspans <- str_c(m,".",y)
|
||||
}
|
||||
else {
|
||||
tspans <- c(tspans, str_c(m,".",y))
|
||||
}
|
||||
}
|
||||
rm(y, m)
|
||||
}
|
||||
|
||||
# # All combinations mmmYYYY (not used yet)
|
||||
# years <- 2000:2014 # 15
|
||||
# months <- month.abb
|
||||
# no <- 0
|
||||
# comb <- NULL
|
||||
# for (m in 1:12) {
|
||||
# for (y in 1:15) {
|
||||
# no <- no + 1
|
||||
# comb[no] <- str_c(months[m],years[y])
|
||||
# }
|
||||
# }
|
||||
# rm(years, months, m, y, no)
|
||||
|
||||
# Remove future/incomplete months
|
||||
removemonths <- c(9.2014, 10.2014, 11.2014, 12.2014)
|
||||
tspans <- tspans [! tspans %in% removemonths]
|
||||
rm(removemonths)
|
||||
|
||||
|
||||
# Create new columns for every month and set the column names accordingly
|
||||
cl[sprintf("%s", tspans)] <- 0
|
||||
|
||||
# Copy data frame for headlines
|
||||
headlines <- cl
|
||||
|
||||
# Set curl handle for friendly scraping
|
||||
handle <- getCurlHandle(httpheader = list(from = "max.mehl@uni.kn",
|
||||
'user-agent' = str_c(R.version$version.string)
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# SCRAPING ALL THE NEWS \o/ ----------------------------------------------
|
||||
|
||||
# Das Prozedere ist folgendes:
|
||||
# 1. Erstelle die URL je nach Zeitspanne und wiederhole das für jede existierende Seite (meist über 100)
|
||||
# - Es gibt eine Fehlermeldung auf der Suchseite, wenn keine Ergebnisse mehr vorhanden sind
|
||||
# 2. Lade Seite herunter und parse sie
|
||||
# - Nicht, wenn sie schon vorhanden ist, um Bandbreite zu sparen
|
||||
# 3. Suche nach Auftauchen von Ländernamen
|
||||
# 3a. Wenn ja, dann zähle jeweiligen Eintrag im DF um 1 hoch
|
||||
# - nur ein Auftauchen des Landes in einem Artikel wird gezählt
|
||||
# (es gibt für einzelne Ländercodes mehrere Schreibweisen)
|
||||
|
||||
stat_news_all <- 0
|
||||
stat_news_pos <- 0
|
||||
stat_pages <- 0
|
||||
# This loop does the scraping, searching and indexing of each month and country
|
||||
# !!! Beware: It takes around 24 hours to finish for 1.2000 - 8.2014! Load backup_cl.RData (and optionally backup_headlines.RData) to skip this step
|
||||
for (i in 3:ncol(cl)) {
|
||||
tspan <- names(cl)[i]
|
||||
|
||||
# Create folder for downloaded HTML files
|
||||
dir.create(str_c("materials/",tspan), showWarnings = FALSE, recursive = TRUE)
|
||||
|
||||
days <- getDays(i, cl)
|
||||
# Generate the month's base url
|
||||
baseurl <- str_c("http://www.spiegel.de/suche/index.html?suchbegriff=+&quellenGroup=SPOX&suchbereich=kopftext&fromDate=1.", tspan, "&toDate=", days, ".", tspan, "&offsets=999999&pageNumber=")
|
||||
|
||||
# In every loop we start with page 1 again
|
||||
page <- 1
|
||||
|
||||
# Now expand the URL by the page number UNTIL there's an error page
|
||||
repeat {
|
||||
cat("\n--> Processing page", page, "of timespan", tspan, "\n")
|
||||
|
||||
url <- str_c(baseurl, page)
|
||||
dest <- str_c("materials/",tspan,"/",tspan,"-",page,".html")
|
||||
url_parsed <- dlParsePage(url, dest)
|
||||
|
||||
status <- unlist(xpathSApply(url_parsed, "//h3[contains(text(), 'Ihre Suche ergab keinen Treffer')]"))
|
||||
if (! is.null(status)) {
|
||||
# If there's an error page, there're no further articles. Skip this loop and begin with next month
|
||||
cat("Letzte Seite erreicht:", url, "\n")
|
||||
break
|
||||
}
|
||||
|
||||
# Page is valid, now split in single articles and search for countries in each title and teaser
|
||||
headline <- xpathSApply(url_parsed, "//div[@class='search-teaser']//span[@class='headline']", xmlValue)
|
||||
teaser <- xpathSApply(url_parsed, "//div[@class='search-teaser']/p", xmlValue)
|
||||
url_arts <- xpathSApply(url_parsed, "//div[@class='search-teaser']/a", xmlGetAttr, "href")
|
||||
url_short <- ""
|
||||
# Combine headline and teaser to make it easier to search
|
||||
teaser <- str_c(headline, teaser, sep=" ")
|
||||
if (length(teaser) == 0) {
|
||||
errormsg <- str_c("Probably 500 error at: ", tspan,"-",page)
|
||||
write(errormsg, "scraping-errors.log", append = TRUE)
|
||||
rm(errormsg)
|
||||
}
|
||||
else {
|
||||
# Analyse every single teaser/headline combination
|
||||
for (t in 1:length(teaser)) {
|
||||
yet <- "" # Did the country already appear in the article? Empty with each loop
|
||||
string <- teaser[t]
|
||||
for (c in 1:nrow(cl)) {
|
||||
name <- as.character(cl$name[c]) # Name of the county to detect in the teaser
|
||||
status <- str_detect(tolower(string), tolower(name)) # Does the country's name appear in the teaser?
|
||||
|
||||
if (status) { # yes
|
||||
code <- getCode(c, cl$code)
|
||||
|
||||
cat("The string contains news from:", code, "\n")
|
||||
|
||||
# We only want to count a country once even if it appears multiple times in an article
|
||||
already <- str_detect(yet, code) # Did the country already appear?
|
||||
if (!already) { # no
|
||||
yet <- str_c(yet, code, sep=" ")
|
||||
cl[c , tspan] <- cl[c , tspan] + 1 # Count +1 to the number of appearances in the data frame
|
||||
|
||||
# Save headlines + links to a different data frame
|
||||
url_short[t] <- str_extract(url_arts[t], ".+/")
|
||||
url_short[t] <- str_c(url_short[t], str_extract(url_arts[t], "a\\-\\d+\\.html"))
|
||||
new_headline_entry <- str_c(headline[t], " (", url_short[t], ")")
|
||||
if (headlines[c , tspan] == 0) {
|
||||
headlines[c , tspan] <- new_headline_entry
|
||||
}
|
||||
else {
|
||||
headlines[c , tspan] <- str_c(headlines[c , tspan], "\n", new_headline_entry)
|
||||
}
|
||||
|
||||
}
|
||||
rm(code, already)
|
||||
stat_news_pos <- stat_news_pos + 1
|
||||
}
|
||||
rm(c, name, status)
|
||||
}
|
||||
rm(t, yet, string)
|
||||
stat_news_all <- stat_news_all +1
|
||||
}
|
||||
}
|
||||
|
||||
# Go to the next page
|
||||
page <- page + 1
|
||||
stat_pages <- stat_pages + 1
|
||||
}
|
||||
rm(i, tspan, days, baseurl, page, url, url_parsed, status, teaser, headline, dest, url_arts, url_short, new_headline_entry)
|
||||
# Backup all data after each month
|
||||
write.csv(cl, "backup_cl.csv")
|
||||
write.csv(headlines, "backup_headlines.csv")
|
||||
save(cl, file="backup_cl.RData")
|
||||
save(headlines, file="backup_headlines.RData")
|
||||
}
|
||||
# End of huge for-loop
|
||||
|
||||
|
||||
# Final Backup
|
||||
write.csv(cl, "backup_cl.csv")
|
||||
write.csv(headlines, "backup_headlines.csv")
|
||||
save(cl, file="backup_cl.RData")
|
||||
save(headlines, file="backup_headlines.RData")
|
||||
|
||||
|
||||
264
R-Code/newsfokus-3-analysis.R
Normal file
264
R-Code/newsfokus-3-analysis.R
Normal file
@@ -0,0 +1,264 @@
|
||||
require(plyr)
|
||||
require(dplyr)
|
||||
require(stringr)
|
||||
|
||||
|
||||
# PREPARATIONS ------------------------------------------------------------
|
||||
|
||||
|
||||
no <- 0
|
||||
for (r in 1:nrow(cl)) {
|
||||
for (c in 3:ncol(cl)) {
|
||||
no <- no + as.numeric(cl[r,c])
|
||||
}
|
||||
}
|
||||
cat("Sum of country news entries:",no)
|
||||
rm(r,c,no)
|
||||
|
||||
# Save old cl-dataframe for next steps
|
||||
cl_bak <- cl
|
||||
|
||||
# Merge appearances for same country code (but different name)
|
||||
cl <- ddply(cl,"code",numcolwise(sum))
|
||||
target <- which(names(cl) == 'code')[1]
|
||||
cl <- cbind(cl[,1:target,drop=F], data.frame(name="PLACEHOLDER"), cl[,(target+1):length(cl),drop=F])
|
||||
rm(target)
|
||||
cl$name <- "bla"
|
||||
|
||||
# Now choose names for the according codes (first fits)
|
||||
for (r in 1:nrow(cl)) {
|
||||
code <- as.character(cl$code[r])
|
||||
code <- namibiaBug(code)
|
||||
r2 <- 1
|
||||
|
||||
repeat {
|
||||
code2 <- as.character(cl_bak$code[r2])
|
||||
code2 <- namibiaBug(code2)
|
||||
|
||||
if (code2 == code) {
|
||||
name <- as.character(cl_bak$name[r2])
|
||||
cl$name[r] <- name
|
||||
break
|
||||
}
|
||||
else {
|
||||
r2 <- r2 + 1
|
||||
}
|
||||
}
|
||||
rm(r, r2, code, code2, name)
|
||||
}
|
||||
|
||||
|
||||
# Summarize all counts for each country
|
||||
cl_stats <- cl[,1:2]
|
||||
cl_stats["overall"] <- 0
|
||||
overall <- NULL
|
||||
for (r in 1:nrow(cl)) {
|
||||
overall[r] <- 0
|
||||
for (c in 3:ncol(cl)) {
|
||||
overall[r] <- overall[r] + as.numeric(cl[r,c])
|
||||
}
|
||||
cl_stats$overall[r] <- overall[r]
|
||||
}
|
||||
rm(overall, r, c)
|
||||
|
||||
|
||||
# # If we would want the overall-counter in cl
|
||||
# cl["overall"] <- 0
|
||||
# cl$overall <- cl_stats$overall
|
||||
|
||||
|
||||
# # Get all names for one country code (if there are multiple)
|
||||
# for (r in 1:nrow(cl)) {
|
||||
# name <- NULL
|
||||
# no <- 0
|
||||
# code <- as.character(cl$code[r])
|
||||
# code <- namibiaBug(code)
|
||||
# r2 <- 1
|
||||
# repeat {
|
||||
# if (r2 > nrow(cl_bak)) {
|
||||
# break # only end if no next row
|
||||
# }
|
||||
# code2 <- as.character(cl_bak$code[r2])
|
||||
# code2 <- namibiaBug(code2)
|
||||
# if (code2 == code) {
|
||||
# no <- no + 1
|
||||
# name[no] <- as.character(cl_bak$name[r2])
|
||||
# r2 <- r2 + 1
|
||||
# }
|
||||
# else {
|
||||
# r2 <- r2 + 1
|
||||
# }
|
||||
# }
|
||||
# if (length(name) > 1) {
|
||||
# cat("For",code,"there are", no, "names:", name, "\n")
|
||||
# }
|
||||
# rm(r, name, no, code, r2, code2)
|
||||
# }
|
||||
|
||||
# Old dataframe not needed anymore
|
||||
rm(cl_bak)
|
||||
|
||||
|
||||
# Calculate the total and average news entries for each year
|
||||
years <- 2000:2014
|
||||
# Search string for str_detect for every year
|
||||
year_str <- sprintf("^\\d{1,2}\\.%s", years)
|
||||
|
||||
for (r in 1:nrow(cl)) {
|
||||
total <- 0
|
||||
average <- 0
|
||||
for (y in 1:length(years)) {
|
||||
months <- 0
|
||||
for (c in 1:ncol(cl)) {
|
||||
if (str_detect(names(cl)[c], year_str[y])) {
|
||||
total <- total + as.numeric(cl[r,c])
|
||||
months <- months + 1
|
||||
}
|
||||
}
|
||||
colnametotal <- str_c(years[y],"-total")
|
||||
colnameaverg <- str_c(years[y],"-averg")
|
||||
average <- round(total / months, 4)
|
||||
cl_stats[r, colnametotal] <- total
|
||||
cl_stats[r, colnameaverg] <- average
|
||||
total <- 0
|
||||
average <- 0
|
||||
}
|
||||
rm(r, total, average, y, months, c, colnametotal, colnameaverg)
|
||||
}
|
||||
rm(years, year_str)
|
||||
|
||||
|
||||
|
||||
|
||||
# IDENTIFY SURPRISING NEWSFOCUS -------------------------------------------
|
||||
|
||||
|
||||
# Land war 3x öfter genannt als im Monat davor, aber mehr als 50x
|
||||
no <- 0
|
||||
for (c in 4:ncol(cl)) { # starting 1 month later
|
||||
for (r in 1:nrow(cl)) {
|
||||
month <- names(cl)[c]
|
||||
|
||||
# Conditions to fulfill
|
||||
status1 <- cl[r,c] > 3 * cl[r,c-1]
|
||||
status2 <- cl[r,c] > 50
|
||||
|
||||
if (status1 && status2) {
|
||||
no <- no + 1
|
||||
cat("[",no,"] ",as.character(cl$code[r]),": 3x m-1 && >50 in: ", month,"\n", sep = "")
|
||||
}
|
||||
}
|
||||
rm(r,c,month,status1,status2)
|
||||
}
|
||||
rm(no)
|
||||
|
||||
|
||||
# Land wurde in einem Monat 3x öfter als im Jahresdurchschnitt genannt
|
||||
no <- 0
|
||||
for (c in 3:ncol(cl)) {
|
||||
for (r in 1:nrow(cl)) {
|
||||
month <- names(cl)[c]
|
||||
year <- str_extract(month, "\\d{4}")
|
||||
averg <- str_c(year,"-averg")
|
||||
averg <- cl_stats[r,averg]
|
||||
|
||||
# Conditions to fulfill
|
||||
status1 <- cl[r,c] > 3 * averg
|
||||
status2 <- cl[r,c] > 50
|
||||
|
||||
if (status1 && status2) {
|
||||
no <- no + 1
|
||||
cat("[",no,"] ",as.character(cl$code[r]),": 3x year average && >50 ", month,"\n", sep = "") }
|
||||
}
|
||||
rm(r,c,month,year,averg,status1,status2)
|
||||
}
|
||||
rm(no)
|
||||
|
||||
# Final method: Land in einem Monat öfter genannt als alle 3 Monate davor zusammen
|
||||
cl_supfoc_mon <- data.frame(code=NA,name=NA)
|
||||
cl_supfoc_mon[sprintf("%s", tspans)] <- 0
|
||||
no <- 0
|
||||
for (c in 6:ncol(cl)) {
|
||||
for (r in 1:nrow(cl)) {
|
||||
month <- names(cl)[c]
|
||||
code <- as.character(cl$code[r])
|
||||
name <- as.character(cl$name[r])
|
||||
|
||||
# Conditions to fulfill
|
||||
status1 <- cl[r,c] > cl[r,c-1]+cl[r,c-2]+cl[r,c-3]
|
||||
status2 <- cl[r,c] > 50
|
||||
|
||||
if (status1 && status2) {
|
||||
no <- no + 1
|
||||
cl_supfoc_mon[no, "code"] <- code
|
||||
cl_supfoc_mon$name[no] <- name
|
||||
#if (is.null(cl_supfoc_mon[no, month])) { cl_supfoc_mon[no, month] <- 0}
|
||||
#cl_supfoc_mon[no, month] <- cl_supfoc_mon[no, month] + 1
|
||||
cl_supfoc_mon[no, month] <- 1
|
||||
cat("[",no,"] ",as.character(cl$code[r]),": >m-(1:3) && >50 ", month,"\n", sep = "")
|
||||
}
|
||||
}
|
||||
rm(r,c,month,status1,status2)
|
||||
}
|
||||
rm(no, code, name)
|
||||
|
||||
# Clean cl_supfoc_mon: Replace NAs by 0, and sum up multiple appeared countries
|
||||
cl_supfoc_mon[is.na(cl_supfoc_mon)] <- 0
|
||||
cl_supfoc_mon <- ddply(cl_supfoc_mon,c("code", "name"),numcolwise(sum))
|
||||
|
||||
# Delete all month-columns with 0 surprising events
|
||||
cl_supfoc_only_mon <- removeZeroMonths(cl_supfoc_mon, 3, ncol(cl_supfoc_mon))
|
||||
|
||||
|
||||
|
||||
|
||||
# Get total surprising newsfocuses for each country
|
||||
cl_supfoc_total <- data.frame(code=NA, name=NA, total=NA)
|
||||
for (r in 1:nrow(cl_supfoc_mon)) {
|
||||
total <- 0
|
||||
cl_supfoc_total[r,"code"] <- as.character(cl_supfoc_mon$code[r])
|
||||
cl_supfoc_total[r,"name"] <- as.character(cl_supfoc_mon$name[r])
|
||||
for (c in 3:ncol(cl_supfoc_mon)) {
|
||||
total <- total + as.numeric(cl_supfoc_mon[r,c])
|
||||
}
|
||||
cl_supfoc_total[r,"total"] <- total
|
||||
}
|
||||
rm(r, total, c)
|
||||
|
||||
|
||||
|
||||
|
||||
# Total highlights per month + turn around table for graphs
|
||||
cl_supfoc_mon["highlight"] <- 1
|
||||
cl_supfoc_turn_mon <- ddply(cl_supfoc_mon,"highlight", numcolwise(sum))
|
||||
cl_supfoc_mon$highlight <- NULL
|
||||
cl_supfoc_turn_mon$highlight <- NULL
|
||||
cl_supfoc_turn_mon <- data.frame(month = names(cl_supfoc_turn_mon), highs = as.numeric(cl_supfoc_turn_mon[1,]))
|
||||
# Convert %d.%y to valid date class
|
||||
months <- NULL
|
||||
for (m in 1:length(tspans)) {
|
||||
dates <- str_c("15.",tspans[m])
|
||||
months[m] <- dates
|
||||
}
|
||||
rm(m, dates)
|
||||
cl_supfoc_turn_mon$month <- as.Date(months, format = "%d.%m.%Y")
|
||||
rm(months)
|
||||
|
||||
# Delete all month-rows with 0 surprising events
|
||||
cl_supfoc_turn_only_mon <- cl_supfoc_turn_mon[!cl_supfoc_turn_mon$highs == 0,]
|
||||
rownames(cl_supfoc_turn_only_mon) <- NULL
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# # Replace 0s by NAs
|
||||
# for (r in 1:180) {
|
||||
# if (! is.na(cl_total2$highs[r])) {
|
||||
# if (cl_total2$highs[r] == 0) {
|
||||
# cl_total2$highs[r] <- NA
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
|
||||
80
R-Code/newsfokus-4-visual.R
Normal file
80
R-Code/newsfokus-4-visual.R
Normal file
@@ -0,0 +1,80 @@
|
||||
require(rworldmap)
|
||||
require(ggplot2)
|
||||
|
||||
theCountries <- c("DE", "US", "BR")
|
||||
# These are the ISO3 names of the countries you'd like to plot in red
|
||||
|
||||
malDF <- data.frame(country = c("DE", "US", "BR", "ZA"), malaria = c(2000, 2001, 2002, 2002), news = c(2, 3, 0, 1))
|
||||
# malDF is a data.frame with the ISO3 country names plus a variable to
|
||||
# merge to the map data
|
||||
|
||||
malMap <- joinCountryData2Map(malDF, joinCode = "ISO2", nameJoinColumn = "country")
|
||||
# This will join your malDF data.frame to the country map data
|
||||
|
||||
mapCountryData(malMap, nameColumnToPlot="malaria", catMethod = "categorical", missingCountryCol = gray(.8))
|
||||
# And this will plot it, with the trick that the color palette's first
|
||||
# color is red
|
||||
|
||||
|
||||
|
||||
# Absolute Frequ Newsfocus Map --------------------------------------------
|
||||
|
||||
|
||||
# # Absolute Häufigkeiten der Highlights mit Bubbles
|
||||
# malMap <- joinCountryData2Map(cl_supfoc_total, joinCode = "ISO2", nameJoinColumn = "code")
|
||||
# mapBubbles( dF=malMap, nameZSize="total",nameZColour="GEO3major",
|
||||
# colourPalette=c("black", "yellow", "blue", "orange", "red", "white", "green"),
|
||||
# oceanCol="lightblue",
|
||||
# landCol="wheat",
|
||||
# fill=TRUE,
|
||||
# symbolSize=0.5,
|
||||
# pch=21)
|
||||
|
||||
# Absolute Newfokus Häufigkeiten Welt:
|
||||
absMap <- joinCountryData2Map(cl_supfoc_total, joinCode = "ISO2", nameJoinColumn = "code", verbose=TRUE)
|
||||
mapCountryData(absMap, nameColumnToPlot="total", catMethod="fixedWidth",
|
||||
numCats=5,
|
||||
mapTitle="Anzahl Überraschungsfokusse weltweit",
|
||||
oceanCol="lightblue",
|
||||
missingCountryCol=gray(.9)
|
||||
)
|
||||
|
||||
# Absolute Newfokus Häufigkeiten EU-Asien-Nordafrika:
|
||||
mapCountryData(absMap, nameColumnToPlot="total", catMethod="fixedWidth",
|
||||
numCats=5,
|
||||
mapTitle="Anzahl Überraschungsfokusse Nordafrika und Asien",
|
||||
oceanCol="lightblue",
|
||||
missingCountryCol=gray(.9),
|
||||
xlim=c(10,140),
|
||||
ylim=c(30,70)
|
||||
)
|
||||
|
||||
|
||||
|
||||
# Development Newsfocus over time -----------------------------------------
|
||||
|
||||
|
||||
# Entwicklung der Surprising Focuses über die Jahre
|
||||
cc <- ggplot(cl_supfoc_turn_mon, aes(month,highs))
|
||||
cc <- cc + geom_histogram(fill="steelblue", stat="identity")
|
||||
cc <- cc + stat_smooth(size=1,colour="red",method="loess", se=FALSE)
|
||||
cc <- cc + ggtitle("Zeitliche Entwicklung von plötzlichen Medienfokussen") + xlab("Einzelne Monate") + ylab("Plötzliche Medienfokusse")
|
||||
cc
|
||||
|
||||
|
||||
# Beispiel1: Jährliche Durchschnittsanzahl 2000-2014 der Nachrichten über Syrien
|
||||
yearspan <- 2000:2014
|
||||
avergdf <- getAverages(df = cl_stats, codecol = "code", code = "SY", yearspan = yearspan)
|
||||
|
||||
averg <- ggplot(data = avergdf, aes(x = year, y=averg))
|
||||
averg + geom_line() + ggtitle("Durchschnittliche Nachrichten pro Jahr über Syrien") + xlab("Jahre") + ylab("Durchschnittliche Nachrichten")
|
||||
|
||||
# Beispiel2: Jährliche Durchschnittsanzahl 2000-2014 der Nachrichten über Israel
|
||||
yearspan <- 2000:2014
|
||||
avergdf <- getAverages(df = cl_stats, codecol = "code", code = "IL", yearspan = yearspan)
|
||||
|
||||
averg <- ggplot(data = avergdf, aes(x = year, y=averg))
|
||||
averg + geom_line() + ggtitle("Durchschnittliche Nachrichten pro Jahr über Israel") + xlab("Jahre") + ylab("Durchschnittliche Nachrichten")
|
||||
|
||||
|
||||
|
||||
38
R-Code/newsfokus-countrylist-manualchanges.txt
Normal file
38
R-Code/newsfokus-countrylist-manualchanges.txt
Normal file
@@ -0,0 +1,38 @@
|
||||
"Nordkorea";"KP"
|
||||
"Südkorea";"KR"
|
||||
"Zaire";"CD"
|
||||
"Großbritannien";"GB"
|
||||
"England";"GB"
|
||||
"Russland";"RU"
|
||||
"Palästina";"PS"
|
||||
"Gazastreifen";"PS"
|
||||
"Burma";"MM"
|
||||
"Moldau";"MD"
|
||||
"Laos";"LA"
|
||||
"Kosovo";"RS"
|
||||
"Kongo";"CD"
|
||||
"Kongo-Brazzaville";"CG"
|
||||
"Iran";"IR"
|
||||
"Isle of Man";"IM"
|
||||
"DDR";"DD"
|
||||
"Elfenbeinküste";"CI"
|
||||
"China";"CN"
|
||||
"Weißrussland";"BY"
|
||||
"Antarktis";"AQ"
|
||||
"Krim";"UA"
|
||||
"UdSSR";"RU"
|
||||
"USA";"US"
|
||||
"BRD";"DE"
|
||||
"Nordirland";"IE"
|
||||
"Nordzypern";"CY"
|
||||
"Syrien";"SY"
|
||||
"VA";"Vatikan"
|
||||
"TZ";"Tansania"
|
||||
"EA";"Ceuta"
|
||||
"EA";"Melilla"
|
||||
|
||||
|
||||
"ENTFERNT Neutrale Zone";"NT"
|
||||
"ENTFERNT Europäische Gemeinschaft";"CE"
|
||||
"ENTFERNT Europäische Union";"EU"
|
||||
"ENTFERNT Burma";"BU"
|
||||
295
R-Code/newsfokus-countrylist.txt
Normal file
295
R-Code/newsfokus-countrylist.txt
Normal file
@@ -0,0 +1,295 @@
|
||||
"AC";"Ascension"
|
||||
"AD";"Andorra"
|
||||
"AE";"Vereinigte Arabische Emirate"
|
||||
"AF";"Afghanistan"
|
||||
"AG";"Antigua und Barbuda"
|
||||
"AI";"Anguilla"
|
||||
"AL";"Albanien"
|
||||
"AM";"Armenien"
|
||||
"AN";"Niederländische Antillen"
|
||||
"AO";"Angola"
|
||||
"AQ";"Antarktis"
|
||||
"AQ";"Antarktika"
|
||||
"AR";"Argentinien"
|
||||
"AS";"Amerikanisch-Samoa"
|
||||
"AT";"Österreich"
|
||||
"AU";"Australien"
|
||||
"AW";"Aruba"
|
||||
"AX";"Åland"
|
||||
"AZ";"Aserbaidschan"
|
||||
"BA";"Bosnien und Herzegowina"
|
||||
"BB";"Barbados"
|
||||
"BD";"Bangladesch"
|
||||
"BE";"Belgien"
|
||||
"BF";"Burkina Faso"
|
||||
"BG";"Bulgarien"
|
||||
"BH";"Bahrain"
|
||||
"BI";"Burundi"
|
||||
"BJ";"Benin"
|
||||
"BL";"Saint-Barthélemy"
|
||||
"BM";"Bermuda"
|
||||
"BN";"Brunei Darussalam"
|
||||
"BO";"Bolivien"
|
||||
"BQ";"Bonaire, Sint Eustatius und Saba"
|
||||
"BR";"Brasilien"
|
||||
"BS";"Bahamas"
|
||||
"BT";"Bhutan"
|
||||
"BV";"Bouvetinsel"
|
||||
"BW";"Botswana"
|
||||
"BY";"Belarus"
|
||||
"BY";"Weißrussland"
|
||||
"BZ";"Belize"
|
||||
"CA";"Kanada"
|
||||
"CC";"Kokosinseln"
|
||||
"CD";"Demokratische Republik Kongo"
|
||||
"CD";"Kongo, Demokratische Republik"
|
||||
"CD";"Kongo"
|
||||
"CD";"Zaire"
|
||||
"CF";"Zentralafrikanische Republik"
|
||||
"CG";"Republik Kongo"
|
||||
"CG";"Kongo-Brazzaville"
|
||||
"CH";"Schweiz"
|
||||
"CI";"Elfenbeinküste"
|
||||
"CI";"Côte d’Ivoire"
|
||||
"CK";"Cookinseln"
|
||||
"CL";"Chile"
|
||||
"CM";"Kamerun"
|
||||
"CN";"China"
|
||||
"CN";"China, Volksrepublik"
|
||||
"CO";"Kolumbien"
|
||||
"CP";"Clipperton"
|
||||
"CR";"Costa Rica"
|
||||
"CS";"Tschechoslowakei"
|
||||
"CS";"Serbien und Montenegro"
|
||||
"CU";"Kuba"
|
||||
"CV";"Kap Verde"
|
||||
"CW";"Curaçao"
|
||||
"CX";"Weihnachtsinsel"
|
||||
"CY";"Zypern"
|
||||
"CY";"Nordzypern"
|
||||
"CZ";"Tschechische Republik"
|
||||
"DD";"DDR"
|
||||
"DD";"Deutsche Demokratische Republik"
|
||||
"DE";"Deutschland"
|
||||
"DE";"BRD"
|
||||
"DG";"Diego Garcia"
|
||||
"DJ";"Dschibuti"
|
||||
"DK";"Dänemark"
|
||||
"DM";"Dominica"
|
||||
"DO";"Dominikanische Republik"
|
||||
"DZ";"Algerien"
|
||||
"EA";"Ceuta, Melilla"
|
||||
"EA";"Ceuta"
|
||||
"EA";"Melilla"
|
||||
"EC";"Ecuador"
|
||||
"EE";"Estland"
|
||||
"EG";"Ägypten"
|
||||
"EH";"Westsahara"
|
||||
"ER";"Eritrea"
|
||||
"ES";"Spanien"
|
||||
"ET";"Äthiopien"
|
||||
"FI";"Finnland"
|
||||
"FJ";"Fidschi"
|
||||
"FK";"Falklandinseln"
|
||||
"FM";"Mikronesien"
|
||||
"FO";"Färöer"
|
||||
"FR";"Frankreich"
|
||||
"FX";"Frankreich, France métropolitaine"
|
||||
"GA";"Gabun"
|
||||
"GB";"Großbritannien"
|
||||
"GB";"England"
|
||||
"GB";"Vereinigtes Königreich Großbritannien und Nordirland"
|
||||
"GD";"Grenada"
|
||||
"GE";"Georgien"
|
||||
"GF";"Französisch-Guayana"
|
||||
"GG";"Guernsey"
|
||||
"GH";"Ghana"
|
||||
"GI";"Gibraltar"
|
||||
"GL";"Grönland"
|
||||
"GM";"Gambia"
|
||||
"GN";"Guinea"
|
||||
"GP";"Guadeloupe"
|
||||
"GQ";"Äquatorialguinea"
|
||||
"GR";"Griechenland"
|
||||
"GS";"Südgeorgien und die Südlichen Sandwichinseln"
|
||||
"GT";"Guatemala"
|
||||
"GU";"Guam"
|
||||
"GW";"Guinea-Bissau"
|
||||
"GY";"Guyana"
|
||||
"HK";"Hongkong"
|
||||
"HM";"Heard und McDonaldinseln"
|
||||
"HN";"Honduras"
|
||||
"HR";"Kroatien"
|
||||
"HT";"Haiti"
|
||||
"HU";"Ungarn"
|
||||
"IC";"Kanarische Inseln"
|
||||
"ID";"Indonesien"
|
||||
"IE";"Irland"
|
||||
"IE";"Nordirland"
|
||||
"IL";"Israel"
|
||||
"IM";"Isle of Man"
|
||||
"IM";"Insel Man"
|
||||
"IN";"Indien"
|
||||
"IO";"Britisches Territorium im Indischen Ozean"
|
||||
"IQ";"Irak"
|
||||
"IR";"Iran"
|
||||
"IR";"Iran, Islamische Republik"
|
||||
"IS";"Island"
|
||||
"IT";"Italien"
|
||||
"JE";"Jersey"
|
||||
"JM";"Jamaika"
|
||||
"JO";"Jordanien"
|
||||
"JP";"Japan"
|
||||
"KE";"Kenia"
|
||||
"KG";"Kirgisistan"
|
||||
"KH";"Kambodscha"
|
||||
"KI";"Kiribati"
|
||||
"KM";"Komoren"
|
||||
"KN";"St. Kitts und Nevis"
|
||||
"KP";"Nordkorea"
|
||||
"KP";"Korea, Demokratische Volksrepublik"
|
||||
"KR";"Südkorea"
|
||||
"KR";"Korea, Republik"
|
||||
"KW";"Kuwait"
|
||||
"KY";"Kaimaninseln"
|
||||
"KZ";"Kasachstan"
|
||||
"LA";"Laos"
|
||||
"LA";"Laos, Demokratische Volksrepublik"
|
||||
"LB";"Libanon"
|
||||
"LC";"St. Lucia"
|
||||
"LI";"Liechtenstein"
|
||||
"LK";"Sri Lanka"
|
||||
"LR";"Liberia"
|
||||
"LS";"Lesotho"
|
||||
"LT";"Litauen"
|
||||
"LU";"Luxemburg"
|
||||
"LV";"Lettland"
|
||||
"LY";"Libyen"
|
||||
"MA";"Marokko"
|
||||
"MC";"Monaco"
|
||||
"MD";"Moldawien"
|
||||
"ME";"Montenegro"
|
||||
"MF";"Saint-Martin"
|
||||
"MG";"Madagaskar"
|
||||
"MH";"Marshallinseln"
|
||||
"MK";"Mazedonien"
|
||||
"ML";"Mali"
|
||||
"MM";"Myanmar"
|
||||
"MM";"Burma"
|
||||
"MN";"Mongolei"
|
||||
"MO";"Macao"
|
||||
"MP";"Nördliche Marianen"
|
||||
"MQ";"Martinique"
|
||||
"MR";"Mauretanien"
|
||||
"MS";"Montserrat"
|
||||
"MT";"Malta"
|
||||
"MU";"Mauritius"
|
||||
"MV";"Malediven"
|
||||
"MW";"Malawi"
|
||||
"MD";"Moldau"
|
||||
"MX";"Mexiko"
|
||||
"MY";"Malaysia"
|
||||
"MZ";"Mosambik"
|
||||
"NA";"Namibia"
|
||||
"NC";"Neukaledonien"
|
||||
"NE";"Niger"
|
||||
"NF";"Norfolkinsel"
|
||||
"NG";"Nigeria"
|
||||
"NI";"Nicaragua"
|
||||
"NL";"Niederlande"
|
||||
"NO";"Norwegen"
|
||||
"NP";"Nepal"
|
||||
"NR";"Nauru"
|
||||
"NU";"Niue"
|
||||
"NZ";"Neuseeland"
|
||||
"OM";"Oman"
|
||||
"PA";"Panama"
|
||||
"PE";"Peru"
|
||||
"PF";"Französisch-Polynesien"
|
||||
"PG";"Papua-Neuguinea"
|
||||
"PH";"Philippinen"
|
||||
"PK";"Pakistan"
|
||||
"PL";"Polen"
|
||||
"PM";"Saint-Pierre und Miquelon"
|
||||
"PN";"Pitcairninseln"
|
||||
"PR";"Puerto Rico"
|
||||
"PS";"Staat Palästina"
|
||||
"PS";"Gazastreifen"
|
||||
"PS";"Palästina"
|
||||
"PT";"Portugal"
|
||||
"PW";"Palau"
|
||||
"PY";"Paraguay"
|
||||
"QA";"Katar"
|
||||
"RE";"Réunion"
|
||||
"RO";"Rumänien"
|
||||
"RS";"Serbien"
|
||||
"RS";"Kosovo"
|
||||
"RU";"Russland"
|
||||
"RU";"Russische Föderation"
|
||||
"RU";"UdSSR"
|
||||
"RW";"Ruanda"
|
||||
"SA";"Saudi-Arabien"
|
||||
"SB";"Salomonen"
|
||||
"SC";"Seychellen"
|
||||
"SD";"Sudan"
|
||||
"SE";"Schweden"
|
||||
"SG";"Singapur"
|
||||
"SH";"St. Helena"
|
||||
"SI";"Slowenien"
|
||||
"SJ";"Svalbard und Jan Mayen"
|
||||
"SK";"Slowakei"
|
||||
"SL";"Sierra Leone"
|
||||
"SM";"San Marino"
|
||||
"SN";"Senegal"
|
||||
"SO";"Somalia"
|
||||
"SR";"Suriname"
|
||||
"SS";"Südsudan"
|
||||
"ST";"São Tomé und Príncipe"
|
||||
"SV";"El Salvador"
|
||||
"SX";"Sint Maarten"
|
||||
"SY";"Syrien"
|
||||
"SY";"Syrien, Arabische Republik"
|
||||
"SZ";"Swasiland"
|
||||
"TA";"Tristan da Cunha"
|
||||
"TC";"Turks- und Caicosinseln"
|
||||
"TD";"Tschad"
|
||||
"TF";"Französische Süd- und Antarktisgebiete"
|
||||
"TG";"Togo"
|
||||
"TH";"Thailand"
|
||||
"TJ";"Tadschikistan"
|
||||
"TK";"Tokelau"
|
||||
"TL";"Osttimor"
|
||||
"TM";"Turkmenistan"
|
||||
"TN";"Tunesien"
|
||||
"TO";"Tonga"
|
||||
"TR";"Türkei"
|
||||
"TT";"Trinidad und Tobago"
|
||||
"TV";"Tuvalu"
|
||||
"TW";"Republik China"
|
||||
"TZ";"Tansania"
|
||||
"TZ";"Tansania, Vereinigte Republik"
|
||||
"UA";"Ukraine"
|
||||
"UA";"Krim"
|
||||
"UG";"Uganda"
|
||||
"UM";"United States Minor Outlying Islands"
|
||||
"US";"USA"
|
||||
"US";"Vereinigte Staaten von Amerika"
|
||||
"UY";"Uruguay"
|
||||
"UZ";"Usbekistan"
|
||||
"VA";"Vatikanstadt"
|
||||
"VA";"Vatikan"
|
||||
"VC";"St. Vincent und die Grenadinen"
|
||||
"VE";"Venezuela"
|
||||
"VG";"Britische Jungferninseln"
|
||||
"VI";"Amerikanische Jungferninseln"
|
||||
"VN";"Vietnam"
|
||||
"VU";"Vanuatu"
|
||||
"WF";"Wallis und Futuna"
|
||||
"WS";"Samoa"
|
||||
"YE";"Jemen"
|
||||
"YT";"Mayotte"
|
||||
"YU";"Jugoslawien"
|
||||
"ZA";"Südafrika"
|
||||
"ZM";"Sambia"
|
||||
"ZR";"Zaire"
|
||||
"ZW";"Simbabwe"
|
||||
127
R-Code/newsfokus-functions.R
Normal file
127
R-Code/newsfokus-functions.R
Normal file
@@ -0,0 +1,127 @@
|
||||
require(stringr)
|
||||
require(RCurl)
|
||||
|
||||
|
||||
# Download `url` into the cache file `dest` (skipping the download when a
# cached copy already exists, to save bandwidth) and return the parsed
# HTML document. Relies on the globally defined curl `handle` for
# polite scraping headers.
dlParsePage <- function(url, dest) {
  if (!file.exists(dest)) {
    raw_html <- getURL(url, curl = handle)
    write(raw_html, dest)
  }
  htmlParse(dest, encoding = "UTF-8")
}
|
||||
|
||||
# Calculate the number of days in a specific month
|
||||
# (Important for manipulating the search URL because otherwise there would be articles
|
||||
# of the NEXT month in the search results if we have a too large time span)
|
||||
# Number of days in the month described by column i of `df`, whose name
# has the form "<month>.<year>" (e.g. "2.2003"). Needed so the search
# URL ends exactly on the month's last day; otherwise the results would
# bleed into the next month.
getDays <- function(i, df) {
  tspan <- names(df)[i]
  cur_month <- as.numeric(str_extract(tspan, "^\\d+\\>"))
  cur_year <- as.numeric(str_extract(tspan, "\\d{4}"))

  # First day of the following month (December rolls over into January).
  if (cur_month == 12) {
    nxt_month <- 1
    nxt_year <- cur_year + 1
  } else {
    nxt_month <- cur_month + 1
    nxt_year <- cur_year
  }

  first_this <- as.Date(str_c(cur_year, cur_month, "01", sep = "-"))
  first_next <- as.Date(str_c(nxt_year, nxt_month, "01", sep = "-"))
  as.numeric(difftime(first_next, first_this))
}
|
||||
|
||||
# Return the country code at position i of the vector `df`.
# Namibia's ISO code is the literal string "NA", which R reads back as a
# missing value; map genuine NAs to "NA" so Namibia is counted correctly.
getCode <- function(i, df) {
  value <- as.character(df[i])
  if (is.na(value)) {
    value <- "NA"
  }
  value
}
|
||||
|
||||
# NAs get converted into "NA" for Namibia
|
||||
# Work around the Namibia quirk: its ISO code "NA" is read back from
# data frames as a missing value. Convert a real NA into the literal
# string "NA"; everything else passes through unchanged.
namibiaBug <- function(x) {
  if (is.na(x)) "NA" else x
}
|
||||
|
||||
|
||||
# Get all headlines for a specific country and a specific month
|
||||
# Print all stored headlines for the country `descountry` in the month
# column `month` of data frame `df` (one entry per row whose
# `countrycol` value matches). Output goes to the console via cat();
# the function does not return a useful value.
getHeadlines <- function(df, countrycol, descountry, month) {
  # Normalise the target code once — the original re-applied
  # namibiaBug() to the loop-invariant `descountry` on every iteration.
  descountry <- namibiaBug(descountry)
  no <- 0
  news <- NULL
  # seq_len() instead of 1:nrow(df): 1:0 would loop over c(1, 0) and
  # index a nonexistent row when the data frame is empty.
  for (r in seq_len(nrow(df))) {
    curcountry <- namibiaBug(as.character(df[r, countrycol]))
    if (curcountry == descountry) {
      no <- no + 1
      news[no] <- as.character(df[r, month])
    }
  }
  cat(news)
}
|
||||
|
||||
|
||||
# Remove months with 0 surprising newsfocuses
|
||||
# Remove month columns (positions mincol..maxcol of `df`) whose values
# sum to 0 — i.e. months in which no country produced a surprising
# newsfocus. Returns `df` without those columns.
#
# Fixes over the original: the nested row loop (O(rows x cols) with a
# per-cell accumulator) is replaced by vectorized column sums; an empty
# data frame no longer trips the 1:nrow() trap; and an NA cell no longer
# makes `if (no == 0)` error out mid-loop.
removeZeroMonths <- function(df, mincol, maxcol) {
  idx <- mincol:maxcol
  totals <- vapply(idx, function(j) sum(as.numeric(df[[j]]), na.rm = TRUE),
                   numeric(1))
  delmonth <- names(df)[idx[totals == 0]]
  df[!names(df) %in% delmonth]
}
|
||||
|
||||
# Get countries which produced newsfocuses in a given month
|
||||
# Return the names (df$name) of the countries that produced at least one
# surprising newsfocus in the given `month` column; NULL when none did.
# Prints a notice and returns NULL when `month` is not a column of `df`.
getFocusCountries <- function(df, month) {
  # Exact column lookup. The original looped over names(df) with
  # str_detect(month, names(df)[c]), treating each column name as a
  # regular expression — the "." in names like "1.2000" matches any
  # character, so e.g. "1.2000" also matched inside "11.2000".
  if (!month %in% names(df)) {
    return(cat(month, "isn't a valid column in given dataframe"))
  }
  no <- 0
  countries <- NULL
  # seq_len() avoids the 1:0 trap on an empty data frame.
  for (r in seq_len(nrow(df))) {
    if (df[r, month] > 0) {
      no <- no + 1
      countries[no] <- as.character(df[r, "name"])
    }
  }
  return(countries)
}
|
||||
|
||||
# Get average development of news over a span of years
|
||||
# Build a data frame (columns: year, averg) of the yearly average news
# counts for country `code`, read from the "<year>-averg" columns of
# `df` (as produced by the analysis script).
getAverages <- function(df, codecol, code, yearspan) {
  code <- namibiaBug(code)
  averg <- NULL
  for (r in seq_len(nrow(df))) {
    curcode <- namibiaBug(df[r, codecol])
    if (curcode == code) {
      # One average per requested year from that row's "<year>-averg"
      # columns (paste0 replaces the stringr str_c call).
      averg <- vapply(yearspan,
                      function(y) as.numeric(df[r, paste0(y, "-averg")]),
                      numeric(1))
      # Stop at the first matching row: the original kept appending for
      # every duplicate code row, making `averg` longer than `yearspan`
      # and crashing the data.frame() construction below.
      break
    }
  }
  data.frame(year = yearspan, averg = averg)
}
|
||||
Reference in New Issue
Block a user