# Scrape Spiegel Online's search archive and count, per month, how often
# each country from a manually curated list appears in article headlines
# and teasers.

# library() (not require()) so a missing package fails loudly at startup.
library(XML)      # HTML parsing (htmlParse, xpathSApply)
library(RCurl)    # page downloads with a custom curl handle
library(stringr)  # string helpers (str_c, str_detect, str_extract)

# NOTE(review): hard-coded, machine-specific working directory --
# adjust (or replace with a project-relative path) before running.
setwd("~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus/")
|
# Import the manually edited country list (semicolon-separated, no header).
# Column 1: country code, column 2: country name.
cl <- read.csv("newsfokus-countrylist.txt", sep = ";", header = FALSE,
               col.names = c("code", "name"))


# FUNCTIONS ---------------------------------------------------------------

# Helper functions used below: getDays(), dlParsePage(), getCode()
source("newsfokus-functions.R")
|
# PREPARATIONS ------------------------------------------------------------

# Generate all month.year combinations since January 2000, in the same
# order the original nested year/month loop produced:
# "1.2000", "2.2000", ..., "12.2000", "1.2001", ..., "12.2014".
# (No zero padding: January is "1", matching the site's URL scheme.)
# Vectorized with paste0() instead of growing the vector with c() in a loop.
tspans <- unlist(lapply(2000:2014, function(y) paste0(1:12, ".", y)))
||
|
|
# # All combinations mmmYYYY (not used yet)
# years <- 2000:2014 # 15
# months <- month.abb
# no <- 0
# comb <- NULL
# for (m in 1:12) {
#   for (y in 1:15) {
#     no <- no + 1
#     comb[no] <- str_c(months[m], years[y])
#   }
# }
# rm(years, months, m, y, no)
|
||
|
|
|
||
|
|
# Remove future/incomplete months (data collection stopped in August 2014).
# Use character literals so they match the character vector tspans directly,
# instead of relying on silent numeric-to-string coercion as before
# (c(9.2014, ...) only happened to print as "9.2014").
removemonths <- c("9.2014", "10.2014", "11.2014", "12.2014")
tspans <- tspans[!tspans %in% removemonths]
rm(removemonths)
|
|
||
|
|
# Create a zero-initialised count column for every month.
# (The original wrapped tspans in sprintf("%s", ...), which is a no-op on a
# character vector -- index with it directly.)
cl[tspans] <- 0

# Copy of the data frame, used to collect matching headlines instead of counts
headlines <- cl

# Curl handle that identifies us (From + User-Agent headers) for polite scraping
handle <- getCurlHandle(httpheader = list(
  from = "max.mehl@uni.kn",
  "user-agent" = str_c(R.version$version.string)
))
|
|
|
||
|
|
# SCRAPING ALL THE NEWS \o/ ----------------------------------------------

# The procedure is as follows:
# 1. Build the search URL for each timespan and repeat for every existing
#    results page (usually more than 100)
#    - the search page shows an error message once no more results exist
# 2. Download the page and parse it
#    - skipped if it is already on disk, to save bandwidth
# 3. Search for occurrences of country names
# 3a. On a hit, increment that country's entry in the data frame by 1
#    - only one occurrence of a country per article is counted
#      (some country codes have several spellings)

# Running statistics: articles processed, positive country hits,
# and result pages processed.
stat_news_all <- 0
stat_news_pos <- 0
stat_pages <- 0
|
# This loop does the scraping, searching and indexing of each month and country.
# !!! Beware: It takes around 24 hours to finish for 1.2000 - 8.2014!
# Load backup_cl.RData (and optionally backup_headlines.RData) to skip this step.
# Columns 1:2 of cl are code/name, so the month columns start at index 3.
for (i in 3:ncol(cl)) {
  tspan <- names(cl)[i]

  # Create folder for downloaded HTML files
  dir.create(str_c("materials/", tspan), showWarnings = FALSE, recursive = TRUE)

  # Last day of this month (helper from newsfokus-functions.R)
  days <- getDays(i, cl)
  # Generate the month's base url
  baseurl <- str_c("http://www.spiegel.de/suche/index.html?suchbegriff=+&quellenGroup=SPOX&suchbereich=kopftext&fromDate=1.", tspan, "&toDate=", days, ".", tspan, "&offsets=999999&pageNumber=")

  # In every loop we start with page 1 again
  page <- 1

  # Now expand the URL by the page number UNTIL there's an error page
  repeat {
    cat("\n--> Processing page", page, "of timespan", tspan, "\n")

    url <- str_c(baseurl, page)
    dest <- str_c("materials/", tspan, "/", tspan, "-", page, ".html")
    # Download (or reuse a cached file) and parse the page
    # (helper from newsfokus-functions.R)
    url_parsed <- dlParsePage(url, dest)

    # "Ihre Suche ergab keinen Treffer" = "your search returned no hits"
    status <- unlist(xpathSApply(url_parsed, "//h3[contains(text(), 'Ihre Suche ergab keinen Treffer')]"))
    if (!is.null(status)) {
      # Error page means there are no further articles for this month:
      # leave the repeat loop and continue with the next month.
      cat("Letzte Seite erreicht:", url, "\n")
      break
    }

    # Page is valid: split into single articles and search each
    # headline/teaser combination for country names.
    headline <- xpathSApply(url_parsed, "//div[@class='search-teaser']//span[@class='headline']", xmlValue)
    teaser <- xpathSApply(url_parsed, "//div[@class='search-teaser']/p", xmlValue)
    url_arts <- xpathSApply(url_parsed, "//div[@class='search-teaser']/a", xmlGetAttr, "href")
    url_short <- ""
    # Combine headline and teaser to make them easier to search as one string
    teaser <- str_c(headline, teaser, sep = " ")
    if (length(teaser) == 0) {
      # Empty result on a non-error page: most likely a server error (HTTP 500)
      errormsg <- str_c("Probably 500 error at: ", tspan, "-", page)
      write(errormsg, "scraping-errors.log", append = TRUE)
      rm(errormsg)
    } else {
      # Analyse every single teaser/headline combination
      for (t in seq_along(teaser)) {
        yet <- "" # Country codes already counted for this article; reset per article
        string <- teaser[t]
        for (c in seq_len(nrow(cl))) {
          name <- as.character(cl$name[c]) # Country name to detect in the teaser
          # NOTE(review): `name` is used as a regular expression here; a name
          # containing regex metacharacters would misbehave -- consider fixed().
          status <- str_detect(tolower(string), tolower(name))

          if (status) { # the country's name appears in the teaser
            code <- getCode(c, cl$code) # helper from newsfokus-functions.R

            cat("The string contains news from:", code, "\n")

            # Only count a country once, even if it appears multiple times in
            # an article (several name spellings can map to one code).
            already <- str_detect(yet, code)
            if (!already) {
              yet <- str_c(yet, code, sep = " ")
              cl[c, tspan] <- cl[c, tspan] + 1 # +1 appearance in the data frame

              # Save headline + canonical short link to the headlines frame
              url_short[t] <- str_extract(url_arts[t], ".+/")
              url_short[t] <- str_c(url_short[t], str_extract(url_arts[t], "a\\-\\d+\\.html"))
              new_headline_entry <- str_c(headline[t], " (", url_short[t], ")")
              if (headlines[c, tspan] == 0) {
                # First hit in this cell: replace the numeric 0 placeholder
                headlines[c, tspan] <- new_headline_entry
              } else {
                headlines[c, tspan] <- str_c(headlines[c, tspan], "\n", new_headline_entry)
              }
            }
            rm(code, already)
            stat_news_pos <- stat_news_pos + 1
          }
          rm(c, name, status)
        }
        rm(t, yet, string)
        stat_news_all <- stat_news_all + 1
      }
    }

    # Go to the next page
    page <- page + 1
    stat_pages <- stat_pages + 1
  }
  rm(i, tspan, days, baseurl, page, url, url_parsed, status, teaser, headline, dest, url_arts, url_short, new_headline_entry)

  # Backup all data after each month
  write.csv(cl, "backup_cl.csv")
  write.csv(headlines, "backup_headlines.csv")
  save(cl, file = "backup_cl.RData")
  save(headlines, file = "backup_headlines.RData")
}
# End of huge for-loop
|
# End of huge for-loop
|
||
|
|
|
||
|
|
|
||
|
|
# Final backup of the results once the scraping loop has finished
write.csv(cl, file = "backup_cl.csv")
write.csv(headlines, file = "backup_headlines.csv")
save(cl, file = "backup_cl.RData")
save(headlines, file = "backup_headlines.RData")