Files
uni-surprising-newsfocus/R-Code/newsfokus-2-scraping.R

186 lines
6.6 KiB
R
Raw Permalink Normal View History

2014-11-28 18:05:12 +01:00
# Dependencies: XML for HTML parsing, RCurl for polite downloads, stringr
# for string handling. library() (unlike require()) stops with an error if
# a package is missing instead of silently returning FALSE.
library(XML)
library(RCurl)
library(stringr)
# NOTE(review): hard-coded setwd() ties the script to one machine; kept for
# backward compatibility, but consider launching R from the project directory.
setwd("~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus/")
# Import manually edited country list (semicolon-separated, no header:
# column 1 = country code, column 2 = country name)
cl <- read.csv("newsfokus-countrylist.txt", sep = ";", header = FALSE)
names(cl) <- c("code", "name")
# FUNCTIONS ---------------------------------------------------------------
# Project helpers used below: getDays(), dlParsePage(), getCode()
source("newsfokus-functions.R")
# PREPARATIONS ------------------------------------------------------------
# Generate all "month.year" labels since January 2000, e.g. "1.2000".
# outer() builds the 12 x 15 months-by-years grid; as.vector() reads it
# column-wise, i.e. all months of 2000 first, then 2001, ... -- the same
# order the original nested loops produced, but without growing a vector
# one element at a time inside a loop.
tspans <- as.vector(outer(1:12, 2000:2014, function(m, y) sprintf("%d.%d", m, y)))
# Remove future/incomplete months (data collection ended August 2014).
# These must be character strings: tspans is character, and the previous
# numeric vector c(9.2014, ...) only matched through implicit coercion
# inside %in%, which is fragile.
removemonths <- c("9.2014", "10.2014", "11.2014", "12.2014")
tspans <- tspans[! tspans %in% removemonths]
rm(removemonths)
# Create one zero-initialised count column per month, named after the month.
# (tspans is already character, so the former sprintf("%s", ...) wrapper was
# a no-op and has been dropped.)
cl[tspans] <- 0
# Copy the frame for headlines: same shape, but the month cells will later
# hold newline-separated headline strings instead of counts
headlines <- cl
# Curl handle identifying us politely to the server via From and User-Agent
# headers (str_c() around a single string was redundant and is removed)
handle <- getCurlHandle(httpheader = list(from = "max.mehl@uni.kn",
                                          'user-agent' = R.version$version.string
                                          )
                        )
# SCRAPING ALL THE NEWS \o/ ----------------------------------------------
# The procedure is as follows:
# 1. Build the search URL for each timespan and repeat for every existing
#    result page (often more than 100)
#    - the search page shows an error message once no more results exist
# 2. Download each page and parse it
#    - skipped if the file is already on disk, to save bandwidth
# 3. Search the text for occurrences of country names
# 3a. On a hit, increment that country's entry in the data frame by 1
#    - each country is counted at most once per article
#      (some country codes have several spellings)
#
# Simple progress counters: articles inspected, positive country matches,
# and result pages fetched -- all start at zero.
stat_news_all <- stat_news_pos <- stat_pages <- 0
# This loop does the scraping, searching and indexing of each month and country
# !!! Beware: it takes around 24 hours to finish for 1.2000 - 8.2014! Load
# backup_cl.RData (and optionally backup_headlines.RData) to skip this step.
# Main scraping loop: one iteration per month column of cl (columns 1-2 are
# code/name, month columns start at 3). For each month it pages through the
# Spiegel search results, counts country mentions into cl, and collects
# matching headlines plus shortened article URLs into the headlines frame.
for (i in 3:ncol(cl)) {
tspan <- names(cl)[i]
# Create folder for downloaded HTML files
dir.create(str_c("materials/",tspan), showWarnings = FALSE, recursive = TRUE)
# getDays() (project helper) -- presumably the number of days in this month,
# used as the toDate day below; TODO confirm in newsfokus-functions.R
days <- getDays(i, cl)
# Generate the month's base url
baseurl <- str_c("http://www.spiegel.de/suche/index.html?suchbegriff=+&quellenGroup=SPOX&suchbereich=kopftext&fromDate=1.", tspan, "&toDate=", days, ".", tspan, "&offsets=999999&pageNumber=")
# In every loop we start with page 1 again
page <- 1
# Now expand the URL by the page number UNTIL there's an error page
repeat {
cat("\n--> Processing page", page, "of timespan", tspan, "\n")
url <- str_c(baseurl, page)
dest <- str_c("materials/",tspan,"/",tspan,"-",page,".html")
# dlParsePage() (project helper): downloads url to dest and returns the
# parsed HTML document; per the procedure notes it skips the download when
# dest already exists -- confirm in newsfokus-functions.R
url_parsed <- dlParsePage(url, dest)
# The results page shows this h3 ("Your search returned no hits") once no
# further results exist; a non-NULL match means we are past the last page
status <- unlist(xpathSApply(url_parsed, "//h3[contains(text(), 'Ihre Suche ergab keinen Treffer')]"))
if (! is.null(status)) {
# If there's an error page, there're no further articles. Skip this loop and begin with next month
cat("Letzte Seite erreicht:", url, "\n")
break
}
# Page is valid, now split in single articles and search for countries in each title and teaser
headline <- xpathSApply(url_parsed, "//div[@class='search-teaser']//span[@class='headline']", xmlValue)
teaser <- xpathSApply(url_parsed, "//div[@class='search-teaser']/p", xmlValue)
url_arts <- xpathSApply(url_parsed, "//div[@class='search-teaser']/a", xmlGetAttr, "href")
url_short <- ""
# Combine headline and teaser to make it easier to search
teaser <- str_c(headline, teaser, sep=" ")
# No teasers on a non-error page: log it (probably a server-side 500 page)
# and move on to the next result page
if (length(teaser) == 0) {
errormsg <- str_c("Probably 500 error at: ", tspan,"-",page)
write(errormsg, "scraping-errors.log", append = TRUE)
rm(errormsg)
}
else {
# Analyse every single teaser/headline combination
for (t in 1:length(teaser)) {
yet <- "" # Did the country already appear in the article? Empty with each loop
string <- teaser[t]
# Check every country in the list against this article's text
for (c in 1:nrow(cl)) {
name <- as.character(cl$name[c]) # Name of the county to detect in the teaser
status <- str_detect(tolower(string), tolower(name)) # Does the country's name appear in the teaser?
if (status) { # yes
# getCode() (project helper): maps the row index to a country code
code <- getCode(c, cl$code)
cat("The string contains news from:", code, "\n")
# We only want to count a country once even if it appears multiple times in an article
already <- str_detect(yet, code) # Did the country already appear?
if (!already) { # no
yet <- str_c(yet, code, sep=" ")
cl[c , tspan] <- cl[c , tspan] + 1 # Count +1 to the number of appearances in the data frame
# Save headlines + links to a different data frame
# Shorten the article URL: path up to the last "/" plus the "a-<id>.html" part
url_short[t] <- str_extract(url_arts[t], ".+/")
url_short[t] <- str_c(url_short[t], str_extract(url_arts[t], "a\\-\\d+\\.html"))
new_headline_entry <- str_c(headline[t], " (", url_short[t], ")")
# Cell still holds its initial 0 -> first headline for this country/month;
# otherwise append on a new line
if (headlines[c , tspan] == 0) {
headlines[c , tspan] <- new_headline_entry
}
else {
headlines[c , tspan] <- str_c(headlines[c , tspan], "\n", new_headline_entry)
}
}
rm(code, already)
stat_news_pos <- stat_news_pos + 1
}
rm(c, name, status)
}
rm(t, yet, string)
stat_news_all <- stat_news_all +1
}
}
# Go to the next page
page <- page + 1
stat_pages <- stat_pages + 1
}
# Clean up loop-local objects (rm() only warns, not errors, for any that
# were never created this iteration, e.g. new_headline_entry with no match)
rm(i, tspan, days, baseurl, page, url, url_parsed, status, teaser, headline, dest, url_arts, url_short, new_headline_entry)
# Backup all data after each month
write.csv(cl, "backup_cl.csv")
write.csv(headlines, "backup_headlines.csv")
save(cl, file="backup_cl.RData")
save(headlines, file="backup_headlines.RData")
}
# End of huge for-loop
# Final backup: persist each frame both as CSV (human-readable) and as
# RData (exact R objects, loadable to skip the scraping step)
write.csv(cl, "backup_cl.csv")
save(cl, file = "backup_cl.RData")
write.csv(headlines, "backup_headlines.csv")
save(headlines, file = "backup_headlines.RData")