# Dependencies: XML + RCurl for downloading/parsing search pages, stringr for
# string handling. library() errors immediately if a package is missing;
# require() would only return FALSE and let the script fail much later.
library(XML)
library(RCurl)
library(stringr)

# NOTE(review): hard-coded working directory — adjust for your machine.
setwd("~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus/")

# Import manually edited country list (semicolon-separated, no header: code;name)
cl <- read.csv("newsfokus-countrylist.txt", sep = ";", header = FALSE)
names(cl) <- c("code", "name")

# FUNCTIONS ---------------------------------------------------------------
# Provides the helpers used by the scraping loop below
# (getDays(), dlParsePage(), getCode()).
source("newsfokus-functions.R")

# PREPARATIONS ------------------------------------------------------------

# Generate all "month.year" combinations since January 2000, month varying
# fastest: "1.2000", "2.2000", ..., "12.2000", "1.2001", ..., "12.2014".
# Vectorized via outer() instead of growing the vector inside a nested loop.
tspans <- as.vector(outer(1:12, 2000:2014, function(m, y) str_c(m, ".", y)))

# # All combinations mmmYYYY (not used yet)
# years <- 2000:2014 # 15
# months <- month.abb
# no <- 0
# comb <- NULL
# for (m in 1:12) {
#   for (y in 1:15) {
#     no <- no + 1
#     comb[no] <- str_c(months[m], years[y])
#   }
# }
# rm(years, months, m, y, no)

# Remove future/incomplete months. Declared as character strings so the
# comparison against the character vector tspans is exact — the previous
# numeric literals (9.2014, 10.2014, ...) only matched via the accident of
# default numeric-to-character formatting.
removemonths <- c("9.2014", "10.2014", "11.2014", "12.2014")
tspans <- tspans[! tspans %in% removemonths]
rm(removemonths)

# Create one counter column per month, initialized to 0; column names are the
# "month.year" strings themselves (sprintf("%s", x) was an identity call).
cl[tspans] <- 0

# Copy data frame for headlines: same code/name/month layout, but the month
# cells will hold newline-separated headline strings instead of counts.
headlines <- cl

# Set curl handle for friendly scraping: identify ourselves to the server via
# From and User-Agent headers.
handle <- getCurlHandle(httpheader = list(from = "max.mehl@uni.kn",
                                          'user-agent' = str_c(R.version$version.string)))

# SCRAPING ALL THE NEWS \o/ ----------------------------------------------
# The procedure is as follows:
# 1. Build the URL for each timespan and repeat for every existing results
#    page (usually more than 100)
#    - The search page shows an error message once no more results exist
# 2. Download the page and parse it
#    - Skipped if the file already exists, to save bandwidth
# 3. Search for occurrences of country names
# 3a.
# 3a. If yes, increment the matching entry in the data frame by 1
#     - a country is only counted once per article
#       (some country codes have several alternative spellings)

# Running counters: all articles inspected, country hits, and result pages
# processed.
stat_news_all <- 0
stat_news_pos <- 0
stat_pages <- 0

# This loop does the scraping, searching and indexing of each month and country.
# !!! Beware: It takes around 24 hours to finish for 1.2000 - 8.2014! Load
# backup_cl.RData (and optionally backup_headlines.RData) to skip this step.
# Columns 1-2 of cl are "code"/"name"; every column from 3 onward is one month.
for (i in 3:ncol(cl)) {
  tspan <- names(cl)[i]  # month label, e.g. "1.2000"

  # Create folder for downloaded HTML files
  dir.create(str_c("materials/", tspan), showWarnings = FALSE, recursive = TRUE)

  # Last day of this month — helper from newsfokus-functions.R;
  # presumably derives it from the column name, verify there.
  days <- getDays(i, cl)

  # Generate the month's base url: search over the full month (1st to last
  # day); the page number is appended inside the repeat loop below.
  baseurl <- str_c("http://www.spiegel.de/suche/index.html?suchbegriff=+&quellenGroup=SPOX&suchbereich=kopftext&fromDate=1.", tspan, "&toDate=", days, ".", tspan, "&offsets=999999&pageNumber=")

  # In every loop we start with page 1 again
  page <- 1

  # Now expand the URL by the page number UNTIL there's an error page
  repeat {
    cat("\n--> Processing page", page, "of timespan", tspan, "\n")
    url <- str_c(baseurl, page)
    dest <- str_c("materials/", tspan, "/", tspan, "-", page, ".html")

    # Download (or reuse an already-downloaded file) and parse the HTML;
    # helper defined in newsfokus-functions.R.
    url_parsed <- dlParsePage(url, dest)

    # The results page shows this <h3> when there are no (more) hits
    # ("Ihre Suche ergab keinen Treffer" = "your search returned no results").
    status <- unlist(xpathSApply(url_parsed, "//h3[contains(text(), 'Ihre Suche ergab keinen Treffer')]"))
    if (! is.null(status)) {
      # If there's an error page, there are no further articles.
      # Leave this repeat loop and begin with the next month.
      # ("Letzte Seite erreicht" = "last page reached")
      cat("Letzte Seite erreicht:", url, "\n")
      break
    }

    # Page is valid, now split it into single articles and search for
    # countries in each title and teaser.
    headline <- xpathSApply(url_parsed, "//div[@class='search-teaser']//span[@class='headline']", xmlValue)
    teaser <- xpathSApply(url_parsed, "//div[@class='search-teaser']/p", xmlValue)
    url_arts <- xpathSApply(url_parsed, "//div[@class='search-teaser']/a", xmlGetAttr, "href")
    url_short <- ""

    # Combine headline and teaser so one str_detect() covers both
    teaser <- str_c(headline, teaser, sep = " ")

    if (length(teaser) == 0) {
      # No teasers despite no "no hits" marker: most likely a server error.
      # Log it and carry on with the next page.
      errormsg <- str_c("Probably 500 error at: ", tspan, "-", page)
      write(errormsg, "scraping-errors.log", append = TRUE)
      rm(errormsg)
    } else {
      # Analyse every single teaser/headline combination
      for (t in 1:length(teaser)) {
        yet <- ""  # codes already counted for this article; reset per article
        string <- teaser[t]
        for (c in 1:nrow(cl)) {
          # NOTE(review): loop variable `c` shadows base::c() inside this body.
          name <- as.character(cl$name[c])  # country name to detect in the teaser
          status <- str_detect(tolower(string), tolower(name))  # case-insensitive match?
          if (status) {  # yes
            code <- getCode(c, cl$code)  # country code for row c (helper)
            cat("The string contains news from:", code, "\n")
            # We only want to count a country once even if it appears
            # multiple times in an article
            already <- str_detect(yet, code)  # Did the country already appear?
            if (!already) {  # no
              yet <- str_c(yet, code, sep = " ")
              cl[c , tspan] <- cl[c , tspan] + 1  # count +1 appearance in the data frame

              # Save headlines + links to a different data frame: shorten the
              # article URL to "<path>/a-<id>.html" before storing it.
              url_short[t] <- str_extract(url_arts[t], ".+/")
              url_short[t] <- str_c(url_short[t], str_extract(url_arts[t], "a\\-\\d+\\.html"))
              new_headline_entry <- str_c(headline[t], " (", url_short[t], ")")
              # Cell still holds its initial 0 -> first entry for this
              # country/month; otherwise append on a new line.
              if (headlines[c , tspan] == 0) {
                headlines[c , tspan] <- new_headline_entry
              } else {
                headlines[c , tspan] <- str_c(headlines[c , tspan], "\n", new_headline_entry)
              }
            }
            rm(code, already)
            stat_news_pos <- stat_news_pos + 1
          }
          rm(c, name, status)
        }
        rm(t, yet, string)
        stat_news_all <- stat_news_all + 1
      }
    }
    # Go to the next page
    page <- page + 1
    stat_pages <- stat_pages + 1
  }

  # NOTE(review): this rm() warns for any month without a single country hit,
  # because new_headline_entry is then never created.
  rm(i, tspan, days, baseurl, page, url, url_parsed, status, teaser, headline, dest, url_arts, url_short, new_headline_entry)

  # Backup all data after each month
  write.csv(cl, "backup_cl.csv")
  write.csv(headlines, "backup_headlines.csv")
  save(cl, file = "backup_cl.RData")
  save(headlines, file = "backup_headlines.RData")
}  # End of huge for-loop

# Final Backup
write.csv(cl, "backup_cl.csv")
write.csv(headlines, "backup_headlines.csv")
save(cl, file = "backup_cl.RData")
save(headlines, file = "backup_headlines.RData")