Files
uni-surprising-newsfocus/R-Code/newsfokus-2-scraping.R

186 lines
6.6 KiB
R
Raw Permalink Normal View History

2014-11-28 18:05:12 +01:00
# Dependencies: XML for HTML parsing, RCurl for polite downloads, stringr
# for string handling. library() (unlike require()) stops with an error if
# a package is missing instead of silently returning FALSE.
library(XML)
library(RCurl)
library(stringr)
# NOTE(review): hard-coded setwd() ties the script to one machine; kept for
# backward compatibility, but consider launching R from the project directory.
setwd("~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus/")
# Import manually edited country list (semicolon-separated, no header:
# column 1 = country code, column 2 = country name)
cl <- read.csv("newsfokus-countrylist.txt", sep = ";", header = FALSE)
names(cl) <- c("code", "name")
# FUNCTIONS ---------------------------------------------------------------
# Project helpers used below: getDays(), dlParsePage(), getCode()
source("newsfokus-functions.R")
# PREPARATIONS ------------------------------------------------------------
# Generate all "month.year" labels since January 2000, e.g. "1.2000".
# outer() builds the 12 x 15 months-by-years grid; as.vector() reads it
# column-wise, i.e. all months of 2000 first, then 2001, ... -- the same
# order the original nested loops produced, but without growing a vector
# one element at a time inside a loop.
tspans <- as.vector(outer(1:12, 2000:2014, function(m, y) sprintf("%d.%d", m, y)))
# Remove future/incomplete months (data collection ended August 2014).
# These must be character strings: tspans is character, and the previous
# numeric vector c(9.2014, ...) only matched through implicit coercion
# inside %in%, which is fragile.
removemonths <- c("9.2014", "10.2014", "11.2014", "12.2014")
tspans <- tspans[! tspans %in% removemonths]
rm(removemonths)
# Create one zero-initialised count column per month, named after the month.
# (tspans is already character, so the former sprintf("%s", ...) wrapper was
# a no-op and has been dropped.)
cl[tspans] <- 0
# Copy the frame for headlines: same shape, but the month cells will later
# hold newline-separated headline strings instead of counts
headlines <- cl
# Curl handle identifying us politely to the server via From and User-Agent
# headers (str_c() around a single string was redundant and is removed)
handle <- getCurlHandle(httpheader = list(from = "max.mehl@uni.kn",
                                          'user-agent' = R.version$version.string
                                          )
                        )
# SCRAPING ALL THE NEWS \o/ ----------------------------------------------
# The procedure is as follows:
# 1. Build the search URL for each timespan and repeat for every existing
#    result page (often more than 100)
#    - the search page shows an error message once no more results exist
# 2. Download each page and parse it
#    - skipped if the file is already on disk, to save bandwidth
# 3. Search the text for occurrences of country names
# 3a. On a hit, increment that country's entry in the data frame by 1
#    - each country is counted at most once per article
#      (some country codes have several spellings)
#
# Simple progress counters: articles inspected, positive country matches,
# and result pages fetched -- all start at zero.
stat_news_all <- stat_news_pos <- stat_pages <- 0
# This loop does the scraping, searching and indexing of each month and country
# !!! Beware: it takes around 24 hours to finish for 1.2000 - 8.2014! Load
# backup_cl.RData (and optionally backup_headlines.RData) to skip this step.
# Main scraping loop: one iteration per month column of cl (columns 1-2 are
# code/name, month columns start at 3). For each month it pages through the
# Spiegel search results, counts country mentions into cl, and collects
# matching headlines plus shortened article URLs into the headlines frame.
for (i in 3:ncol(cl)) {
tspan <- names(cl)[i]
# Create folder for downloaded HTML files
dir.create(str_c("materials/",tspan), showWarnings = FALSE, recursive = TRUE)
# getDays() (project helper) -- presumably the number of days in this month,
# used as the toDate day below; TODO confirm in newsfokus-functions.R
days <- getDays(i, cl)
# Generate the month's base url
baseurl <- str_c("http://www.spiegel.de/suche/index.html?suchbegriff=+&quellenGroup=SPOX&suchbereich=kopftext&fromDate=1.", tspan, "&toDate=", days, ".", tspan, "&offsets=999999&pageNumber=")
# In every loop we start with page 1 again
page <- 1
# Now expand the URL by the page number UNTIL there's an error page
repeat {
cat("\n--> Processing page", page, "of timespan", tspan, "\n")
url <- str_c(baseurl, page)
dest <- str_c("materials/",tspan,"/",tspan,"-",page,".html")
# dlParsePage() (project helper): downloads url to dest and returns the
# parsed HTML document; per the procedure notes it skips the download when
# dest already exists -- confirm in newsfokus-functions.R
url_parsed <- dlParsePage(url, dest)
# The results page shows this h3 ("Your search returned no hits") once no
# further results exist; a non-NULL match means we are past the last page
status <- unlist(xpathSApply(url_parsed, "//h3[contains(text(), 'Ihre Suche ergab keinen Treffer')]"))
if (! is.null(status)) {
# If there's an error page, there're no further articles. Skip this loop and begin with next month
cat("Letzte Seite erreicht:", url, "\n")
break
}
# Page is valid, now split in single articles and search for countries in each title and teaser
headline <- xpathSApply(url_parsed, "//div[@class='search-teaser']//span[@class='headline']", xmlValue)
teaser <- xpathSApply(url_parsed, "//div[@class='search-teaser']/p", xmlValue)
url_arts <- xpathSApply(url_parsed, "//div[@class='search-teaser']/a", xmlGetAttr, "href")
url_short <- ""
# Combine headline and teaser to make it easier to search
teaser <- str_c(headline, teaser, sep=" ")
# No teasers on a non-error page: log it (probably a server-side 500 page)
# and move on to the next result page
if (length(teaser) == 0) {
errormsg <- str_c("Probably 500 error at: ", tspan,"-",page)
write(errormsg, "scraping-errors.log", append = TRUE)
rm(errormsg)
}
else {
# Analyse every single teaser/headline combination
for (t in 1:length(teaser)) {
yet <- "" # Did the country already appear in the article? Empty with each loop
string <- teaser[t]
# Check every country in the list against this article's text
for (c in 1:nrow(cl)) {
name <- as.character(cl$name[c]) # Name of the county to detect in the teaser
status <- str_detect(tolower(string), tolower(name)) # Does the country's name appear in the teaser?
if (status) { # yes
# getCode() (project helper): maps the row index to a country code
code <- getCode(c, cl$code)
cat("The string contains news from:", code, "\n")
# We only want to count a country once even if it appears multiple times in an article
already <- str_detect(yet, code) # Did the country already appear?
if (!already) { # no
yet <- str_c(yet, code, sep=" ")
cl[c , tspan] <- cl[c , tspan] + 1 # Count +1 to the number of appearances in the data frame
# Save headlines + links to a different data frame
# Shorten the article URL: path up to the last "/" plus the "a-<id>.html" part
url_short[t] <- str_extract(url_arts[t], ".+/")
url_short[t] <- str_c(url_short[t], str_extract(url_arts[t], "a\\-\\d+\\.html"))
new_headline_entry <- str_c(headline[t], " (", url_short[t], ")")
# Cell still holds its initial 0 -> first headline for this country/month;
# otherwise append on a new line
if (headlines[c , tspan] == 0) {
headlines[c , tspan] <- new_headline_entry
}
else {
headlines[c , tspan] <- str_c(headlines[c , tspan], "\n", new_headline_entry)
}
}
rm(code, already)
stat_news_pos <- stat_news_pos + 1
}
rm(c, name, status)
}
rm(t, yet, string)
stat_news_all <- stat_news_all +1
}
}
# Go to the next page
page <- page + 1
stat_pages <- stat_pages + 1
}
# Clean up loop-local objects (rm() only warns, not errors, for any that
# were never created this iteration, e.g. new_headline_entry with no match)
rm(i, tspan, days, baseurl, page, url, url_parsed, status, teaser, headline, dest, url_arts, url_short, new_headline_entry)
# Backup all data after each month
write.csv(cl, "backup_cl.csv")
write.csv(headlines, "backup_headlines.csv")
save(cl, file="backup_cl.RData")
save(headlines, file="backup_headlines.RData")
}
# End of huge for-loop
# Final backup: persist each frame both as CSV (human-readable) and as
# RData (exact R objects, loadable to skip the scraping step)
write.csv(cl, "backup_cl.csv")
save(cl, file = "backup_cl.RData")
write.csv(headlines, "backup_headlines.csv")
save(headlines, file = "backup_headlines.RData")