# Dependencies: XML + RCurl for downloading/parsing search pages, stringr for
# string handling. library() errors immediately if a package is missing;
# require() would only return FALSE and let the script fail much later.
library(XML)
library(RCurl)
library(stringr)

# NOTE(review): hard-coded working directory — adjust for your machine.
setwd("~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus/")

# Import manually edited country list (semicolon-separated, no header: code;name)
cl <- read.csv("newsfokus-countrylist.txt", sep = ";", header = FALSE)
names(cl) <- c("code", "name")

# FUNCTIONS ---------------------------------------------------------------
# Provides the helpers used by the scraping loop below
# (getDays(), dlParsePage(), getCode()).
source("newsfokus-functions.R")

# PREPARATIONS ------------------------------------------------------------

# Generate all "month.year" combinations since January 2000, month varying
# fastest: "1.2000", "2.2000", ..., "12.2000", "1.2001", ..., "12.2014".
# Vectorized via outer() instead of growing the vector inside a nested loop.
tspans <- as.vector(outer(1:12, 2000:2014, function(m, y) str_c(m, ".", y)))

# # All combinations mmmYYYY (not used yet)
# years <- 2000:2014 # 15
# months <- month.abb
# no <- 0
# comb <- NULL
# for (m in 1:12) {
#   for (y in 1:15) {
#     no <- no + 1
#     comb[no] <- str_c(months[m], years[y])
#   }
# }
# rm(years, months, m, y, no)

# Remove future/incomplete months. Declared as character strings so the
# comparison against the character vector tspans is exact — the previous
# numeric literals (9.2014, 10.2014, ...) only matched via the accident of
# default numeric-to-character formatting.
removemonths <- c("9.2014", "10.2014", "11.2014", "12.2014")
tspans <- tspans[! tspans %in% removemonths]
rm(removemonths)

# Create one counter column per month, initialized to 0; column names are the
# "month.year" strings themselves (sprintf("%s", x) was an identity call).
cl[tspans] <- 0

# Copy data frame for headlines: same code/name/month layout, but the month
# cells will hold newline-separated headline strings instead of counts.
headlines <- cl

# Set curl handle for friendly scraping: identify ourselves to the server via
# From and User-Agent headers.
handle <- getCurlHandle(httpheader = list(from = "max.mehl@uni.kn",
                                          'user-agent' = str_c(R.version$version.string)))

# SCRAPING ALL THE NEWS \o/ ----------------------------------------------
# The procedure is as follows:
# 1. Build the URL for each timespan and repeat for every existing results
#    page (usually more than 100)
#    - The search page shows an error message once no more results exist
# 2. Download the page and parse it
#    - Skipped if the file already exists, to save bandwidth
# 3. Search for occurrences of country names
# 3a.
# 3a. If yes, increment the matching entry in the data frame by 1
#     - a country is only counted once per article
#       (some country codes have several alternative spellings)

# Running counters: all articles inspected, country hits, and result pages
# processed.
stat_news_all <- 0
stat_news_pos <- 0
stat_pages <- 0

# This loop does the scraping, searching and indexing of each month and country.
# !!! Beware: It takes around 24 hours to finish for 1.2000 - 8.2014! Load
# backup_cl.RData (and optionally backup_headlines.RData) to skip this step.
# Columns 1-2 of cl are "code"/"name"; every column from 3 onward is one month.
for (i in 3:ncol(cl)) {
  tspan <- names(cl)[i]  # month label, e.g. "1.2000"

  # Create folder for downloaded HTML files
  dir.create(str_c("materials/", tspan), showWarnings = FALSE, recursive = TRUE)

  # Last day of this month — helper from newsfokus-functions.R;
  # presumably derives it from the column name, verify there.
  days <- getDays(i, cl)

  # Generate the month's base url: search over the full month (1st to last
  # day); the page number is appended inside the repeat loop below.
  baseurl <- str_c("http://www.spiegel.de/suche/index.html?suchbegriff=+&quellenGroup=SPOX&suchbereich=kopftext&fromDate=1.", tspan, "&toDate=", days, ".", tspan, "&offsets=999999&pageNumber=")

  # In every loop we start with page 1 again
  page <- 1

  # Now expand the URL by the page number UNTIL there's an error page
  repeat {
    cat("\n--> Processing page", page, "of timespan", tspan, "\n")
    url <- str_c(baseurl, page)
    dest <- str_c("materials/", tspan, "/", tspan, "-", page, ".html")

    # Download (or reuse an already-downloaded file) and parse the HTML;
    # helper defined in newsfokus-functions.R.
    url_parsed <- dlParsePage(url, dest)

    # The results page shows this <h3> when there are no (more) hits
    # ("Ihre Suche ergab keinen Treffer" = "your search returned no results").
    status <- unlist(xpathSApply(url_parsed, "//h3[contains(text(), 'Ihre Suche ergab keinen Treffer')]"))
    if (! is.null(status)) {
      # If there's an error page, there are no further articles.
      # Leave this repeat loop and begin with the next month.
      # ("Letzte Seite erreicht" = "last page reached")
      cat("Letzte Seite erreicht:", url, "\n")
      break
    }

    # Page is valid, now split it into single articles and search for
    # countries in each title and teaser.
    headline <- xpathSApply(url_parsed, "//div[@class='search-teaser']//span[@class='headline']", xmlValue)
    teaser <- xpathSApply(url_parsed, "//div[@class='search-teaser']/p", xmlValue)
    url_arts <- xpathSApply(url_parsed, "//div[@class='search-teaser']/a", xmlGetAttr, "href")
    url_short <- ""

    # Combine headline and teaser so one str_detect() covers both
    teaser <- str_c(headline, teaser, sep = " ")

    if (length(teaser) == 0) {
      # No teasers despite no "no hits" marker: most likely a server error.
      # Log it and carry on with the next page.
      errormsg <- str_c("Probably 500 error at: ", tspan, "-", page)
      write(errormsg, "scraping-errors.log", append = TRUE)
      rm(errormsg)
    } else {
      # Analyse every single teaser/headline combination
      for (t in 1:length(teaser)) {
        yet <- ""  # codes already counted for this article; reset per article
        string <- teaser[t]
        for (c in 1:nrow(cl)) {
          # NOTE(review): loop variable `c` shadows base::c() inside this body.
          name <- as.character(cl$name[c])  # country name to detect in the teaser
          status <- str_detect(tolower(string), tolower(name))  # case-insensitive match?
          if (status) {  # yes
            code <- getCode(c, cl$code)  # country code for row c (helper)
            cat("The string contains news from:", code, "\n")
            # We only want to count a country once even if it appears
            # multiple times in an article
            already <- str_detect(yet, code)  # Did the country already appear?
            if (!already) {  # no
              yet <- str_c(yet, code, sep = " ")
              cl[c , tspan] <- cl[c , tspan] + 1  # count +1 appearance in the data frame

              # Save headlines + links to a different data frame: shorten the
              # article URL to "<path>/a-<id>.html" before storing it.
              url_short[t] <- str_extract(url_arts[t], ".+/")
              url_short[t] <- str_c(url_short[t], str_extract(url_arts[t], "a\\-\\d+\\.html"))
              new_headline_entry <- str_c(headline[t], " (", url_short[t], ")")
              # Cell still holds its initial 0 -> first entry for this
              # country/month; otherwise append on a new line.
              if (headlines[c , tspan] == 0) {
                headlines[c , tspan] <- new_headline_entry
              } else {
                headlines[c , tspan] <- str_c(headlines[c , tspan], "\n", new_headline_entry)
              }
            }
            rm(code, already)
            stat_news_pos <- stat_news_pos + 1
          }
          rm(c, name, status)
        }
        rm(t, yet, string)
        stat_news_all <- stat_news_all + 1
      }
    }
    # Go to the next page
    page <- page + 1
    stat_pages <- stat_pages + 1
  }

  # NOTE(review): this rm() warns for any month without a single country hit,
  # because new_headline_entry is then never created.
  rm(i, tspan, days, baseurl, page, url, url_parsed, status, teaser, headline, dest, url_arts, url_short, new_headline_entry)

  # Backup all data after each month
  write.csv(cl, "backup_cl.csv")
  write.csv(headlines, "backup_headlines.csv")
  save(cl, file = "backup_cl.RData")
  save(headlines, file = "backup_headlines.RData")
}  # End of huge for-loop

# Final Backup
write.csv(cl, "backup_cl.csv")
write.csv(headlines, "backup_headlines.csv")
save(cl, file = "backup_cl.RData")
save(headlines, file = "backup_headlines.RData")