require(stringr) require(RCurl) dlParsePage <- function(url, dest) { if (!file.exists(dest)) { url_curl <- getURL(url, curl=handle) write(url_curl, dest) } parsed <- htmlParse(dest, encoding = "UTF-8") return(parsed) } # Calculate the number of days in a specific month # (Important for manipulating the search URL because otherwise there would be articles # of the NEXT month in the search results if we have a too large time span) getDays <- function(i, df) { cur_tspan <- names(df)[i] cur_m <- as.numeric(str_extract(cur_tspan, "^\\d+\\>")) cur_y <- as.numeric(str_extract(cur_tspan, "\\d{4}")) if (cur_m == 12) { nex_y <- cur_y + 1 nex_m <- 1 } else { nex_y <- cur_y nex_m <- cur_m +1 } cur_date <- str_c(cur_y, cur_m, "01", sep = "-") nex_date <- str_c(nex_y, nex_m, "01", sep = "-") days <- as.numeric(difftime(as.Date(nex_date), as.Date(cur_date))) return(days) } getCode <- function(i, df) { # There's a bug with appearances of Namibia, producing NAs instead of "NA" if (! is.na(as.character(df[i]))) { code <- as.character(df[i]) # Get the country's code } else { code <- "NA" } return(code) } # NAs get converted into "NA" for Namibia namibiaBug <- function(x) { if (is.na(x)) {x <- "NA"} return(x) } # Get all headlines for a specific country and a specific month getHeadlines <- function(df, countrycol, descountry, month) { no <- 0 news <- NULL for (r in 1:nrow(df)) { descountry <- namibiaBug(descountry) curcountry <- namibiaBug(as.character(df[r,countrycol])) if (curcountry == descountry) { no <- no + 1 news[no] <- as.character(df[r,month]) } } cat(news) } # Remove months with 0 surprising newsfocuses removeZeroMonths <- function(df, mincol, maxcol) { delmonthno <- 0 delmonth <- NULL for (c in mincol:maxcol) { month <- names(df)[c] no <- 0 for (r in 1:nrow(df)) { no <- no + as.numeric(df[r,c]) } if (no == 0) { delmonthno <- delmonthno + 1 delmonth[delmonthno] <- month } } return(df [! names(df) %in% delmonth]) } # Get countries which produced newsfocuses in a given month getFocusCountries <- function(df, month) { exists <- FALSE for (c in 1:ncol(df)) { status <- str_detect(month, names(df)[c]) if (status) { exists <- TRUE } } if (!exists) { return(cat(month, "isn't a valid column in given dataframe")) } no <- 0 countries <- NULL for (r in 1:nrow(df)) { supfoc <- df[r,month] if (supfoc > 0) { no <- no + 1 countries[no] <- as.character(df[r,"name"]) } } return(countries) } # Get average development of news over a span of years getAverages <- function(df, codecol, code, yearspan) { no <- 0 averg <- NULL for (r in 1:nrow(df)) { code <- namibiaBug(code) curcode <- namibiaBug(df[r,codecol]) if (curcode == code) { for (y in yearspan) { no <- no + 1 curcol <- str_c(y,"-averg") averg[no] <- as.numeric(df[r,curcol]) } } } return(data.frame(year=yearspan, averg=averg)) }