Files
uni-surprising-newsfocus/R-Code/newsfokus-functions.R

128 lines
3.0 KiB
R
Raw Normal View History

2014-11-28 18:05:12 +01:00
require(stringr)
require(RCurl)
# Download a page once and parse the local copy.
#
# url:  address handed to getURL().
# dest: path of the local file; when it already exists nothing is downloaded.
#
# Returns the result of htmlParse() on `dest`, read as UTF-8.
# NOTE(review): `handle` is not defined inside this function, so it must exist
# in an enclosing environment (e.g. as a global) before the first download.
# htmlParse() is not provided by the packages loaded at the top of this file
# (presumably it comes from the XML package -- confirm).
dlParsePage <- function(url, dest) {
  cached <- file.exists(dest)
  if (!cached) {
    # Fetch the raw page text and store it so later calls reuse the file.
    page_text <- getURL(url, curl = handle)
    write(page_text, dest)
  }
  htmlParse(dest, encoding = "UTF-8")
}
# Calculate the number of days in the month encoded in a column name.
# (Important for manipulating the search URL: with a too large time span the
# search results would contain articles of the NEXT month.)
#
# i:  position of the column in `df` whose name is inspected.
# df: object with names that start with the month number and contain a
#     four-digit year (as required by the patterns below).
#
# Returns the day count of that month as a plain number.
# Stops with an error when the name is missing or has no usable month/year.
getDays <- function(i, df) {
  cur_tspan <- names(df)[i]
  if (length(cur_tspan) != 1L || is.na(cur_tspan)) {
    stop("getDays: no single column name at the requested position",
         call. = FALSE)
  }

  # First match of `pattern` in the column name as a number, NA if absent.
  # "\\b" is used for the word boundary because "\\>" is only understood by
  # some regex engines and is read as a literal ">" by others.
  first_number <- function(pattern) {
    hit <- regmatches(cur_tspan, regexpr(pattern, cur_tspan, perl = TRUE))
    if (length(hit) == 0L) NA_real_ else as.numeric(hit)
  }
  cur_m <- first_number("^\\d+\\b")
  cur_y <- first_number("\\d{4}")
  if (is.na(cur_m) || is.na(cur_y) || cur_m < 1 || cur_m > 12) {
    stop("getDays: cannot read month and year from column name '",
         cur_tspan, "'", call. = FALSE)
  }

  # First day of the following month; December rolls over into next January.
  if (cur_m == 12) {
    nex_y <- cur_y + 1
    nex_m <- 1
  } else {
    nex_y <- cur_y
    nex_m <- cur_m + 1
  }

  cur_date <- as.Date(sprintf("%d-%02d-01", as.integer(cur_y),
                              as.integer(cur_m)))
  nex_date <- as.Date(sprintf("%d-%02d-01", as.integer(nex_y),
                              as.integer(nex_m)))

  # Fix the unit explicitly so the number is always a count of days.
  as.numeric(difftime(nex_date, cur_date, units = "days"))
}
# Return the i-th entry of `df` as a character string.
#
# i:  index used to subset `df` with single brackets.
# df: object holding the codes.
#
# Namibia's code "NA" shows up as a missing value in the data, so a missing
# entry is turned back into the string "NA".
getCode <- function(i, df) {
  code <- as.character(df[i])
  if (is.na(code)) {
    return("NA")
  }
  code
}
# Turn a missing value back into the string "NA" (Namibia's code).
#
# x: a single value.
#
# Returns `x` unchanged unless it is NA, in which case "NA" is returned.
namibiaBug <- function(x) {
  if (!is.na(x)) {
    return(x)
  }
  "NA"
}
# Print all headlines for a specific country and a specific month.
#
# df:         data frame with one row per entry.
# countrycol: name or index of the column holding the country.
# descountry: the desired country; NA is treated as "NA" (Namibia).
# month:      name or index of the column holding that month's headlines.
#
# The matching headlines are written to the console with cat(), separated by
# spaces; nothing is printed when no row matches or `df` has no rows.
# The function is called for this side effect only.
getHeadlines <- function(df, countrycol, descountry, month) {
  # Missing values stand for Namibia's code "NA" (same rule as namibiaBug()),
  # applied once here instead of once per row.
  if (is.na(descountry)) {
    descountry <- "NA"
  }
  countries <- as.character(df[[countrycol]])
  countries[is.na(countries)] <- "NA"

  # Select the rows of the desired country in one step; this also works for a
  # data frame without rows, where a 1:nrow(df) loop would misbehave.
  news <- as.character(df[[month]])[countries == descountry]
  cat(news)
}
# Remove months with 0 surprising newsfocuses.
#
# df:     data frame with one column per month.
# mincol: index of the first month column to check.
# maxcol: index of the last month column to check.
#
# Returns `df` without every column whose name belongs to a checked column
# with a total of 0. A data frame without rows has a total of 0 everywhere,
# so all checked columns are dropped. A column whose total is NA cannot be
# shown to be empty and is kept.
removeZeroMonths <- function(df, mincol, maxcol) {
  month_cols <- mincol:maxcol

  # One total per checked column, computed on the whole column at once.
  totals <- vapply(
    month_cols,
    function(col) sum(as.numeric(df[[col]])),
    numeric(1)
  )

  # which() ignores NA totals, so they never count as "zero".
  zero_months <- names(df)[month_cols[which(totals == 0)]]
  df[!names(df) %in% zero_months]
}
# Get countries which produced newsfocuses in a given month.
#
# df:    data frame with a "name" column and one column per month.
# month: exact name of the month column to inspect.
#
# Returns a character vector with the names of all rows whose value in
# `month` is greater than 0, or NULL when there is none. When `month` is not
# a column of `df`, a message is printed and NULL is returned invisibly.
getFocusCountries <- function(df, month) {
  # Compare the name exactly. Matching column names as regular expressions
  # would accept e.g. "21-2014" just because a column "1-2014" exists.
  if (!(month %in% names(df))) {
    cat(month, "isn't a valid column in given dataframe")
    return(invisible(NULL))
  }

  # which() skips missing values and copes with a data frame without rows.
  hits <- which(df[[month]] > 0)
  if (length(hits) == 0L) {
    return(NULL)
  }
  as.character(df[["name"]][hits])
}
# Get average development of news over a span of years.
#
# df:       data frame with a code column and one "<year>-averg" column for
#           every year in `yearspan`.
# codecol:  name or index of the column holding the codes.
# code:     the code to look up; NA is treated as "NA" (Namibia).
# yearspan: vector of years; each is combined with "-averg" to form the
#           column name that is read.
#
# Returns a data frame with the columns `year` and `averg`. The values are
# listed row by row, so several matching rows repeat the years. When no row
# matches, `averg` is NA for every year.
getAverages <- function(df, codecol, code, yearspan) {
  # Missing values stand for Namibia's code "NA" (same rule as namibiaBug()),
  # applied once here instead of once per row.
  if (is.na(code)) {
    code <- "NA"
  }
  codes <- as.character(df[[codecol]])
  codes[is.na(codes)] <- "NA"
  rows <- which(codes == code)

  year_cols <- paste0(yearspan, "-averg")

  # For every matching row read the yearly values in the order of `yearspan`.
  averg <- unlist(lapply(rows, function(r) {
    vapply(
      year_cols,
      function(col) as.numeric(df[r, col]),
      numeric(1),
      USE.NAMES = FALSE
    )
  }))

  # Keep the result shape stable when nothing matched.
  if (is.null(averg)) {
    averg <- rep(NA_real_, length(yearspan))
  }
  data.frame(year = yearspan, averg = averg)
}