128 lines
3.0 KiB
R
128 lines
3.0 KiB
R
require(stringr)
|
|
require(RCurl)
|
|
|
|
|
|
# Download a page once and parse the locally stored copy.
#
# url:  address passed to getURL() when no local copy exists yet.
# dest: path of the local copy; an existing file is reused as-is and nothing
#       is downloaded.
#
# Returns whatever htmlParse() yields for `dest`.
#
# NOTE(review): `handle` is not defined inside this function, so it must exist
# in an enclosing environment when a download happens - confirm where it is
# created. htmlParse() is presumably XML::htmlParse; verify that package is
# attached elsewhere.
dlParsePage <- function(url, dest) {
  already_cached <- file.exists(dest)
  if (!already_cached) {
    # Fetch the raw page text and store it at `dest`.
    write(getURL(url, curl = handle), dest)
  }
  htmlParse(dest, encoding = "UTF-8")
}
|
|
|
|
# Calculate the number of days in a specific month
|
|
# (Important when building the search URL: if the time span is too long,
# articles from the NEXT month would show up in the search results.)
|
|
# Number of days in the month encoded in the name of column `i` of `df`.
#
# The column name must start with the month number (1-12), followed by a
# non-word character, and must contain a four-digit year, e.g. "2-2012".
#
# i:  index of the column whose name holds the month and year.
# df: object with names; only names(df) is used.
#
# Returns the number of days in that month as a numeric scalar.
# Stops with an error if the name does not contain a usable month and year.
getDays <- function(i, df) {
  cur_tspan <- names(df)[i]
  if (length(cur_tspan) != 1 || is.na(cur_tspan)) {
    stop("no column name at index ", i, call. = FALSE)
  }

  # "\\b" rather than "\\>": the latter is a word boundary only in POSIX/TRE
  # regexes. ICU (used by current stringr) reads it as a literal ">", so the
  # month was never found.
  month_hit <- regmatches(
    cur_tspan, regexpr("^\\d+\\b", cur_tspan, perl = TRUE)
  )
  year_hit <- regmatches(
    cur_tspan, regexpr("\\d{4}", cur_tspan, perl = TRUE)
  )
  if (length(month_hit) != 1 || length(year_hit) != 1) {
    stop("cannot read month and year from column name '", cur_tspan, "'",
         call. = FALSE)
  }

  cur_m <- as.integer(month_hit)
  cur_y <- as.integer(year_hit)
  if (is.na(cur_m) || cur_m < 1L || cur_m > 12L) {
    stop("invalid month in column name '", cur_tspan, "'", call. = FALSE)
  }

  cur_date <- as.Date(sprintf("%04d-%02d-01", cur_y, cur_m))
  # Stepping one month ahead also handles the December -> January rollover.
  nex_date <- seq(cur_date, by = "month", length.out = 2L)[2L]

  as.numeric(difftime(nex_date, cur_date, units = "days"))
}
|
|
|
|
# Return the entry at position `i` of `df` as a character string.
#
# i:  position to look up.
# df: object indexed with df[i].
#
# A missing entry is returned as the string "NA" instead of a real NA.
getCode <- function(i, df) {
  entry <- as.character(df[i])
  # There's a bug with appearances of Namibia, producing NAs instead of "NA"
  if (is.na(entry)) {
    return("NA")
  }
  # Otherwise this is the country's code.
  entry
}
|
|
|
|
# NAs get converted into "NA" for Namibia
|
|
# Replace a missing value with the string "NA"; anything else is returned
# unchanged.
#
# x: single value to check.
namibiaBug <- function(x) {
  if (!is.na(x)) {
    return(x)
  }
  "NA"
}
|
|
|
|
|
|
# Get all headlines for a specific country and a specific month
|
|
# Print all entries of column `month` for the rows whose country matches
# `descountry`.
#
# df:         data frame to search.
# countrycol: name or index of the single column holding the country.
# descountry: country to look for; a missing value is treated as "NA".
# month:      name or index of the single column whose entries are printed.
#
# The matching entries are written with cat(), separated by spaces. Nothing
# is printed when no row matches or when `df` has no rows. Returns NULL
# invisibly (the result of cat()).
getHeadlines <- function(df, countrycol, descountry, month) {
  # Missing values stand for the string "NA" (Namibia), both in the requested
  # country and in the country column.
  if (is.na(descountry)) {
    descountry <- "NA"
  }
  countries <- as.character(df[[countrycol]])
  countries[is.na(countries)] <- "NA"

  # Vectorised selection; this also copes with a zero-row data frame, where
  # looping over 1:nrow(df) would visit the non-existent rows 1 and 0.
  news <- as.character(df[[month]])[countries == descountry]
  cat(news)
}
|
|
|
|
|
|
# Remove months with 0 surprising newsfocuses
|
|
# Drop every column in positions mincol..maxcol whose values sum to zero.
#
# df:     data frame to filter.
# mincol: position of the first column to check.
# maxcol: position of the last column to check.
#
# Values are converted with as.numeric() before summing. Missing values are
# ignored, so a column holding only zeros and NAs counts as empty; the loop
# version stopped with an error on the first NA. A data frame without rows
# loses every checked column.
#
# Returns `df` without the empty columns; columns outside the range are
# always kept.
removeZeroMonths <- function(df, mincol, maxcol) {
  cols <- mincol:maxcol
  is_zero <- vapply(
    cols,
    function(j) sum(as.numeric(df[[j]]), na.rm = TRUE) == 0,
    logical(1)
  )
  # Drop by position so that only the checked columns can be removed.
  df[setdiff(seq_along(df), cols[is_zero])]
}
|
|
|
|
# Get countries which produced newsfocuses in a given month
|
|
# Names of the countries with a value greater than zero in column `month`.
#
# df:    data frame with a "name" column and one column per month.
# month: exact name of the month column to inspect.
#
# Returns a character vector taken from df$name, or NULL when no row
# qualifies. Rows with a missing value in `month` are skipped. If `month` is
# not a column of `df`, a notice is printed with cat() and NULL is returned.
getFocusCountries <- function(df, month) {
  # Exact name comparison. Matching the column names as regular expressions
  # against `month` accepted e.g. "211-2013" whenever "11-2013" existed, and
  # the later column lookup then failed.
  if (!(month %in% names(df))) {
    return(cat(month, "isn't a valid column in given dataframe"))
  }

  # which() drops NA comparisons and works for a data frame without rows.
  has_focus <- which(df[[month]] > 0)
  if (length(has_focus) == 0) {
    return(NULL)
  }
  as.character(df[["name"]][has_focus])
}
|
|
|
|
# Get average development of news over a span of years
|
|
# Collect the yearly average values of one country.
#
# df:       data frame with a code column and one "<year>-averg" column for
#           every year in `yearspan`.
# codecol:  name or index of the single column holding the country code.
# code:     code to look for; a missing value is treated as "NA".
# yearspan: vector of years; each year y is read from column "y-averg".
#
# Returns a data frame with the columns `year` and `averg`. If several rows
# match, their values follow each other row by row and `year` is recycled.
# If no row matches (or `df` has no rows), `averg` is NA for every year, so
# the result always has both columns.
getAverages <- function(df, codecol, code, yearspan) {
  # Missing values stand for the string "NA" (Namibia), both in the requested
  # code and in the code column.
  if (is.na(code)) {
    code <- "NA"
  }
  codes <- as.character(df[[codecol]])
  codes[is.na(codes)] <- "NA"
  hits <- which(codes == code)

  year_cols <- paste0(yearspan, "-averg")

  if (length(hits) == 0) {
    averg <- rep(NA_real_, length(yearspan))
  } else {
    # Row-major order: all years of the first matching row, then the next.
    averg <- unlist(lapply(hits, function(r) {
      vapply(
        year_cols,
        function(cn) as.numeric(df[r, cn]),
        numeric(1),
        USE.NAMES = FALSE
      )
    }))
  }

  data.frame(year = yearspan, averg = averg)
}
|