initial commit v1.0
This commit is contained in:
127
R-Code/newsfokus-functions.R
Normal file
127
R-Code/newsfokus-functions.R
Normal file
@@ -0,0 +1,127 @@
|
||||
require(stringr)
|
||||
require(RCurl)
|
||||
|
||||
|
||||
dlParsePage <- function(url, dest) {
|
||||
if (!file.exists(dest)) {
|
||||
url_curl <- getURL(url, curl=handle)
|
||||
write(url_curl, dest)
|
||||
}
|
||||
parsed <- htmlParse(dest, encoding = "UTF-8")
|
||||
return(parsed)
|
||||
}
|
||||
|
||||
# Calculate the number of days in a specific month
|
||||
# (Important for manipulating the search URL because otherwise there would be articles
|
||||
# of the NEXT month in the search results if we have a too large time span)
|
||||
getDays <- function(i, df) {
|
||||
cur_tspan <- names(df)[i]
|
||||
cur_m <- as.numeric(str_extract(cur_tspan, "^\\d+\\>"))
|
||||
cur_y <- as.numeric(str_extract(cur_tspan, "\\d{4}"))
|
||||
if (cur_m == 12) {
|
||||
nex_y <- cur_y + 1
|
||||
nex_m <- 1
|
||||
}
|
||||
else {
|
||||
nex_y <- cur_y
|
||||
nex_m <- cur_m +1
|
||||
}
|
||||
cur_date <- str_c(cur_y, cur_m, "01", sep = "-")
|
||||
nex_date <- str_c(nex_y, nex_m, "01", sep = "-")
|
||||
days <- as.numeric(difftime(as.Date(nex_date), as.Date(cur_date)))
|
||||
return(days)
|
||||
}
|
||||
|
||||
getCode <- function(i, df) {
|
||||
# There's a bug with appearances of Namibia, producing NAs instead of "NA"
|
||||
if (! is.na(as.character(df[i]))) {
|
||||
code <- as.character(df[i]) # Get the country's code
|
||||
}
|
||||
else {
|
||||
code <- "NA"
|
||||
}
|
||||
return(code)
|
||||
}
|
||||
|
||||
# NAs get converted into "NA" for Namibia
|
||||
namibiaBug <- function(x) {
|
||||
if (is.na(x)) {x <- "NA"}
|
||||
return(x)
|
||||
}
|
||||
|
||||
|
||||
# Get all headlines for a specific country and a specific month
|
||||
getHeadlines <- function(df, countrycol, descountry, month) {
|
||||
no <- 0
|
||||
news <- NULL
|
||||
for (r in 1:nrow(df)) {
|
||||
descountry <- namibiaBug(descountry)
|
||||
curcountry <- namibiaBug(as.character(df[r,countrycol]))
|
||||
if (curcountry == descountry) {
|
||||
no <- no + 1
|
||||
news[no] <- as.character(df[r,month])
|
||||
}
|
||||
}
|
||||
cat(news)
|
||||
}
|
||||
|
||||
|
||||
# Remove months with 0 surprising newsfocuses
|
||||
removeZeroMonths <- function(df, mincol, maxcol) {
|
||||
delmonthno <- 0
|
||||
delmonth <- NULL
|
||||
for (c in mincol:maxcol) {
|
||||
month <- names(df)[c]
|
||||
no <- 0
|
||||
for (r in 1:nrow(df)) {
|
||||
no <- no + as.numeric(df[r,c])
|
||||
}
|
||||
if (no == 0) {
|
||||
delmonthno <- delmonthno + 1
|
||||
delmonth[delmonthno] <- month
|
||||
}
|
||||
}
|
||||
return(df [! names(df) %in% delmonth])
|
||||
}
|
||||
|
||||
# Get countries which produced newsfocuses in a given month
|
||||
getFocusCountries <- function(df, month) {
|
||||
exists <- FALSE
|
||||
for (c in 1:ncol(df)) {
|
||||
status <- str_detect(month, names(df)[c])
|
||||
if (status) {
|
||||
exists <- TRUE
|
||||
}
|
||||
}
|
||||
if (!exists) {
|
||||
return(cat(month, "isn't a valid column in given dataframe"))
|
||||
}
|
||||
no <- 0
|
||||
countries <- NULL
|
||||
for (r in 1:nrow(df)) {
|
||||
supfoc <- df[r,month]
|
||||
if (supfoc > 0) {
|
||||
no <- no + 1
|
||||
countries[no] <- as.character(df[r,"name"])
|
||||
}
|
||||
}
|
||||
return(countries)
|
||||
}
|
||||
|
||||
# Get average development of news over a span of years
|
||||
getAverages <- function(df, codecol, code, yearspan) {
|
||||
no <- 0
|
||||
averg <- NULL
|
||||
for (r in 1:nrow(df)) {
|
||||
code <- namibiaBug(code)
|
||||
curcode <- namibiaBug(df[r,codecol])
|
||||
if (curcode == code) {
|
||||
for (y in yearspan) {
|
||||
no <- no + 1
|
||||
curcol <- str_c(y,"-averg")
|
||||
averg[no] <- as.numeric(df[r,curcol])
|
||||
}
|
||||
}
|
||||
}
|
||||
return(data.frame(year=yearspan, averg=averg))
|
||||
}
|
||||
Reference in New Issue
Block a user