require(stringr)
require(RCurl)


# Fetch a page once, cache it on disk and return the parsed document.
#
# url:  address of the page to download.
# dest: path of the local cache file; when it already exists nothing is
#       downloaded and the cached copy is parsed instead.
#
# Returns whatever htmlParse() produces for the cached file.
# NOTE(review): `handle` is not defined in this function; presumably it is a
# curl handle created elsewhere in the session -- confirm.
# NOTE(review): htmlParse() is not provided by the packages loaded at the top
# of this file; presumably it comes from the XML package -- confirm.
dlParsePage <- function(url, dest) {
  already_cached <- file.exists(dest)
  if (!already_cached) {
    page_source <- getURL(url, curl = handle)
    write(page_source, dest)
  }
  htmlParse(dest, encoding = "UTF-8")
}

# Calculate the number of days in the month named by a column of df.
# (Important for manipulating the search URL: with too large a time span the
# search results would also contain articles of the NEXT month.)
#
# i:  index of the column whose name encodes the month.
# df: object whose names() start with the month number, followed by a
#     non-word separator, and contain a four-digit year (e.g. "2.2012").
#
# Returns the number of days in that month as a plain numeric.
# Stops with an error when no valid month/year can be read from the name.
getDays <- function(i, df) {
  cur_tspan <- names(df)[i]
  if (length(cur_tspan) != 1 || is.na(cur_tspan)) {
    stop("column ", i, " has no name to read a month from", call. = FALSE)
  }

  # Base regexpr() with perl = TRUE and \b behaves the same regardless of the
  # installed stringr version; the former "\\>" anchor is only a word boundary
  # in some regex engines and a literal ">" in others.
  first_number <- function(pattern) {
    hit <- regmatches(cur_tspan, regexpr(pattern, cur_tspan, perl = TRUE))
    if (length(hit) == 0) NA_real_ else as.numeric(hit)
  }
  cur_m <- first_number("^\\d+\\b")
  cur_y <- first_number("\\d{4}")
  if (is.na(cur_m) || is.na(cur_y) || cur_m < 1 || cur_m > 12) {
    stop("cannot read a month and year from column name '", cur_tspan, "'",
         call. = FALSE)
  }

  # First day of the following month (rolling over into the next year).
  if (cur_m == 12) {
    nex_y <- cur_y + 1
    nex_m <- 1
  } else {
    nex_y <- cur_y
    nex_m <- cur_m + 1
  }

  cur_date <- as.Date(sprintf("%04d-%02d-01", cur_y, cur_m))
  nex_date <- as.Date(sprintf("%04d-%02d-01", nex_y, nex_m))
  # Ask for days explicitly instead of relying on difftime's automatic unit.
  as.numeric(difftime(nex_date, cur_date, units = "days"))
}

# Look up the country code stored at position i of df.
#
# i:  position of the wanted element.
# df: object holding the country codes; it is indexed with df[i].
#
# Returns the code as a character string. Namibia's code "NA" shows up as a
# missing value, so a missing entry is returned as the string "NA".
getCode <- function(i, df) {
  code <- as.character(df[i])
  if (is.na(code)) "NA" else code
}

# Namibia's country code "NA" is read as a missing value; turn a missing
# value back into the string "NA" and pass every other value through as is.
#
# x: a single value (the check is a scalar `if`).
namibiaBug <- function(x) {
  if (!is.na(x)) {
    return(x)
  }
  "NA"
}


# Print all headlines for a specific country and a specific month.
#
# df:         table with one row per entry.
# countrycol: name or index of the column holding the country identifier.
# descountry: the wanted country; a missing value is read as "NA" (Namibia).
# month:      name or index of the column holding the headlines.
#
# The matching headlines are written to the console with cat(), separated by
# spaces, and are also returned invisibly as a character vector. A table
# without rows or without matches prints nothing.
getHeadlines <- function(df, countrycol, descountry, month) {
  # Normalise the target once instead of on every row.
  if (is.na(descountry)) {
    descountry <- "NA"
  }

  # Namibia's "NA" is stored as a missing value; restore it before comparing.
  countries <- as.character(df[, countrycol])
  countries[is.na(countries)] <- "NA"

  # Whole-column comparison also copes with zero-row tables, where looping
  # over 1:nrow(df) would visit the non-existent rows 1 and 0.
  wanted <- which(countries == descountry)
  news <- as.character(df[wanted, month])

  cat(news)
  invisible(news)
}


# Remove months with 0 surprising newsfocuses.
#
# df:     data frame with one column per month.
# mincol: index of the first month column to inspect.
# maxcol: index of the last month column to inspect.
#
# Every column in mincol:maxcol whose values (converted with as.numeric) sum
# to zero is dropped; missing values are ignored when summing, so a month
# containing only zeros and NAs is dropped as well. Columns are removed by
# name, exactly as before. Returns the reduced data frame.
removeZeroMonths <- function(df, mincol, maxcol) {
  inspected <- mincol:maxcol

  # One sum per column; this also works for zero-row data frames, where
  # looping over 1:nrow(df) would visit the non-existent rows 1 and 0.
  totals <- vapply(
    inspected,
    function(j) sum(as.numeric(df[[j]]), na.rm = TRUE),
    numeric(1)
  )

  zero_months <- names(df)[inspected[which(totals == 0)]]
  df[!names(df) %in% zero_months]
}

# Get countries which produced newsfocuses in a given month.
#
# df:    data frame with a "name" column and one column per month.
# month: name of the month column to inspect.
#
# Returns the "name" entries (as character) of all rows whose value in the
# month column is greater than 0, or NULL when there is none. Rows with a
# missing value in the month column are skipped.
# When month is not a column of df a note is printed and NULL is returned
# invisibly.
getFocusCountries <- function(df, month) {
  # Exact name lookup: matching the column names as regular expressions
  # against month would accept names that merely resemble an existing column.
  if (!(length(month) == 1 && month %in% names(df))) {
    cat(month, "isn't a valid column in given dataframe")
    return(invisible(NULL))
  }

  focus <- df[, month]
  has_focus <- !is.na(focus) & focus > 0
  countries <- as.character(df[has_focus, "name"])

  # Keep the historical "nothing found" result.
  if (length(countries) == 0) {
    return(NULL)
  }
  countries
}

# Get average development of news over a span of years.
#
# df:       table with a country-code column and one "<year>-averg" column
#           per year.
# codecol:  name or index of the column holding the country codes.
# code:     the wanted country code; a missing value is read as "NA"
#           (Namibia).
# yearspan: the years to look up; each year y is read from column "y-averg".
#
# Returns a data frame with the columns `year` and `averg`. When several rows
# carry the code their values are appended row after row and `year` is
# recycled. When no row carries the code (including a table without rows)
# `averg` is NA for every year.
getAverages <- function(df, codecol, code, yearspan) {
  # Normalise the target once instead of on every row.
  if (is.na(code)) {
    code <- "NA"
  }

  # Namibia's "NA" is stored as a missing value; restore it before comparing.
  codes <- as.character(df[, codecol])
  codes[is.na(codes)] <- "NA"
  hit_rows <- which(codes == code)

  averg_cols <- vapply(
    yearspan,
    function(y) paste0(y, "-averg"),
    character(1),
    USE.NAMES = FALSE
  )

  if (length(hit_rows) == 0) {
    averg <- rep(NA_real_, length(yearspan))
  } else {
    # Row by row, year by year -- the same order the values were collected in
    # before.
    averg <- unlist(lapply(hit_rows, function(r) {
      vapply(
        averg_cols,
        function(cl) as.numeric(df[r, cl]),
        numeric(1),
        USE.NAMES = FALSE
      )
    }))
  }

  data.frame(year = yearspan, averg = averg)
}