require(stringr)
require(dplyr)
require(XML)

# NOTE(review): machine-specific working directory; the relative output path
# used further down resolves against it.
setwd(dir="~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus")

# Read the page and take the first HTML table found on it.
clist_url <- "http://de.wikipedia.org/wiki/ISO-3166-1-Kodierliste"
clist_tbl <- readHTMLTable(clist_url, encoding = "UTF-8")[[1]]
# Assumes that table has exactly eight columns in this order -- TODO confirm
# against the live page.
names(clist_tbl) <- c("name", "code", "code-3", "code-num", "tld", "ioc", "code-2", "un")

# Keep only the code and the name columns, in that order.
clist <- data.frame(clist_tbl, stringsAsFactors = FALSE)[c("code", "name")]
# Use the generic as.character(): it accepts factor and character columns
# alike, whereas the as.character.factor() method is meant for factors only.
clist$code <- as.character(clist$code)
clist$name <- as.character(clist$name)

# Cleaning table from headlines: drop every row whose name contains "!!".
# This is done with one logical filter instead of deleting rows inside a
# 1:nrow() loop, because deleting while iterating shifts the remaining rows
# (so the row after a deleted one is never checked) and eventually indexes
# past the end of the shrunken table, where the NA name makes `if` fail.
is_headline <- str_detect(as.character(clist$name), "!!")
# `%in% TRUE` maps an NA detection result (NA name) to FALSE, so such rows
# are kept rather than turned into all-NA rows by NA indexing.
clist <- clist[!(is_headline %in% TRUE), ]
rm(is_headline)

# Remove annotation markup from a character vector and trim the result.
#
# string: character vector of raw cell texts.
# Returns a character vector of the same length.
cleanBrackets <- function(string) {
  # Applied in this order; str_replace() removes only the first match of each
  # pattern per element, and the patterns are greedy:
  #   1. from a "[" up to the last "]"
  #   2. from a "(" up to the last ")"
  #   3. everything up to and including the last "!"
  patterns <- c("\\[.*\\]", "\\(.*\\)", ".+!")
  for (pattern in patterns) {
    string <- str_replace(string, pattern, "")
  }
  # Strip leading and trailing whitespace left behind by the removals.
  str_trim(string)
}

# Clean both columns. cleanBrackets() operates on whole character vectors, so
# each column is processed in a single call; unlike a 1:nrow() loop this also
# behaves correctly when the table has no rows.
clist$name <- cleanBrackets(as.character(clist$name))
clist$code <- cleanBrackets(as.character(clist$code))

# Export as a semicolon-separated file without row names or a header line.
write.table(clist, "newsfokus-countrylist.txt", sep=";", row.names=FALSE, col.names=FALSE)

# This exported country list is subsequently edited by hand; see "newsfokus-countrylist-manualchanges.txt"