# Builds a two-column list (code, name) from the first HTML table of the
# German-language Wikipedia page on the ISO 3166-1 code list, cleans the
# entries and exports the result as a semicolon-separated text file.

# library() fails loudly when a package is missing, whereas require() only
# returns FALSE and lets the script crash later with a confusing error.
library(stringr)
library(dplyr) # not used in the code visible here; kept for other code
library(XML)

# NOTE(review): machine-specific working directory. It determines where the
# export at the end of this script is written, so it is left unchanged.
setwd(dir = "~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus")

# NOTE(review): readHTMLTable() fetches the URL itself; presumably this fails
# if the server redirects to HTTPS -- confirm, and download separately if so.
clist_url <- "http://de.wikipedia.org/wiki/ISO-3166-1-Kodierliste"
clist_tbl <- readHTMLTable(clist_url, encoding = "UTF-8")[[1]]
names(clist_tbl) <- c(
  "name", "code", "code-3", "code-num", "tld", "ioc", "code-2", "un"
)

# Keep only the two columns that are exported, in the order code, name.
clist <- data.frame(clist_tbl, stringsAsFactors = FALSE)
clist <- clist[c("code", "name")]

# as.character() works for both factor and character columns; calling
# as.character.factor() directly errors when the column is not a factor.
clist$code <- as.character(clist$code)
clist$name <- as.character(clist$name)

# Cleaning table from headlines, annotations and brackets ----

# Drop every row whose name contains "!!" (these are the headline rows).
# This is done with one logical mask instead of deleting rows inside a
# `for (i in 1:nrow(clist))` loop: deleting while iterating skipped the row
# following each deletion and then indexed past the end of the table, where
# the NA name made `if (status)` fail. Rows with an NA name are kept.
is_headline <- str_detect(clist$name, "!!")
clist <- clist[!(is_headline %in% TRUE), ]
rm(is_headline)

#' Strip annotations from table cell text.
#'
#' Applies, in this order and to the first match only in each case:
#' removal of a "[...]" span, removal of a "(...)" span, removal of
#' everything up to and including the last "!", then trimming of leading
#' and trailing whitespace.
#'
#' @param string A character vector (the stringr functions used here are
#'   vectorised, so a whole column can be passed at once).
#' @return A character vector of the same length as `string`.
cleanBrackets <- function(string) {
  string <- str_replace(string, "\\[.*\\]", "")
  string <- str_replace(string, "\\(.*\\)", "")
  string <- str_replace(string, ".+!", "")
  str_trim(string)
}

# Clean both columns in one vectorised call each; this also behaves
# correctly for an empty table, where 1:nrow(clist) would iterate over 1, 0.
clist$name <- cleanBrackets(clist$name)
clist$code <- cleanBrackets(clist$code)

write.table(
  clist,
  "newsfokus-countrylist.txt",
  sep = ";",
  row.names = FALSE,
  col.names = FALSE
)
# This exported country list is subsequently edited by hand, see
# "newsfokus-countrylist-manualchanges.txt"