# File: uni-surprising-newsfocus/R-Code/newsfokus-1-countrylist.R
# (47 lines, 1.4 KiB, R; timestamp shown in the repository view: 2014-11-28 18:05:12 +01:00)
require(stringr)
require(dplyr)
require(XML)
# NOTE(review): setwd() ties this script to one machine. It is kept because the
# export at the end of the script writes to a path relative to this directory.
setwd(dir="~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus")

# Scrape the ISO 3166-1 code list from German Wikipedia and take the first
# table that readHTMLTable() returns for the page.
# NOTE(review): plain-http URL; confirm readHTMLTable() can still fetch it.
clist_url <- "http://de.wikipedia.org/wiki/ISO-3166-1-Kodierliste"
clist_tbl <- readHTMLTable(clist_url, encoding = "UTF-8")[[1]]

# Positional column names; this assumes the table has exactly these eight
# columns in this order (TODO confirm against the live page).
names(clist_tbl) <- c("name", "code", "code-3", "code-num", "tld", "ioc", "code-2", "un")

# Keep only the code and the country name, in that order.
clist <- data.frame(clist_tbl, stringsAsFactors = FALSE)[c("code", "name")]

# Force plain character columns. The as.character() generic accepts factor and
# character input alike, whereas calling the as.character.factor() method
# directly fails when the scraped columns are not factors.
clist$code <- as.character(clist$code)
clist$name <- as.character(clist$name)
# Cleaning table from headlines, annotations and brackets.
# Step 1: drop headline rows, i.e. every row whose name contains "!!".
# This is done with a single logical mask instead of deleting rows inside an
# index loop: deleting row i while iterating shifts the following row into
# slot i (so it is never inspected) and eventually indexes past the end of the
# shrunken table. grepl() returns FALSE for NA names, so such rows are kept
# rather than aborting the script.
is_headline <- grepl("!!", as.character(clist$name), fixed = TRUE)
clist <- clist[!is_headline, , drop = FALSE]
rm(is_headline)
# Strip annotations from table cell text.
#
# Applies three removals in order, each affecting only the first match:
#   1. a square-bracket group "[...]",
#   2. a round-bracket group "(...)",
#   3. everything up to and including the last "!" of the string.
# The patterns are greedy, so a bracket match runs from the first opening
# bracket to the last closing bracket. Surrounding whitespace is trimmed last.
#
# string: character vector (or single string) to clean.
# Returns the cleaned character vector, same length as the input.
cleanBrackets <- function(string) {
  removal_patterns <- c("\\[.*\\]", "\\(.*\\)", ".+!")
  for (pattern in removal_patterns) {
    string <- str_replace(string, pattern, "")
  }
  str_trim(string)
}
# Step 2: strip annotations and brackets from both columns.
# cleanBrackets() consists of vectorised stringr calls, so each column is
# passed as a whole. This also covers an empty table, where iterating over
# 1:nrow(clist) would run with the indices 1 and 0 and fail on assignment.
clist$name <- cleanBrackets(as.character(clist$name))
clist$code <- cleanBrackets(as.character(clist$code))

# Export as semicolon-separated text without header line or row names.
write.table(clist, "newsfokus-countrylist.txt", sep=";", row.names=FALSE, col.names=FALSE)
# This exported country list is still edited by hand afterwards, see "newsfokus-countrylist-manualchanges.txt"