47 lines
1.4 KiB
R
47 lines
1.4 KiB
R
require(stringr)
|
|
require(dplyr)
|
|
require(XML)
|
|
|
|
# Work inside the project folder so that relative file paths used later in
# the script (the exported country list) resolve there.
# NOTE(review): hard-coded, machine-specific path -- adjust before running
# this script on another machine.
setwd(dir = "~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus")

# Read the ISO 3166-1 code list page from German Wikipedia and take the first
# element of the list of tables returned by readHTMLTable().
clist_url <- "http://de.wikipedia.org/wiki/ISO-3166-1-Kodierliste"
clist_tbl <- readHTMLTable(clist_url, encoding = "UTF-8")[[1]]

# Assign short column names; this assumes the table has exactly eight columns
# in this order -- TODO confirm against the current page layout.
names(clist_tbl) <- c(
  "name", "code", "code-3", "code-num", "tld", "ioc", "code-2", "un"
)

# Keep only the code and the name, both as plain character columns.
# The generic as.character() is used instead of calling the S3 method
# as.character.factor() directly, so the conversion is correct whether
# readHTMLTable() delivered factor or character columns.
clist <- data.frame(clist_tbl, stringsAsFactors = FALSE)[c("code", "name")]
clist$code <- as.character(clist$code)
clist$name <- as.character(clist$name)
|
|
|
|
# Clean the table, step 1: remove headline rows, i.e. every row whose name
# contains the "!!" marker (annotations and brackets are stripped further
# below).
#
# This is done as one vectorised subset. Deleting rows one at a time inside
# `for (i in 1:nrow(clist))` shifts the remaining rows up, so the row that
# follows a deleted one is never inspected, and the loop finally indexes past
# the end of the shrunken table, where the NA name makes `if ()` fail.
# grepl(fixed = TRUE) yields FALSE for NA names, so such rows are kept.
headline_rows <- grepl("!!", as.character(clist$name), fixed = TRUE)
clist <- clist[!headline_rows, , drop = FALSE]
rm(headline_rows)
|
|
|
|
# Strip annotations from a character vector.
#
# For each element the following is removed, in this order, first match only:
#   1. a span from the first "[" to the last "]" (greedy),
#   2. a span from the first "(" to the last ")" (greedy),
#   3. everything up to and including the last "!" that has at least one
#      character in front of it.
# Finally, leading and trailing whitespace is trimmed.
#
# string: character vector to clean.
# Returns a character vector of the same length as `string`.
cleanBrackets <- function(string) {
  removal_patterns <- c("\\[.*\\]", "\\(.*\\)", ".+!")
  for (pattern in removal_patterns) {
    string <- stringr::str_replace(string, pattern, "")
  }
  stringr::str_trim(string)
}
|
|
|
|
# Clean the table, step 2: strip annotations and brackets from both columns.
# cleanBrackets() is built on vectorised stringr functions, so each column is
# cleaned in a single call. This also handles a table with zero rows, where
# `1:nrow(clist)` would iterate over c(1, 0) and fail on assignment.
clist$name <- cleanBrackets(as.character(clist$name))
clist$code <- cleanBrackets(as.character(clist$code))
|
|
|
|
# Export the cleaned country list as a semicolon-separated text file,
# without row names and without a header line.
write.table(
  x = clist,
  file = "newsfokus-countrylist.txt",
  sep = ";",
  row.names = FALSE,
  col.names = FALSE
)

# This exported country list is still edited by hand afterwards; see
# "newsfokus-countrylist-manualchanges.txt".
|