initial commit v1.0
This commit is contained in:
46
R-Code/newsfokus-1-countrylist.R
Normal file
46
R-Code/newsfokus-1-countrylist.R
Normal file
@@ -0,0 +1,46 @@
|
||||
require(stringr)
|
||||
require(dplyr)
|
||||
require(XML)
|
||||
|
||||
# All relative paths below (in particular the exported country list) are
# resolved against this project directory.
setwd(dir = "~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 2 - Newsfokus")

# Scrape the ISO 3166-1 code list from the German Wikipedia.
# readHTMLTable() returns one entry per HTML table on the page; only the
# first table is used.
clist_url <- "http://de.wikipedia.org/wiki/ISO-3166-1-Kodierliste"
clist_tbl <- readHTMLTable(clist_url, encoding = "UTF-8")[[1]]
names(clist_tbl) <- c("name", "code", "code-3", "code-num", "tld", "ioc", "code-2", "un")

# Keep only the code and the country name, in that column order.
clist <- data.frame(clist_tbl, stringsAsFactors = FALSE)[c("code", "name")]

# Use the generic as.character() rather than calling the S3 method
# as.character.factor() directly: the method fails on columns that are
# already character vectors, while the generic handles both factor and
# character input.
clist$code <- as.character(clist$code)
clist$name <- as.character(clist$name)
|
||||
|
||||
# Cleaning table from headlines, annotations and brackets
|
||||
# Drop headline rows, i.e. rows whose name cell contains "!!".
# This is done with one logical mask instead of deleting rows inside an index
# loop: deleting row i shifts the following rows up, so the row directly after
# a removed one was never inspected, and the loop kept counting up to the
# original row count, indexing past the end of the shortened table.
is_headline <- str_detect(as.character(clist$name), "!!")
# `%in% TRUE` treats an NA match result (missing name) as "not a headline",
# so such rows are kept rather than turned into all-NA rows.
clist <- clist[!(is_headline %in% TRUE), ]
rm(is_headline)
|
||||
|
||||
# Strip annotations from a character vector.
#
# For each element, in this order:
#   1. the first "[...]" span is removed (greedy: first "[" to last "]"),
#   2. the first "(...)" span is removed (greedy: first "(" to last ")"),
#   3. everything up to and including the last "!" is removed,
#   4. leading and trailing whitespace is trimmed.
#
# string: character vector to clean.
# Returns a character vector of the same length as `string`.
cleanBrackets <- function(string) {
  removal_patterns <- c("\\[.*\\]", "\\(.*\\)", ".+!")
  for (pattern in removal_patterns) {
    # str_replace() only replaces the first match of each pattern.
    string <- str_replace(string, pattern, "")
  }
  str_trim(string)
}
|
||||
|
||||
# Clean both columns in one vectorised call each. cleanBrackets() is built
# from vectorised stringr functions, so a per-row loop is unnecessary, and
# unlike `1:nrow(clist)` this also works when the table has zero rows.
clist$name <- cleanBrackets(as.character(clist$name))
clist$code <- cleanBrackets(as.character(clist$code))
|
||||
|
||||
# Export the cleaned code/name pairs to "newsfokus-countrylist.txt" as
# semicolon-separated text, without row names and without a header line.
write.table(clist, "newsfokus-countrylist.txt", sep=";", row.names=FALSE, col.names=FALSE)
|
||||
|
||||
# This exported country list is subsequently edited by hand, see "newsfokus-countrylist-manualchanges.txt"
|
||||
Reference in New Issue
Block a user