# Reset the per-tweet tag column before (re-)running the matching.
tweets$tags <- ""

# For every day in `issues`, test each tweet of that day against the tags of
# every issue. A hit raises the day/issue counter in `issues`, is appended to
# the tweet's "issue"/"tags" columns and is logged to <id_folder>/<issue>.csv.
for (d in seq_len(nrow(issues))) {
  curdate <- issues$date[d]
  cat(as.character(curdate), "\n")

  # All tweets created on the current day
  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]

  # seq_len() skips days without tweets; 1:nrow() would iterate over c(1, 0)
  for (t in seq_len(nrow(tweets_curday))) {
    # Tweet text with the hashtag indicators (#) removed
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, "#", "")
    curid <- as.character(tweets_curday$id_str[t])

    # Test each single issue (not tag!)
    for (i in seq_along(issuelist)) {
      curtags <- as.character(issuelist[[i]])
      curissue <- names(issuelist)[i]
      curfile <- str_c(id_folder, "/", curissue, ".csv")

      # Test the tags of this issue until the first one matches
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = "chars")

        # Only short tags (<= 4 chars) are checked for being an acronym;
        # smartPatternMatch matches acronyms case-sensitively.
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }

        # smartPatternMatch picks its fuzzy tolerance from the tag length
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
        if (tags_found == 1) {
          # Raise number of findings on this day for this issue by 1
          issues[d, curissue] <- issues[d, curissue] + 1

          # Append issue and first matched tag to the tweet's row(s);
          # the row mask is computed once instead of four times.
          is_curtweet <- tweets[, "id_str"] == curid
          tweets[is_curtweet, "issue"] <-
            str_c(tweets[is_curtweet, "issue"], curissue, ";")
          tweets[is_curtweet, "tags"] <-
            str_c(tweets[is_curtweet, "tags"], curtag, ";")

          # Log the hit for later inspection of the pattern matching
          write(str_c(curdate, ";\"", curid, "\";", curtag), curfile, append = TRUE)

          # One hit per issue and tweet is enough: skip the remaining tags
          break
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange
View(issues)
# MATCH TWEETS ------------------------------------------------------------

# Start from an empty output folder for the per-issue hit logs
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)

# One row per day in `drange`, one zero-initialised counter column per issue
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""

# For every day, test each tweet of that day against the tags of every issue.
# A hit raises the day/issue counter in `issues`, is appended to the tweet's
# "issue"/"tags" columns and is logged to <id_folder>/<issue>.csv.
for (d in seq_len(nrow(issues))) {
  curdate <- issues$date[d]
  cat(as.character(curdate), "\n")

  # All tweets created on the current day
  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]

  # seq_len() skips days without tweets; 1:nrow() would iterate over c(1, 0)
  for (t in seq_len(nrow(tweets_curday))) {
    # Tweet text with the hashtag indicators (#) removed
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, "#", "")
    curid <- as.character(tweets_curday$id_str[t])

    # Test each single issue (not tag!)
    for (i in seq_along(issuelist)) {
      curtags <- as.character(issuelist[[i]])
      curissue <- names(issuelist)[i]
      curfile <- str_c(id_folder, "/", curissue, ".csv")

      # Test the tags of this issue until the first one matches
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = "chars")

        # Only short tags (<= 4 chars) are checked for being an acronym;
        # smartPatternMatch matches acronyms case-sensitively.
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }

        # smartPatternMatch picks its fuzzy tolerance from the tag length
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
        if (tags_found == 1) {
          # Raise number of findings on this day for this issue by 1
          issues[d, curissue] <- issues[d, curissue] + 1

          # Append issue and first matched tag to the tweet's row(s);
          # the row mask is computed once instead of four times.
          is_curtweet <- tweets[, "id_str"] == curid
          tweets[is_curtweet, "issue"] <-
            str_c(tweets[is_curtweet, "issue"], curissue, ";")
          tweets[is_curtweet, "tags"] <-
            str_c(tweets[is_curtweet, "tags"], curtag, ";")

          # Log the hit for later inspection of the pattern matching
          write(str_c(curdate, ";\"", curid, "\";", curtag), curfile, append = TRUE)

          # One hit per issue and tweet is enough: skip the remaining tags
          break
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange
#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found)
View(issues)
# `file` must be passed by name: save(issues, "issues.RData") would treat the
# string as a second object name and stop with "'file' must be specified".
save(issues, file = "issues.RData")
# Prompt the user on the console and hand back what was typed.
#
# question: text shown as the readline() prompt
#
# Returns the entered line as a character string.
readYN <- function(question) {
  answer <- as.character(readline(prompt = question))
  answer
}
# Check a string against a set of issue patterns.
#
# string:    text to test
# issuelist: patterns handed to str_detect() as its `pattern` argument
#
# Returns TRUE as soon as at least one pattern is detected in `string`.
checkIssue <- function(string, issuelist) {
  detected <- str_detect(string, issuelist)
  status <- any(detected)
  status
}
# Validate several issue names at once.
#
# string:    character vector of issue names to check
# issuelist: known issues, passed through to checkIssue()
#
# Returns a logical vector with one entry per element of `string`
# (logical(0) for empty input). Every unknown issue is reported on the console.
checkAllIssues <- function(string, issuelist) {
  # Preallocate instead of growing from NULL
  status <- logical(length(string))
  # seq_along() does nothing for empty input; 1:length() would visit c(1, 0)
  for (i in seq_along(string)) {
    if (checkIssue(string[i], issuelist)) {
      status[i] <- TRUE
    } else {
      cat("Issue",string[i],"does not exist. Please try again.\n")
      status[i] <- FALSE
    }
  }
  status
}
require(stringr)
require(XML)
require(stringr)
require(XML)
# FUNCTIONS ---------------------------------------------------------------
# Ask a question on the console and return the reply.
#
# question: prompt text passed to readline()
#
# Returns the typed line as a character string.
readYN <- function(question) {
  reply <- readline(prompt = question)
  reply <- as.character(reply)
  reply
}
# Test whether `string` is detected by any of the patterns in `issuelist`.
#
# string:    text to test
# issuelist: patterns used as the `pattern` argument of str_detect()
#
# Returns a single logical: TRUE if at least one pattern is detected.
checkIssue <- function(string, issuelist) {
  hit_per_pattern <- str_detect(string, issuelist)
  status <- any(hit_per_pattern)
  status
}
# Check every entry of `string` with checkIssue().
#
# string:    character vector of issue names to check
# issuelist: known issues, passed through to checkIssue()
#
# Returns a logical vector aligned with `string` (logical(0) for empty input)
# and prints a message for each entry that is not found.
checkAllIssues <- function(string, issuelist) {
  # Preallocate the result rather than growing it from NULL
  status <- logical(length(string))
  # seq_along() is safe for empty input, unlike 1:length(string)
  for (i in seq_along(string)) {
    if (checkIssue(string[i], issuelist)) {
      status[i] <- TRUE
    } else {
      cat("Issue",string[i],"does not exist. Please try again.\n")
      status[i] <- FALSE
    }
  }
  status
}
# Set up a separate per-day issue counter for the coding sample.
c_issues <- data.frame(date = drange)
c_issuelist <- xmlToList("issues.xml")
# Derive the heads from the coding-sample list itself instead of relying on the
# globals `issuelist` / `issueheads` happening to exist and to be in sync.
c_issueheads <- names(c_issuelist)
c_issues[c_issueheads] <- 0
source("issuecomp-codingsample-function.R")
# Work on a copy so the coding sample does not touch `tweets`
c_tweets <- tweets
View(c_tweets)
source("issuecomp-codingsample-function.R")
# Ad-hoc checks of smartPatternMatch against a tweet containing "Menschenrechten"
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)
# Fuzzy whole-word search for a tag inside a text.
#
# string:  text to search in
# pattern: regular expression for the tag; wrapped in "\\b" word boundaries here
# chars:   tag length, selects the tolerance passed to agrep() as
#          max.distance = list(all = ...): <= 4 -> 0, 5 to 7 -> 1, >= 8 -> 2
# acronym: TRUE makes the match case-sensitive (ignore.case = !acronym)
#
# Returns the agrep() result after convertLogical0() (defined outside this block).
smartPatternMatch <- function(string, pattern, chars, acronym) {
  pattern <- str_c("\\b", pattern, "\\b")
  if (chars <= 4) {
    max_edits <- 0
  } else if (chars >= 8) {
    # The leftover debug output cat("bla") was removed from this branch
    max_edits <- 2
  } else {
    max_edits <- 1
  }
  found <- agrep(pattern, string, max.distance = list(all = max_edits),
                 ignore.case = !acronym, fixed = FALSE)
  found <- convertLogical0(found)
  found
}
# Ad-hoc check: tag pattern "Menschenrechte" against a tweet containing "Menschenrechten".
# Note that the pattern already carries "\\b" and smartPatternMatch adds another pair.
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)
# Approximate whole-word match of a tag in a text.
#
# string:  text to search in
# pattern: regular expression for the tag; "\\b" is added on both sides
# chars:   tag length; picks the agrep() tolerance (all = 0 / 1 / 2)
# acronym: TRUE switches off case-insensitive matching
#
# Returns the agrep() result after convertLogical0() (defined outside this block).
smartPatternMatch <- function(string, pattern, chars, acronym) {
  bounded <- str_c("\\b", pattern, "\\b")
  if (chars <= 4) {
    max_edits <- 0
  } else if (chars >= 8) {
    max_edits <- 2
  } else {
    max_edits <- 1
  }
  hits <- agrep(bounded, string, max.distance = list(all = max_edits),
                ignore.case = !acronym, fixed = FALSE)
  result <- convertLogical0(hits)
  result
}
# Approximate whole-word match of a tag in a text.
#
# string:  text to search in
# pattern: regular expression for the tag; "\\b" is added on both sides
# chars:   tag length; <= 4 allows 0 edits, >= 8 allows 3, anything else 1
# acronym: TRUE switches off case-insensitive matching
#
# Returns the agrep() result after convertLogical0() (defined outside this block).
smartPatternMatch <- function(string, pattern, chars, acronym) {
  bounded <- str_c("\\b", pattern, "\\b")
  tolerance <- if (chars <= 4) {
    0
  } else if (chars >= 8) {
    3
  } else {
    1
  }
  hits <- agrep(bounded, string, max.distance = list(all = tolerance),
                ignore.case = !acronym, fixed = FALSE)
  result <- convertLogical0(hits)
  result
}
# Ad-hoc checks of smartPatternMatch with spelling variants of the tag pattern
# against the same tweet (exact, shortened, one letter changed, hyphenated).
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenracht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschen-recht\\b", 13, FALSE)
# Same tag pattern, now with the hyphen in the tweet text instead
smartPatternMatch("Höflich, aber klares Statement zu Menschen-Rechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschen-Rechte. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
# Medium-length tag (chars = 6).
# NOTE(review): "\\Tomate" starts with "\\T", not "\\b" - possibly a typo; confirm.
smartPatternMatch("Bla bla Tomate ", "\\Tomate\\b", 6, FALSE)
smartPatternMatch("Bla bla Tomaten bla bla", "\\Tomate\\b", 6, FALSE)
# Approximate whole-word match of a tag in a text.
#
# string:  text to search in
# pattern: regular expression for the tag; "\\b" is added on both sides
# chars:   tag length; <= 4 allows 0 edits, >= 8 allows 3, anything else 2
# acronym: TRUE switches off case-insensitive matching
#
# Returns the agrep() result after convertLogical0() (defined outside this block).
smartPatternMatch <- function(string, pattern, chars, acronym) {
  bounded <- str_c("\\b", pattern, "\\b")
  if (chars <= 4) {
    # Short tags must match exactly
    allowed <- list(all = 0)
  } else if (chars >= 8) {
    allowed <- list(all = 3)
  } else {
    allowed <- list(all = 2)
  }
  hits <- agrep(bounded, string, max.distance = allowed,
                ignore.case = !acronym, fixed = FALSE)
  result <- convertLogical0(hits)
  result
}
# Ad-hoc checks: the same text/pattern pairs with different `chars` values, i.e.
# different tolerance branches of smartPatternMatch.
# NOTE(review): the patterns start with "\\T", "\\M", "\\N" rather than "\\b" -
# possibly typos; confirm.
smartPatternMatch("Bla bla Tomaten bla bla", "\\Tomate\\b", 6, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Menschen\\b", 8, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Menschen\\b", 7, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Mensch\\b", 7, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Mensch\\b", 8, FALSE)
smartPatternMatch("Bla bla Nazis bla bla", "\\Nazis\\b", 8, FALSE)
smartPatternMatch("Bla bla Nazis bla bla", "\\Nazis\\b", 5, FALSE)
smartPatternMatch("Bla bla Nazis bla bla", "\\Nazi\\b", 4, FALSE)
smartPatternMatch("Bla bla Nazi bla bla", "\\Nazis\\b", 5, FALSE)
# Reload the coding-sample script
source("issuecomp-codingsample-function.R")
# Tag "Flüchtling" inside the compound "Flüchtlingsjunge": compare the fuzzy
# matcher with plain str_detect(), with and without word boundaries
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
str_detect("Der kleine Flüchtlingsjunge war", pattern = "\\bFlüchtling\\b")
str_detect("Der kleine Flüchtlingsjunge war", pattern = "Flüchtling")
str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")
# Fuzzy whole-word search for a tag inside a text.
#
# string:  text to search in
# pattern: regular expression for the tag; wrapped in "\\b" word boundaries here
# chars:   tag length, selects the tolerance passed to agrep() as
#          max.distance = list(all = ...): <= 4 -> 0, 5 to 7 -> 2, >= 8 -> 3
# acronym: TRUE makes the match case-sensitive (ignore.case = !acronym)
#
# Returns the agrep() result after convertLogical0() (defined outside this block).
smartPatternMatch <- function(string, pattern, chars, acronym) {
  pattern <- str_c("\\b", pattern, "\\b")
  if (chars <= 4) { # 4 or less
    max_edits <- 0
  } else if (chars >= 8) { # 8 or more
    # The leftover debug output cat(found) was removed from this branch
    max_edits <- 3
  } else { # 5,6,7
    max_edits <- 2
  }
  found <- agrep(pattern, string, max.distance = list(all = max_edits),
                 ignore.case = !acronym, fixed = FALSE)
  found <- convertLogical0(found)
  found
}
# Ad-hoc checks: plain substring detection on a text with a split word, then the
# fuzzy matcher on the compound "Flüchtlingsjunge" (same call run twice).
str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
# Fuzzy whole-word search for a tag inside a text.
#
# string:  text to search in
# pattern: regular expression for the tag; wrapped in "\\b" word boundaries here
# chars:   tag length, selects the tolerance passed to agrep() as
#          max.distance = list(all = ...): <= 4 -> 0, 5 to 7 -> 2, >= 8 -> 3
# acronym: TRUE makes the match case-sensitive (ignore.case = !acronym)
#
# Returns the agrep() result after convertLogical0() (defined outside this block).
smartPatternMatch <- function(string, pattern, chars, acronym) {
  pattern <- str_c("\\b", pattern, "\\b")
  if (chars <= 4) { # 4 or less
    max_edits <- 0
  } else if (chars >= 8) { # 8 or more
    # The leftover debug output cat("it's", found) was removed from this branch
    max_edits <- 3
  } else { # 5,6,7
    max_edits <- 2
  }
  found <- agrep(pattern, string, max.distance = list(all = max_edits),
                 ignore.case = !acronym, fixed = FALSE)
  found <- convertLogical0(found)
  found
}
# Ad-hoc checks: fuzzy matcher versus str_detect() on the compound word
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")
str_detect("Der kleine Flüchtlingsjunge war", pattern = "Flüchtling")
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlinge war", "\\bFlüchtling\\b", 9, FALSE)
# Base grep() for comparison: with and without word boundaries, and with
# fixed = TRUE (pattern taken literally) versus fixed = FALSE (regular expression)
grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=TRUE)
grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("Flüchtling","Der kleine Flücht-lingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
# Approximate whole-word match of a tag in a text, with a substring fallback
# for long tags.
#
# string:  text to search in
# pattern: regular expression for the tag (without word boundaries)
# chars:   tag length; <= 4 allows 0 edits, >= 8 allows 3, anything else 2
# acronym: TRUE switches off case-insensitive matching
#
# For chars >= 8, when the bounded fuzzy search yields nothing (convertLogical0()
# gives 0), the unbounded pattern is tried with grep().
#
# Returns the search result after convertLogical0() (defined outside this block).
smartPatternMatch <- function(string, pattern, chars, acronym) {
  bounded <- str_c("\\b", pattern, "\\b")
  if (chars <= 4) { # 4 or less
    max_edits <- 0
    retry_unbounded <- FALSE
  } else if (chars >= 8) { # 8 or more
    max_edits <- 3
    retry_unbounded <- TRUE
  } else { # 5,6,7
    max_edits <- 2
    retry_unbounded <- FALSE
  }
  hits <- agrep(bounded, string, max.distance = list(all = max_edits),
                ignore.case = !acronym, fixed = FALSE)
  if (retry_unbounded) {
    if (convertLogical0(hits) == 0) {
      hits <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE)
    }
  }
  result <- convertLogical0(hits)
  result
}
# Ad-hoc checks with the bare tag (no "\\b" in the argument): chars = 9 takes the
# ">= 8" branch of smartPatternMatch, chars = 7 the middle branch.
smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 7, FALSE)
# Load the recorded coding errors (all columns as character) and walk through
# them one by one.
# header = FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
c_errors <- read.csv("issuecomp-codingsample-error.csv", header = FALSE, sep=",", colClasses="character")
names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
# seq_len() makes the loop a no-op for an empty error file; 1:nrow() would
# iterate over c(1, 0)
for (r in seq_len(nrow(c_errors))) {
  # The c_err* variables are left in the global environment for the sourced script.
  # NOTE(review): presumably issuecomp-codingsample-function2.R reads them - confirm.
  c_errcode <- as.character(c_errors$code[r])
  c_errissue <- as.character(c_errors$issue[r])
  c_errtags <- as.character(c_errors$tags[r])
  c_errtext <- as.character(c_errors$text[r])
  c_errid <- as.character(c_errors$str_id[r])
  cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
  source("issuecomp-codingsample-function2.R")
}
View(c_errors)
# Inspect the matched tweets for single days and issues (agrar.204, agrar.402,
# agrar.403); viewMatchingTweets is defined outside this section.
viewMatchingTweets(date = "2014-05-10", issue = "agrar.204", id_folder)
viewMatchingTweets(date = "2014-05-10", issue = "agrar.402", id_folder)
viewMatchingTweets(date = "2014-01-10", issue = "agrar.402", id_folder)
viewMatchingTweets(date = "2014-01-20", issue = "agrar.402", id_folder)
viewMatchingTweets(date = "2014-01-10", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-04-10", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-05-10", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-02-11", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-08-01", issue = "agrar.403", id_folder)
# Interactive exploration of how issues.xml is represented by the XML package.
issuelist <- xmlToList("issues.xml")
issuelist
issuelist[[1]]
xmlTreeParse(file = "issues.xml")
View(issues)
issuelist
issueheads
issuelist[[1]]
# Alternative parsers: xmlTreeParse() and xmlParse()
issuelist2 <- xmlTreeParse(file = "issues.xml")
issuelist2[[1]]
issuelist2[[2]]
issuelist2[[1,2]]
# issuelist2[1    (incomplete expression from the console, disabled because it
#                  makes the file unparseable; the complete form follows)
issuelist2[1]
issuelist2$doc$file
issuelist2$doc$version
xmlParse("issues.xml")
issuelist2 <- xmlParse("issues.xml")
issuelist2[1]
issuelist2[2]
issuelist2
# Back to the xmlToList() result: access single issues by name
issuelist
issuelist$edu.606
issuelist$edu.606[1]
issuelist$edu.606[2]
issuelist$edu.606[3]
issueheads
issuelist$macro.100
length(issuelist$macro.100)
length(issuelist$macro.101)
length(issuelist$macro.103)
length(issuelist$macro.105)
issuelist$macro.105
issuelist$macro.105[2]
issueheads
as.character(issuelist[[1]])
as.character(issuelist[[2]])
# Access an issue through a name held in a variable
test <- issueheads[1]
test
# `$test` looks for an element literally named "test", not for the value of `test`
as.character(issuelist$test)
as.character(issuelist$macro.100)
as.character(issuelist[test])
# NOTE(review): the two-index forms below look like failed attempts (matrix-style
# indexing on the xmlToList() result) - confirm before running as a script
as.character(issuelist[test,1])
as.character(issuelist[1,test])
as.character(issuelist[test])
issuelist[test]
issuelist[test]
length(issuelist[test])
length(issuelist$macro.100)
issuelist$macro.100
test
issuelist[test]
issuelist[,test]
issuelist[,as.character(test)]
# `[[test]]` extracts the element itself, `[test]` keeps the enclosing list
issuelist[[test]]
issuelist[,test]
issuelist[test]
issuelist[[test]]
length(issuelist[[test]])
issuelist[[test]]
issuelist[[test]][1]
as.character(issuelist[[test]][1])
as.character(issuelist[[test]])
issueheads
issueheads[2]
as.character(issuelist[[i]])
as.character(issuelist[[1]])
as.character(issuelist[[test]])
# Dry run of the per-issue variables used by the matching loop
i <- 1
curissue <- issueheads[i]
curtags <- as.character(issuelist[[curissue]])
curfile <- str_c(id_folder,"/",curissue,".csv")
curissue
curtags
curfile
curtags[2]
# MATCH TWEETS ------------------------------------------------------------

# Start from an empty output folder for the per-issue hit logs
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)

# One row per day in `drange`, one zero-initialised counter column per issue
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""

# For every day, test each tweet of that day against the tags of every issue.
# A hit raises the day/issue counter in `issues`, is appended to the tweet's
# "issue"/"tags" columns and is logged to <id_folder>/<issue>.csv.
for (d in seq_len(nrow(issues))) {
  curdate <- issues$date[d]
  cat(as.character(curdate), "\n")

  # All tweets created on the current day
  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]

  # seq_len() skips days without tweets; 1:nrow() would iterate over c(1, 0)
  for (t in seq_len(nrow(tweets_curday))) {
    # Tweet text with the hashtag indicators (#) removed
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, "#", "")
    curid <- as.character(tweets_curday$id_str[t])

    # Test each single issue (not tag!), looked up by its name
    for (i in seq_along(issueheads)) {
      curissue <- issueheads[i]
      curtags <- as.character(issuelist[[curissue]])
      curfile <- str_c(id_folder, "/", curissue, ".csv")

      # Test the tags of this issue until the first one matches
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = "chars")

        # Only short tags (<= 4 chars) are checked for being an acronym;
        # smartPatternMatch matches acronyms case-sensitively.
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }

        # smartPatternMatch picks its fuzzy tolerance from the tag length
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
        if (tags_found == 1) {
          # Raise number of findings on this day for this issue by 1
          issues[d, curissue] <- issues[d, curissue] + 1

          # Append issue and first matched tag to the tweet's row(s);
          # the row mask is computed once instead of four times.
          is_curtweet <- tweets[, "id_str"] == curid
          tweets[is_curtweet, "issue"] <-
            str_c(tweets[is_curtweet, "issue"], curissue, ";")
          tweets[is_curtweet, "tags"] <-
            str_c(tweets[is_curtweet, "tags"], curtag, ";")

          # Log the hit for later inspection of the pattern matching
          write(str_c(curdate, ";\"", curid, "\";", curtag), curfile, append = TRUE)

          # One hit per issue and tweet is enough: skip the remaining tags
          break
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange
# Ad-hoc checks: inflected forms in the text against the 10-character tag
smartPatternMatch(string = "er ist pädophil ", pattern = "pädophilie", chars = 10, acronym = FALSE)
smartPatternMatch(string = "er ist pädophiler ", pattern = "pädophilie", chars = 10, acronym = FALSE)
smartPatternMatch(string = "er ist pädophiler ", pattern = "Pädophilie", chars = 10, acronym = FALSE)
smartPatternMatch(string = "er ist pädophiles ", pattern = "Pädophilie", chars = 10, acronym = FALSE)
# Reset the output folder and the result containers (same setup as before the
# matching loop)
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""
issueheads
issuelist <- xmlToList("issues.xml")
issuelist
issueheads
View(issues)
# Inspect the element named "text" of the parsed list and try several ways of
# blanking it (NULL removes the element, "" and NA overwrite it)
issuelist$text
issuelist$macro.100
issuelist$macro.101
issuelist$text
issuelist$text <- NULL
issueheads <- names(issuelist)
issueheads
issuelist
issuelist$text <- ""
issuelist
issuelist$text <- NA
issuelist
issuelist$text
issuelist$text[1]
issuelist$text[2]
issuelist$text[6]
issuelist$text[10]
# Re-read the XML and rebuild the result containers from scratch
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""
View(tweets)