# Replay of an interactive session that matches tweets against issue tags and
# tunes smartPatternMatch(). Statements are kept in their original order.
#
# Objects and helpers used below but not defined in this part of the file:
#   tweets, drange, checkAcronym(), convertLogical0(), viewMatchingTweets().
# smartPatternMatch() is called before its first definition below, so it is
# presumably provided by "issuecomp-codingsample-function.R" or an earlier
# part of the session -- TODO confirm.
#
# NOTE(review): the exploration section contains probes that index `issuelist`
# with matrix-style subscripts (e.g. issuelist[, test]); these look like they
# fail at run time, so this file is presumably meant to be stepped through
# interactively rather than source()d in one go.

tweets$tags <- ""

for (d in seq_len(nrow(issues))) {
  # Go through every day
  curdate <- issues$date[d]
  cat(as.character(curdate),"\n")

  # Put all tweets from specific day in a temporary DF
  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]

  # seq_len() skips days without tweets (1:0 would iterate over c(1, 0))
  for (t in seq_len(nrow(tweets_curday))) {
    # Select tweet's text and remove hashtag indicators (#)
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, "#", "")
    curid <- as.character(tweets_curday$id_str[t])

    # Now test each single issue (not tag!)
    for (i in seq_along(issuelist)) {
      curtags <- as.character(issuelist[[i]])
      curissue <- names(issuelist)[i]
      curfile <- str_c(id_folder,"/",curissue,".csv")

      # Now test all tags of a single issue
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = "chars")

        # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }

        # Match current tweet with tag; smartPatternMatch() receives the tag
        # length and acronym flag to decide how strict the match is
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)

        if (tags_found == 1) {
          # Raise number of findings on this day for this issue by 1
          issues[d,curissue] <- issues[d,curissue] + 1

          # Add issue and first matched tag of tweet to tweets-DF
          oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
          tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
          oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
          tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")

          # Add information to file for function viewPatternMatching
          write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)

          # Stop after the first matching tag of this issue
          break
        } else {
          #cat("Nothing found\n")
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange

View(issues)

# MATCH TWEETS ------------------------------------------------------------

id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)

issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""

for (d in seq_len(nrow(issues))) {
  # Go through every day
  curdate <- issues$date[d]
  cat(as.character(curdate),"\n")

  # Put all tweets from specific day in a temporary DF
  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]

  # seq_len() skips days without tweets (1:0 would iterate over c(1, 0))
  for (t in seq_len(nrow(tweets_curday))) {
    # Select tweet's text and remove hashtag indicators (#)
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, "#", "")
    curid <- as.character(tweets_curday$id_str[t])

    # Now test each single issue (not tag!)
    for (i in seq_along(issuelist)) {
      curtags <- as.character(issuelist[[i]])
      curissue <- names(issuelist)[i]
      curfile <- str_c(id_folder,"/",curissue,".csv")

      # Now test all tags of a single issue
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = "chars")

        # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }

        # Match current tweet with tag; smartPatternMatch() receives the tag
        # length and acronym flag to decide how strict the match is
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)

        if (tags_found == 1) {
          # Raise number of findings on this day for this issue by 1
          issues[d,curissue] <- issues[d,curissue] + 1

          # Add issue and first matched tag of tweet to tweets-DF
          oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
          tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
          oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
          tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")

          # Add information to file for function viewPatternMatching
          write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)

          # Stop after the first matching tag of this issue
          break
        } else {
          #cat("Nothing found\n")
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange

#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found)

View(issues)

# The session first tried `save(issues, "issues.RData")`. That form treats the
# string as the name of a second object and leaves `file` unset, so it aborts.
# Only the corrected call is kept.
save(issues, file="issues.RData")

# Prompt the user with `question` and return the typed answer as character.
readYN <- function(question) {
  n <- readline(prompt=question)
  n <- as.character(n)
  return(n)
}

# TRUE if `string` matches at least one entry of `issuelist`
# (each entry is used as a str_detect() pattern).
checkIssue <- function(string, issuelist) {
  status <- any(str_detect(string, issuelist))
  return(status)
}

# Check every element of `string` with checkIssue(); prints a message for each
# unknown issue and returns one logical per element.
checkAllIssues <- function(string, issuelist) {
  status <- logical(length(string))
  for (i in seq_along(string)) {
    if (checkIssue(string[i], issuelist)) {
      status[i] <- TRUE
    } else {
      cat("Issue",string[i],"does not exist. Please try again.\n")
      status[i] <- FALSE
    }
  }
  return(status)
}

# library() fails loudly if a package is missing; require() only returns FALSE
library(stringr)
library(XML)
library(stringr)
library(XML)

# FUNCTIONS ---------------------------------------------------------------

# Prompt the user with `question` and return the typed answer as character.
readYN <- function(question) {
  n <- readline(prompt=question)
  n <- as.character(n)
  return(n)
}

# TRUE if `string` matches at least one entry of `issuelist`
# (each entry is used as a str_detect() pattern).
checkIssue <- function(string, issuelist) {
  status <- any(str_detect(string, issuelist))
  return(status)
}

# Check every element of `string` with checkIssue(); prints a message for each
# unknown issue and returns one logical per element.
checkAllIssues <- function(string, issuelist) {
  status <- logical(length(string))
  for (i in seq_along(string)) {
    if (checkIssue(string[i], issuelist)) {
      status[i] <- TRUE
    } else {
      cat("Issue",string[i],"does not exist. Please try again.\n")
      status[i] <- FALSE
    }
  }
  return(status)
}

c_issues <- data.frame(date = drange)
c_issuelist <- xmlToList("issues.xml")
# Use the c_* objects built here instead of the unrelated globals
# `issuelist` / `issueheads` that the session originally referenced
c_issueheads <- names(c_issuelist)
c_issues[c_issueheads] <- 0

source("issuecomp-codingsample-function.R")
c_tweets <- tweets
View(c_tweets)
source("issuecomp-codingsample-function.R")

# EXPERIMENTS: smartPatternMatch tolerance ---------------------------------

smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)

# Variant 1: max edit distance 0 (<= 4 chars), 2 (>= 8 chars), 1 (otherwise);
# prints a debug marker in the >= 8 branch.
smartPatternMatch <- function(string, pattern, chars, acronym) {
  pattern <- str_c("\\b", pattern, "\\b")
  if (chars <= 4) {
    found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
  } else if (chars >= 8) {
    cat("bla")
    found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
  } else {
    found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
  }
  found <- convertLogical0(found)
  return(found)
}

smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)

# Variant 2: same distances as variant 1, debug output removed.
smartPatternMatch <- function(string, pattern, chars, acronym) {
  pattern <- str_c("\\b", pattern, "\\b")
  if (chars <= 4) {
    found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
  } else if (chars >= 8) {
    found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
  } else {
    found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
  }
  found <- convertLogical0(found)
  return(found)
}

# Variant 3: max edit distance for >= 8 chars raised from 2 to 3.
smartPatternMatch <- function(string, pattern, chars, acronym) {
  pattern <- str_c("\\b", pattern, "\\b")
  if (chars <= 4) {
    found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
  } else if (chars >= 8) {
    found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
  } else {
    found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
  }
  found <- convertLogical0(found)
  return(found)
}

smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenracht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschen-recht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschen-Rechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschen-Rechte. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
# NOTE(review): the patterns below start with "\\T", "\\M", "\\N" instead of
# "\\b"; this looks like a typo in the session -- left untouched, confirm.
smartPatternMatch("Bla bla Tomate ", "\\Tomate\\b", 6, FALSE)
smartPatternMatch("Bla bla Tomaten bla bla", "\\Tomate\\b", 6, FALSE)

# Variant 4: max edit distance for 5-7 chars raised from 1 to 2.
smartPatternMatch <- function(string, pattern, chars, acronym) {
  pattern <- str_c("\\b", pattern, "\\b")
  if (chars <= 4) {
    found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
  } else if (chars >= 8) {
    found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
  } else {
    found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
  }
  found <- convertLogical0(found)
  return(found)
}

smartPatternMatch("Bla bla Tomaten bla bla", "\\Tomate\\b", 6, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Menschen\\b", 8, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Menschen\\b", 7, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Mensch\\b", 7, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Mensch\\b", 8, FALSE)
smartPatternMatch("Bla bla Nazis bla bla", "\\Nazis\\b", 8, FALSE)
smartPatternMatch("Bla bla Nazis bla bla", "\\Nazis\\b", 5, FALSE)
smartPatternMatch("Bla bla Nazis bla bla", "\\Nazi\\b", 4, FALSE)
smartPatternMatch("Bla bla Nazi bla bla", "\\Nazis\\b", 5, FALSE)

source("issuecomp-codingsample-function.R")

smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
str_detect("Der kleine Flüchtlingsjunge war", pattern = "\\bFlüchtling\\b")
str_detect("Der kleine Flüchtlingsjunge war", pattern = "Flüchtling")
str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")

# Variant 5: distances as in variant 4; prints the raw agrep() result in the
# >= 8 branch for debugging.
smartPatternMatch <- function(string, pattern, chars, acronym) {
  pattern <- str_c("\\b", pattern, "\\b")
  if (chars <= 4) {
    # 4 or less
    found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
  } else if (chars >= 8) {
    # 8 or more
    found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
    cat(found)
  } else {
    # 5,6,7
    found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
  }
  found <- convertLogical0(found)
  return(found)
}

str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)

# Variant 6: as variant 5, with a labelled debug print.
smartPatternMatch <- function(string, pattern, chars, acronym) {
  pattern <- str_c("\\b", pattern, "\\b")
  if (chars <= 4) {
    # 4 or less
    found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
  } else if (chars >= 8) {
    # 8 or more
    found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
    cat("it's",found)
  } else {
    # 5,6,7
    found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
  }
  found <- convertLogical0(found)
  return(found)
}

smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")
str_detect("Der kleine Flüchtlingsjunge war", pattern = "Flüchtling")
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlinge war", "\\bFlüchtling\\b", 9, FALSE)
grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=TRUE)
grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("Flüchtling","Der kleine Flücht-lingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)

# Variant 7: approximate match on the word-bounded pattern; for tags of 8 or
# more characters, if convertLogical0() maps the agrep() result to 0, fall
# back to a plain grep() of the unbounded pattern (finds the tag inside
# longer compound words).
#   string  - text to search
#   pattern - tag, without word-boundary markers
#   chars   - tag length, selects the allowed edit distance
#   acronym - if TRUE, matching is case-sensitive
# Returns the result of convertLogical0() applied to the match vector.
smartPatternMatch <- function(string, pattern, chars, acronym) {
  patternrex <- str_c("\\b", pattern, "\\b")
  if (chars <= 4) {
    # 4 or less
    found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
  } else if (chars >= 8) {
    # 8 or more
    found <- agrep(patternrex, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
    if (convertLogical0(found) == 0) {
      found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE)
    }
  } else {
    # 5,6,7
    found <- agrep(patternrex, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
  }
  found <- convertLogical0(found)
  return(found)
}

smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 7, FALSE)

# CODING-SAMPLE ERRORS -----------------------------------------------------

c_errors <- read.csv("issuecomp-codingsample-error.csv", header = FALSE, sep=",", colClasses="character")
names(c_errors) <- c("str_id", "code", "issue", "tags", "text")

for (r in seq_len(nrow(c_errors))) {
  # These per-row variables are presumably read by the sourced script below
  c_errcode <- as.character(c_errors$code[r])
  c_errissue <- as.character(c_errors$issue[r])
  c_errtags <- as.character(c_errors$tags[r])
  c_errtext <- as.character(c_errors$text[r])
  c_errid <- as.character(c_errors$str_id[r])
  cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
  source("issuecomp-codingsample-function2.R")
}

View(c_errors)

viewMatchingTweets(date = "2014-05-10", issue = "agrar.204", id_folder)
viewMatchingTweets(date = "2014-05-10", issue = "agrar.402", id_folder)
viewMatchingTweets(date = "2014-01-10", issue = "agrar.402", id_folder)
viewMatchingTweets(date = "2014-01-20", issue = "agrar.402", id_folder)
viewMatchingTweets(date = "2014-01-10", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-04-10", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-05-10", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-02-11", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-08-01", issue = "agrar.403", id_folder)

# EXPLORATION: structure of issues.xml -------------------------------------

issuelist <- xmlToList("issues.xml")
issuelist
issuelist[[1]]
xmlTreeParse(file = "issues.xml")
View(issues)
issuelist
issueheads
issuelist[[1]]
issuelist2 <- xmlTreeParse(file = "issues.xml")
issuelist2[[1]]
issuelist2[[2]]
issuelist2[[1,2]]
# The session contained an unterminated `issuelist2[1` here (parse error);
# the completed expression follows.
issuelist2[1]
issuelist2$doc$file
issuelist2$doc$version
xmlParse("issues.xml")
issuelist2 <- xmlParse("issues.xml")
issuelist2[1]
issuelist2[2]
issuelist2
issuelist
issuelist$edu.606
issuelist$edu.606[1]
issuelist$edu.606[2]
issuelist$edu.606[3]
issueheads
issuelist$macro.100
length(issuelist$macro.100)
length(issuelist$macro.101)
length(issuelist$macro.103)
length(issuelist$macro.105)
issuelist$macro.105
issuelist$macro.105[2]
issueheads
as.character(issuelist[[1]])
as.character(issuelist[[2]])
test <- issueheads[1]
test
# `$test` looks up an element literally named "test", not the value of `test`
as.character(issuelist$test)
as.character(issuelist$macro.100)
as.character(issuelist[test])
as.character(issuelist[test,1])
as.character(issuelist[1,test])
as.character(issuelist[test])
issuelist[test]
issuelist[test]
length(issuelist[test])
length(issuelist$macro.100)
issuelist$macro.100
test
issuelist[test]
issuelist[,test]
issuelist[,as.character(test)]
issuelist[[test]]
issuelist[,test]
issuelist[test]
issuelist[[test]]
length(issuelist[[test]])
issuelist[[test]]
issuelist[[test]][1]
as.character(issuelist[[test]][1])
as.character(issuelist[[test]])
issueheads
issueheads[2]
as.character(issuelist[[i]])
as.character(issuelist[[1]])
as.character(issuelist[[test]])
i <- 1
curissue <- issueheads[i]
curtags <- as.character(issuelist[[curissue]])
curfile <- str_c(id_folder,"/",curissue,".csv")
curissue
curtags
curfile
curtags[2]

# MATCH TWEETS ------------------------------------------------------------

id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)

issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""

for (d in seq_len(nrow(issues))) {
  # Go through every day
  curdate <- issues$date[d]
  cat(as.character(curdate),"\n")

  # Put all tweets from specific day in a temporary DF
  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]

  # seq_len() skips days without tweets (1:0 would iterate over c(1, 0))
  for (t in seq_len(nrow(tweets_curday))) {
    # Select tweet's text and remove hashtag indicators (#)
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, "#", "")
    curid <- as.character(tweets_curday$id_str[t])

    # Now test each single issue (not tag!), looked up by name
    for (i in seq_along(issueheads)) {
      curissue <- issueheads[i]
      curtags <- as.character(issuelist[[curissue]])
      curfile <- str_c(id_folder,"/",curissue,".csv")

      # Now test all tags of a single issue
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = "chars")

        # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }

        # Match current tweet with tag; smartPatternMatch() receives the tag
        # length and acronym flag to decide how strict the match is
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)

        if (tags_found == 1) {
          # Raise number of findings on this day for this issue by 1
          issues[d,curissue] <- issues[d,curissue] + 1

          # Add issue and first matched tag of tweet to tweets-DF
          oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
          tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
          oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
          tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")

          # Add information to file for function viewPatternMatching
          write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)

          # Stop after the first matching tag of this issue
          break
        } else {
          #cat("Nothing found\n")
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange

smartPatternMatch(string = "er ist pädophil ", pattern = "pädophilie", chars = 10, acronym = FALSE)
smartPatternMatch(string = "er ist pädophiler ", pattern = "pädophilie", chars = 10, acronym = FALSE)
smartPatternMatch(string = "er ist pädophiler ", pattern = "Pädophilie", chars = 10, acronym = FALSE)
smartPatternMatch(string = "er ist pädophiles ", pattern = "Pädophilie", chars = 10, acronym = FALSE)

id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)

issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""

# EXPLORATION: the `text` entry of issuelist --------------------------------

issueheads
issuelist <- xmlToList("issues.xml")
issuelist
issueheads
View(issues)
issuelist$text
issuelist$macro.100
issuelist$macro.101
issuelist$text
issuelist$text <- NULL
issueheads <- names(issuelist)
issueheads
issuelist
issuelist$text <- ""
issuelist
issuelist$text <- NA
issuelist
issuelist$text
issuelist$text[1]
issuelist$text[2]
issuelist$text[6]
issuelist$text[10]

issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""
View(tweets)