# NOTE(review): this chunk opens inside a matching loop whose four `for`
# headers are not visible here. The statements up to "# /for drange" are the
# tail of that loop and are reproduced unchanged (only re-indented).
        # Check if tag is an acronym. If so, ignore.case will be deactivated
        # in smartPatternMatch
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }
        # Match current tweet with tag. If >= 5 letters allow 1 changed
        # letter, if >= 8 letters allow 2 (Levenshtein distance)
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
        if (tags_found == 1) {
          # cat("Matched", curtag, "with", curtext, "\n")
          issues[d, curissue] <- issues[d, curissue] + 1
          write(str_c(curdate, ";\"", curid, "\""), curfile, append = TRUE)
          break
        } else {
          # cat("Nothing found\n")
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange

# Spot checks: same tweet text, tag given in upper and in lower case, with the
# acronym flag set.
smartPatternMatch(
  "kerTips: Riker workplace tip: Flirt when no one else is looking. http",
  "IS", 2, TRUE
)
smartPatternMatch(
  "kerTips: Riker workplace tip: Flirt when no one else is looking. http",
  "is", 2, TRUE
)
viewMatchingTweets("2014-01-06", "issue.iraq", id_folder)

# MATCH TWEETS ------------------------------------------------------------
# Run A: strips "#" from the tweet text; each CSV line is `date;"id"`.
# The output folder is wiped and recreated before every run.
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
# One row per day in `drange`, one zero-initialised counter column per issue.
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
# seq_len()/seq_along() instead of 1:n so that empty days / empty lists are
# skipped rather than iterated as c(1, 0).
for (d in seq_len(nrow(issues))) {
  # Go through every day
  curdate <- issues$date[d]
  cat(as.character(curdate), "\n")
  # Put all tweets from specific day in a temporary DF
  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
  for (t in seq_len(nrow(tweets_curday))) {
    # Select the tweet's text and strip hashtag indicators (#); no case
    # folding is done here
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, "#", "")
    curid <- as.character(tweets_curday$id_str[t])
    # Now test each single issue (not tag!)
    for (i in seq_along(issuelist)) {
      curtags <- as.character(issuelist[[i]])
      curissue <- names(issuelist)[i]
      curfile <- str_c(id_folder, "/", curissue, ".csv")
      # Now test all tags of a single issue
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = "chars")
        # Check if tag is an acronym. If so, ignore.case will be deactivated
        # in smartPatternMatch
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }
        # Match current tweet with tag. If >= 5 letters allow 1 changed
        # letter, if >= 8 letters allow 2 (Levenshtein distance)
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
        if (tags_found == 1) {
          # cat("Matched", curtag, "with", curtext, "\n")
          issues[d, curissue] <- issues[d, curissue] + 1
          write(str_c(curdate, ";\"", curid, "\""), curfile, append = TRUE)
          # First matching tag is enough: count the tweet once per issue
          break
        } else {
          # cat("Nothing found\n")
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange

source("issuecomp-functions.R")

# MATCH TWEETS ------------------------------------------------------------
# Run B: as run A, but the matched tag is appended to each CSV line
# (`date;"id";tag`). The history entry lacked the comma before `curtag`
# (a syntax error); it is restored here.
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
for (d in seq_len(nrow(issues))) {
  # Go through every day
  curdate <- issues$date[d]
  cat(as.character(curdate), "\n")
  # Put all tweets from specific day in a temporary DF
  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
  for (t in seq_len(nrow(tweets_curday))) {
    # Select the tweet's text and strip hashtag indicators (#); no case
    # folding is done here
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, "#", "")
    curid <- as.character(tweets_curday$id_str[t])
    # Now test each single issue (not tag!)
    for (i in seq_along(issuelist)) {
      curtags <- as.character(issuelist[[i]])
      curissue <- names(issuelist)[i]
      curfile <- str_c(id_folder, "/", curissue, ".csv")
      # Now test all tags of a single issue
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = "chars")
        # Check if tag is an acronym. If so, ignore.case will be deactivated
        # in smartPatternMatch
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }
        # Match current tweet with tag. If >= 5 letters allow 1 changed
        # letter, if >= 8 letters allow 2 (Levenshtein distance)
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
        if (tags_found == 1) {
          # cat("Matched", curtag, "with", curtext, "\n")
          issues[d, curissue] <- issues[d, curissue] + 1
          write(
            str_c(curdate, ";\"", curid, "\";", curtag),
            curfile,
            append = TRUE
          )
          # First matching tag is enough: count the tweet once per issue
          break
        } else {
          # cat("Nothing found\n")
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange

# MATCH TWEETS ------------------------------------------------------------
# Run C: same as run B (tag appended to each CSV line).
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
for (d in seq_len(nrow(issues))) {
  # Go through every day
  curdate <- issues$date[d]
  cat(as.character(curdate), "\n")
  # Put all tweets from specific day in a temporary DF
  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
  for (t in seq_len(nrow(tweets_curday))) {
    # Select the tweet's text and strip hashtag indicators (#); no case
    # folding is done here
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, "#", "")
    curid <- as.character(tweets_curday$id_str[t])
    # Now test each single issue (not tag!)
    for (i in seq_along(issuelist)) {
      curtags <- as.character(issuelist[[i]])
      curissue <- names(issuelist)[i]
      curfile <- str_c(id_folder, "/", curissue, ".csv")
      # Now test all tags of a single issue
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = "chars")
        # Check if tag is an acronym. If so, ignore.case will be deactivated
        # in smartPatternMatch
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }
        # Match current tweet with tag. If >= 5 letters allow 1 changed
        # letter, if >= 8 letters allow 2 (Levenshtein distance)
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
        if (tags_found == 1) {
          # cat("Matched", curtag, "with", curtext, "\n")
          issues[d, curissue] <- issues[d, curissue] + 1
          write(
            str_c(curdate, ";\"", curid, "\";", curtag),
            curfile,
            append = TRUE
          )
          # First matching tag is enough: count the tweet once per issue
          break
        } else {
          # cat("Nothing found\n")
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange

source("issuecomp-functions.R")
viewMatchingTweets("2014-01-06", "issue.iraq", id_folder)
viewMatchingTweets("2014-01-07", "issue.iraq", id_folder)
viewMatchingTweets("2014-01-09", "issue.iraq", id_folder)

# URL-masking experiments on a sample tweet: greedy vs. lazy quantifier and
# different terminators (\W, \s).
# NOTE(review): this literal was split across a wrapped line in the history;
# it is rejoined with a single space -- confirm against the original tweet.
curtext <- "Willkürlich Menschen an ihrer #Versammlungsfreiheit zu hindern ist eindeutig rechtswidrig. http://t.co/A7IQfISIhP #Gefahrengebiet #Hamburg"
str_replace_all(curtext, "http://.+\\W", "")
str_replace_all(curtext, "http://.+?\\W", "")
str_replace_all(curtext, "http://.+?\\s", "")
str_replace_all(curtext, "http://.+?\\s", "")
curtext <- "test http://google.de haha http://nsa.gov eqiuhe"
str_replace_all(curtext, "http://.+?\\s", "")
str_replace_all(curtext, "http://.+?\\s", "URL")
str_replace_all(curtext, "http://.+?\\s", "URL ")
viewMatchingTweets("2014-01-09", "issue.iraq", id_folder)

# MATCH TWEETS ------------------------------------------------------------
# Run D: additionally masks every "http://...<whitespace>" span as "URL "
# before matching.
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
for (d in seq_len(nrow(issues))) {
  # Go through every day
  curdate <- issues$date[d]
  cat(as.character(curdate), "\n")
  # Put all tweets from specific day in a temporary DF
  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
  for (t in seq_len(nrow(tweets_curday))) {
    # Select the tweet's text, strip hashtag indicators (#) and mask URLs;
    # no case folding is done here
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, "#", "")
    curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
    curid <- as.character(tweets_curday$id_str[t])
    # Now test each single issue (not tag!)
    for (i in seq_along(issuelist)) {
      curtags <- as.character(issuelist[[i]])
      curissue <- names(issuelist)[i]
      curfile <- str_c(id_folder, "/", curissue, ".csv")
      # Now test all tags of a single issue
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = "chars")
        # Check if tag is an acronym. If so, ignore.case will be deactivated
        # in smartPatternMatch
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }
        # Match current tweet with tag. If >= 5 letters allow 1 changed
        # letter, if >= 8 letters allow 2 (Levenshtein distance)
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
        if (tags_found == 1) {
          # cat("Matched", curtag, "with", curtext, "\n")
          issues[d, curissue] <- issues[d, curissue] + 1
          write(
            str_c(curdate, ";\"", curid, "\";", curtag),
            curfile,
            append = TRUE
          )
          # First matching tag is enough: count the tweet once per issue
          break
        } else {
          # cat("Nothing found\n")
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange

viewMatchingTweets("2014-01-09", "issue.iraq", id_folder)
viewMatchingTweets("2014-01-08", "issue.iraq", id_folder)
viewMatchingTweets("2014-01-10", "issue.iraq", id_folder)

# More URL-masking experiments. These reuse `curtext`, `tweets_curday` and `t`
# as left behind by the loop above.
curtext
str_replace_all(curtext, "http://.+?\\>", "URL ")
str_replace_all(curtext, "http://.+?\\<", "URL ")
curtext <- str_replace_all(curtext, "http://.+?\\b", "URL ")
str_replace_all(curtext, "http://.+?\\b", "URL ")
str_replace_all(curtext, "http://.+?\\s", "URL ")
curtext
curtext <- as.character(tweets_curday$text[t])
curtext
str_replace_all(curtext, "http://.+?\\s", "URL ")
str_replace_all(curtext, "http://.+?\\b", "URL ")
str_replace_all(curtext, "http://.+?\\<", "URL ")
str_replace_all(curtext, "http://.+?\\>", "URL ")
str_replace_all(curtext, "http://.+?\\s", "URL ")
# Replacing the end-of-string anchor appends a trailing space, so a URL at the
# very end of the text is also followed by whitespace.
str_replace_all(curtext, "$", " ")
curtext <- str_replace_all(curtext, "$", " ")
curtext
str_replace_all(curtext, "http://.+?\\s", "URL ")
viewMatchingTweets("2014-01-10", "issue.iraq", id_folder)

# MATCH TWEETS ------------------------------------------------------------
# Run E: appends a trailing space before URL masking so that URLs at the end
# of a tweet are masked too.
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
for (d in seq_len(nrow(issues))) {
  # Go through every day
  curdate <- issues$date[d]
  cat(as.character(curdate), "\n")
  # Put all tweets from specific day in a temporary DF
  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
  for (t in seq_len(nrow(tweets_curday))) {
    # Select the tweet's text, strip hashtag indicators (#), append a
    # trailing space and mask URLs; no case folding is done here
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, "#", "")
    curtext <- str_replace_all(curtext, "$", " ")
    curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
    curid <- as.character(tweets_curday$id_str[t])
    # Now test each single issue (not tag!)
    for (i in seq_along(issuelist)) {
      curtags <- as.character(issuelist[[i]])
      curissue <- names(issuelist)[i]
      curfile <- str_c(id_folder, "/", curissue, ".csv")
      # Now test all tags of a single issue
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = "chars")
        # Check if tag is an acronym. If so, ignore.case will be deactivated
        # in smartPatternMatch
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }
        # Match current tweet with tag. If >= 5 letters allow 1 changed
        # letter, if >= 8 letters allow 2 (Levenshtein distance)
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
        if (tags_found == 1) {
          # cat("Matched", curtag, "with", curtext, "\n")
          issues[d, curissue] <- issues[d, curissue] + 1
          write(
            str_c(curdate, ";\"", curid, "\";", curtag),
            curfile,
            append = TRUE
          )
          # First matching tag is enough: count the tweet once per issue
          break
        } else {
          # cat("Nothing found\n")
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange

viewMatchingTweets("2014-01-10", "issue.iraq", id_folder)

# MATCH TWEETS ------------------------------------------------------------
# Run F: same preprocessing and output format as run E.
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
for (d in seq_len(nrow(issues))) {
  # Go through every day
  curdate <- issues$date[d]
  cat(as.character(curdate), "\n")
  # Put all tweets from specific day in a temporary DF
  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
  for (t in seq_len(nrow(tweets_curday))) {
    # Select the tweet's text, strip hashtag indicators (#), append a
    # trailing space and mask URLs; no case folding is done here
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, "#", "")
    curtext <- str_replace_all(curtext, "$", " ")
    curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
    curid <- as.character(tweets_curday$id_str[t])
    # Now test each single issue (not tag!)
    for (i in seq_along(issuelist)) {
      curtags <- as.character(issuelist[[i]])
      curissue <- names(issuelist)[i]
      curfile <- str_c(id_folder, "/", curissue, ".csv")
      # Now test all tags of a single issue
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = "chars")
        # Check if tag is an acronym. If so, ignore.case will be deactivated
        # in smartPatternMatch
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }
        # Match current tweet with tag. If >= 5 letters allow 1 changed
        # letter, if >= 8 letters allow 2 (Levenshtein distance)
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
        if (tags_found == 1) {
          # cat("Matched", curtag, "with", curtext, "\n")
          issues[d, curissue] <- issues[d, curissue] + 1
          write(
            str_c(curdate, ";\"", curid, "\";", curtag),
            curfile,
            append = TRUE
          )
          # First matching tag is enough: count the tweet once per issue
          break
        } else {
          # cat("Nothing found\n")
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange

View(issues)
viewMatchingTweets("2014-12-18", "issue.edathy", id_folder)

# Plot the per-day issue counts: long format first, then raw lines and
# loess-smoothed curves.
issues_melt <- melt(issues, id = "date")
ggplot(
  issues_melt,
  aes(x = date, y = value, colour = variable, group = variable)
) +
  geom_line(size = 1)
ggplot(
  issues_melt,
  aes(x = date, y = value, colour = variable, group = variable)
) +
  geom_smooth(size = 1, method = "loess", formula = y ~ x, se = FALSE)
ggplot(
  issues_melt,
  aes(x = date, y = value, colour = variable, group = variable)
) +
  geom_line(size = 1)
ggplot(
  issues_melt,
  aes(x = date, y = value, colour = variable, group = variable)
) +
  geom_line(size = 1)
ggplot(
  issues_melt,
  aes(x = date, y = value, colour = variable, group = variable)
) +
  geom_smooth(size = 1, method = "loess", formula = y ~ x, se = FALSE)
viewMatchingTweets("2014-12-18", "issue.conservative", id_folder)

# Approximate-matching experiments with agrep(): effect of word-boundary
# anchors, edit distance and case sensitivity.
# NOTE(review): the text argument of the first call was split across a wrapped
# line in the history; it is rejoined with a single space.
agrep(
  "christ", "Jungparlamentarier gleich Schriftführerdienst hat",
  max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE
)
agrep(
  "\\bchrist\\b", "Jungparlamentarier gleich Schriftführerdienst hat",
  max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE
)
agrep(
  "\\bchrist\\b", "Bla Christ bla",
  max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE
)
agrep(
  "\\bchrist\\b", "Bla Christus bla",
  max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE
)
agrep(
  "\\bchrist\\b", "Bla Christu bla",
  max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE
)
agrep(
  "\\bchrist\\b", "Bla Christus bla",
  max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE
)
agrep(
  "\\bchrist\\b", "Bla Christus bla",
  max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE
)
agrep(
  "\\bchrist\\b", "Bla christus bla",
  max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE
)
agrep(
  "\\bchrist\\b", "Bla christus bla",
  max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE
)
agrep(
  "\\bchrist\\b", "Bla christen bla",
  max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE
)
agrep(
  "\\bchrist\\b", "Bla Antichrist bla",
  max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE
)
agrep(
  "\\bchrist\\b", "Bla Christian bla",
  max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE
)
agrep(
  "\\bchrist\\b", "Bla Christian bla",
  max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE,
  value = TRUE
)
agrep(
  "\\bchrist\\b", "Bla Christi bla",
  max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE,
  value = TRUE
)
agrep(
  "\\bchrist\\b", "Bla Christi bla",
  max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE
)
agrep(
  "\\bIS\\b", "Wir sind bei ISN Network",
  max.distance = list(all = 0), ignore.case = TRUE, fixed = FALSE
)
agrep(
  "\\bIS\\b", "Wir sind bei ISN Network",
  max.distance = list(all = 0), ignore.case = FALSE, fixed = FALSE
)

# Start of the next matching run: reset the output folder and the per-day
# counter table.
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
# Continuation of a matching run: `issues` and `id_folder` are expected to
# have been (re)initialised by the statements immediately preceding this line.
# Load the issue -> tags mapping and add one zero-initialised counter column
# per issue.
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
# seq_len()/seq_along() instead of 1:n so that empty days / empty lists are
# skipped rather than iterated as c(1, 0).
for (d in seq_len(nrow(issues))) {
  # Go through every day
  curdate <- issues$date[d]
  cat(as.character(curdate), "\n")
  # Put all tweets from specific day in a temporary DF
  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
  for (t in seq_len(nrow(tweets_curday))) {
    # Select the tweet's text, strip hashtag indicators (#), append a
    # trailing space (so a URL at the very end is followed by whitespace)
    # and mask URLs; no case folding is done here
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, "#", "")
    curtext <- str_replace_all(curtext, "$", " ")
    curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
    curid <- as.character(tweets_curday$id_str[t])
    # Now test each single issue (not tag!)
    for (i in seq_along(issuelist)) {
      curtags <- as.character(issuelist[[i]])
      curissue <- names(issuelist)[i]
      curfile <- str_c(id_folder, "/", curissue, ".csv")
      # Now test all tags of a single issue
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = "chars")
        # Check if tag is an acronym. If so, ignore.case will be deactivated
        # in smartPatternMatch
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }
        # Match current tweet with tag. If >= 5 letters allow 1 changed
        # letter, if >= 8 letters allow 2 (Levenshtein distance)
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
        if (tags_found == 1) {
          # cat("Matched", curtag, "with", curtext, "\n")
          issues[d, curissue] <- issues[d, curissue] + 1
          # CSV line format: date;"id";tag
          write(
            str_c(curdate, ";\"", curid, "\";", curtag),
            curfile,
            append = TRUE
          )
          # First matching tag is enough: count the tweet once per issue
          break
        } else {
          # cat("Nothing found\n")
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange

# Plot loess-smoothed per-day issue counts.
issues_melt <- melt(issues, id = "date")
ggplot(
  issues_melt,
  aes(x = date, y = value, colour = variable, group = variable)
) +
  geom_smooth(size = 1, method = "loess", formula = y ~ x, se = FALSE)
viewMatchingTweets("2014-12-18", "issue.conservative", id_folder)

# NOTE(review): bare console probe; `pattern` is not assigned anywhere in the
# visible code, so this presumably fails when the file is sourced -- confirm.
pattern
agrep(
  "\\bchrist\\b",
  "RT @christophheyes: Morgen in der Presse: Oppermann - Briefkasten gestohlen! Gabriel - Poesiealbum nicht mehr auffindbar! #edathy #hartmann",
  max.distance = list(all = 1), ignore.case = TRUE, fixed = FALSE
)
# Print the helper's definition before and after reloading the helper file.
smartPatternMatch
source("issuecomp-functions.R")
smartPatternMatch

# MATCH TWEETS ------------------------------------------------------------
# Full re-run after reloading the helper functions; same preprocessing and
# output format as above. The output folder is wiped and recreated first.
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
for (d in seq_len(nrow(issues))) {
  # Go through every day
  curdate <- issues$date[d]
  cat(as.character(curdate), "\n")
  # Put all tweets from specific day in a temporary DF
  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
  for (t in seq_len(nrow(tweets_curday))) {
    # Select the tweet's text, strip hashtag indicators (#), append a
    # trailing space and mask URLs; no case folding is done here
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, "#", "")
    curtext <- str_replace_all(curtext, "$", " ")
    curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
    curid <- as.character(tweets_curday$id_str[t])
    # Now test each single issue (not tag!)
    for (i in seq_along(issuelist)) {
      curtags <- as.character(issuelist[[i]])
      curissue <- names(issuelist)[i]
      curfile <- str_c(id_folder, "/", curissue, ".csv")
      # Now test all tags of a single issue
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = "chars")
        # Check if tag is an acronym. If so, ignore.case will be deactivated
        # in smartPatternMatch
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }
        # Match current tweet with tag. If >= 5 letters allow 1 changed
        # letter, if >= 8 letters allow 2 (Levenshtein distance)
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
        if (tags_found == 1) {
          # cat("Matched", curtag, "with", curtext, "\n")
          issues[d, curissue] <- issues[d, curissue] + 1
          write(
            str_c(curdate, ";\"", curid, "\";", curtag),
            curfile,
            append = TRUE
          )
          # First matching tag is enough: count the tweet once per issue
          break
        } else {
          # cat("Nothing found\n")
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange

issues_melt <- melt(issues, id = "date")
ggplot(
  issues_melt,
  aes(x = date, y = value, colour = variable, group = variable)
) +
  geom_smooth(size = 1, method = "loess", formula = y ~ x, se = FALSE)

# Manual inspection of matched tweets for selected days and issues.
viewMatchingTweets("2014-12-18", "issue.conservative", id_folder)
viewMatchingTweets("2014-05-18", "issue.conservative", id_folder)
# NOTE(review): "2014-05-1" looks like a mistyped date; the next call repeats
# it as "2014-05-01".
viewMatchingTweets("2014-05-1", "issue.conservative", id_folder)
viewMatchingTweets("2014-05-01", "issue.conservative", id_folder)
viewMatchingTweets("2014-05-02", "issue.conservative", id_folder)
viewMatchingTweets("2014-05-10", "issue.conservative", id_folder)
viewMatchingTweets("2014-05-10", "issue.middleeast", id_folder)
# NOTE(review): "issue.iraw" looks like a typo for "issue.iraq", which is used
# in the next call.
viewMatchingTweets("2014-05-10", "issue.iraw", id_folder)
viewMatchingTweets("2014-05-10", "issue.iraq", id_folder)
viewMatchingTweets("2014-08-10", "issue.iraq", id_folder)
viewMatchingTweets("2014-11-10", "issue.iraq", id_folder)
viewMatchingTweets("2014-12-10", "issue.iraq", id_folder)
View(issues)
viewMatchingTweets("2014-09-19", "issue.control", id_folder)