diff --git a/.Rhistory b/.Rhistory
index e8b2772..3664d88 100644
--- a/.Rhistory
+++ b/.Rhistory
@@ -1,7 +1,142 @@
+for(s in 1:length(curtags)) {
+curtag <- curtags[s]
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+if(curchars <= 4) {
+curacro <- checkAcronym(string = curtag, chars = curchars)
+} else {
+curacro <- FALSE
+}
+# Now expand the current tag by possible suffixes that may be plural forms
+if(!curacro) {
+for(e in 1:length(tagexpand)) {
+curtag[e] <- str_c(curtag[1], tagexpand[e])
+}
+}
+# Set Levenshtein distance depending on char length
+if(curchars <= 4) {
+curdistance <- 0
+} else {
+curdistance <- 1
+}
+# Match current tweet with tag: tags of 4 or fewer letters must match exactly, longer tags allow a Levenshtein distance of 1
+tags_found <- NULL
+# Match the tweet with each variation of tagexpand
+for(e in 1:length(curtag)) {
+tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
+}
+tags_found <- any(tags_found)
+curtag <- curtag[1]
+if(tags_found == TRUE) {
+# # Raise number of findings on this day for this issue by 1
+# issues[d,curissue] <- issues[d,curissue] + 1
+#
+# # Add issue and first matched tag of tweet to tweets-DF
+# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
+# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
+# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
+# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
+# Add information to file for function viewPatternMatching
+write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE)
+cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE)
+# data.frame(date=curdate, issue=curissue)
+break # next issue, no more tags from same issue
+}
+else {
+#cat("Nothing found\n")
+}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
+# MATCH TWEETS ------------------------------------------------------------
+id_folder <- "matched-ids"
+unlink(id_folder, recursive = TRUE)
+dir.create(id_folder)
+issues <- data.frame(date = drange)
+issuelist <- readLines("issues.xml")
+issuelist <- str_replace_all(string = issuelist, pattern = ".*", "")
+issuelist <- xmlToList(issuelist)
+issueheads <- names(issuelist)
+issues[issueheads] <- 0
+tweets$issue <- ""
+tweets$tags <- ""
+tagexpand <- c("", "s", "n", "en", "er", "e")
+# Parallelisation
+writeLines(c(""), "issuecomp-analysis.log")
+cl<-makeCluster(4)
+registerDoParallel(cl)
+df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
+#for(d in 1:nrow(issues)) {
+# Go through every day
+curdate <- issues$date[d]
+cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
+# Put all tweets from specific day in a temporary DF
+tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
+for(t in 1:nrow(tweets_curday)){
+cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
+# Select tweet's text and remove hashtag indicators (#)
+curtext <- as.character(tweets_curday$text[t])
+curtext <- str_replace_all(curtext, "#", "")
+curid <- as.character(tweets_curday$id_str[t])
+# Now test each single issue (not tag!)
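# --- smartPatternMatch() and checkAcronym() are defined in issuecomp-functions.R,
# which is sourced below but is not part of this diff. A minimal sketch of what
# they might look like, given the semantics the comments above imply (acronyms
# match case-sensitively and exactly, longer tags tolerate one edit); the real
# definitions may differ:
checkAcronym <- function(string, chars) {
# Treat an all-uppercase short tag (e.g. "NSA") as an acronym
string == toupper(string)
}
smartPatternMatch <- function(text, tag, distance, acronym) {
# agrepl() performs approximate matching up to a maximal edit (Levenshtein)
# distance; acronyms stay case-sensitive, everything else ignores case
agrepl(tag, text, max.distance = distance, ignore.case = !acronym, fixed = TRUE)
}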
+for(i in 1:length(issueheads)) {
+curissue <- issueheads[i]
+curtags <- as.character(issuelist[[curissue]])
+curfile <- str_c(id_folder,"/",curissue,".csv")
+# Now test all tags of a single issue
+for(s in 1:length(curtags)) {
+curtag <- curtags[s]
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+if(curchars <= 4) {
+curacro <- checkAcronym(string = curtag, chars = curchars)
+} else {
+curacro <- FALSE
+}
+# Now expand the current tag by possible suffixes that may be plural forms
+if(!curacro) {
+for(e in 1:length(tagexpand)) {
+curtag[e] <- str_c(curtag[1], tagexpand[e])
+}
+}
+# Set Levenshtein distance depending on char length
+if(curchars <= 4) {
+curdistance <- 0
+} else {
+curdistance <- 1
+}
+# Match current tweet with tag: tags of 4 or fewer letters must match exactly, longer tags allow a Levenshtein distance of 1
+tags_found <- NULL
+# Match the tweet with each variation of tagexpand
+for(e in 1:length(curtag)) {
+tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
+}
+tags_found <- any(tags_found)
+curtag <- curtag[1]
+if(tags_found == TRUE) {
+# # Raise number of findings on this day for this issue by 1
+# issues[d,curissue] <- issues[d,curissue] + 1
+#
+# # Add issue and first matched tag of tweet to tweets-DF
+# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
+# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
+# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
+# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
+# Add information to file for function viewPatternMatching
+write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE)
+cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE)
+# data.frame(date=curdate, issue=curissue)
+break # next issue, no more tags from same issue
+}
+else {
+#cat("Nothing found\n")
+}
+} # /for curtags
+} # /for issuelist
+} # /for tweets_curday
+} # /for drange
+#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found)
stopCluster(cl)
require(lubridate)
require(XML)
@@ -11,6 +146,12 @@ require(stringr)
library(foreach)
library(doParallel)
source("issuecomp-functions.R")
+load(file = "tweets_untagged.RData")
+# Create date range
+date_start <- as.Date("2014-01-01")
+date_end <- as.Date("2014-12-31")
+drange <- as.integer(date_end - date_start)
+drange <- date_start + days(0:drange)
# MATCH TWEETS ------------------------------------------------------------
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
@@ -19,426 +160,14 @@ issues <- data.frame(date = drange)
issuelist <- readLines("issues.xml")
issuelist <- str_replace_all(string = issuelist, pattern = ".*", "")
issuelist <- xmlToList(issuelist)
-issueheads <- names(issuelist)
-issues[issueheads] <- 0
-tweets$issue <- ""
-tweets$tags <- ""
-tagexpand <- c("", "s", "n", "en", "er")
-# Parallelisation
-writeLines(c(""), "issuecomp-analysis.log")
-cl<-makeCluster(3)
-registerDoParallel(cl)
-df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
-#for(d in 1:nrow(issues)) {
-# Go through every day
-curdate <- issues$date[d]
-cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
-# Put all tweets from specific day in a temporary DF
-tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
-for(t in 1:nrow(tweets_curday)){
-cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
-# Select tweet's text, make it lowercase and remove hashtag indicators (#)
-curtext <- as.character(tweets_curday$text[t])
-curtext <- str_replace_all(curtext, "#", "")
-curid <- as.character(tweets_curday$id_str[t])
-# Now test each single issue (not tag!)
-for(i in 1:length(issueheads)) {
-curissue <- issueheads[i]
-curtags <- as.character(issuelist[[curissue]])
-curfile <- str_c(id_folder,"/",curissue,".csv")
-# Now test all tags of a single issue
-for(s in 1:length(curtags)) {
-curtag <- curtags[s]
-curchars <- nchar(curtag, type = "chars")
-# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
-if(curchars <= 4) {
-curacro <- checkAcronym(string = curtag, chars = curchars)
-} else {
-curacro <- FALSE
-}
-# Now expand the current tag by possible suffixes that may be plural forms
-if(!curacro) {
-for(e in 1:length(tagexpand)) {
-curtag[e] <- str_c(curtag[1], tagexpand[e])
-}
-}
-# Set Levenshtein distance depending on char length
-if(curchars <= 4) {
-curdistance <- 0
-} else {
-curdistance <- 1
-}
-# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance)
-tags_found <- NULL
-# Match the tweet with each variation of tagexpand
-for(e in 1:length(curtag)) {
-tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
-}
-tags_found <- any(tags_found)
-curtag <- curtag[1]
-if(tags_found == TRUE) {
-# Raise number of findings on this day for this issue by 1
-issues[d,curissue] <- issues[d,curissue] + 1
-# Add issue and first matched tag of tweet to tweets-DF
-oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
-tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
-oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
-tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
-# Add information to file for function viewPatternMatching
-write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
-cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE)
-break
-}
-else {
-#cat("Nothing found\n")
-}
-} # /for curtags
-} # /for issuelist
-} # /for tweets_curday
-} # /for drange
-View(issues)
-require(lubridate)
-require(XML)
-require(ggplot2)
-require(reshape2)
-require(stringr)
-library(foreach)
-library(doParallel)
-# MATCH TWEETS ------------------------------------------------------------
-id_folder <- "matched-ids"
-unlink(id_folder, recursive = TRUE)
-dir.create(id_folder)
-issues <- data.frame(date = drange)
-issuelist <- readLines("issues.xml")
-issuelist <- str_replace_all(string = issuelist, pattern = ".*", "")
-issuelist <- xmlToList(issuelist)
-issueheads <- names(issuelist)
-issues[issueheads] <- 0
-tweets$issue <- ""
-tweets$tags <- ""
-tagexpand <- c("", "s", "n", "en", "er")
-# Parallelisation
-writeLines(c(""), "issuecomp-analysis.log")
-cl<-makeCluster(3)
-registerDoParallel(cl)
-df<-foreach(d = 1:3, .packages = c("stringr"), .combine=rbind) %dopar% {
-#df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
-#for(d in 1:nrow(issues)) {
-# Go through every day
-curdate <- issues$date[d]
-cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
-# Put all tweets from specific day in a temporary DF
-tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
-for(t in 1:25){
-#for(t in 1:nrow(tweets_curday)){
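# --- Note on the superseded runs removed below: they update issues[d,curissue] and
# the tweets columns inside the %dopar% body. Every doParallel worker operates on
# its own copy of those objects, so in-place updates are silently lost on the
# master; presumably that is why later runs comment them out and persist matches
# with write() instead. The idiomatic foreach pattern would be to return a value
# per iteration and let .combine assemble the results, e.g. (hypothetical sketch;
# n_matches is not a variable of this script):
# df <- foreach(d = 1:nrow(issues), .packages = "stringr", .combine = rbind) %dopar% {
#   # ... match all tweets of day d ...
#   data.frame(date = curdate, matches = n_matches)  # returned to the master
# }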
-cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Select tweet's text, make it lowercase and remove hashtag indicators (#) -curtext <- as.character(tweets_curday$text[t]) -curtext <- str_replace_all(curtext, "#", "") -curid <- as.character(tweets_curday$id_str[t]) -# Now test each single issue (not tag!) -for(i in 1:length(issueheads)) { -curissue <- issueheads[i] -curtags <- as.character(issuelist[[curissue]]) -curfile <- str_c(id_folder,"/",curissue,".csv") -# Now test all tags of a single issue -for(s in 1:length(curtags)) { -curtag <- curtags[s] -curchars <- nchar(curtag, type = "chars") -# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch -if(curchars <= 4) { -curacro <- checkAcronym(string = curtag, chars = curchars) -} else { -curacro <- FALSE -} -# Now expand the current tag by possible suffixes that may be plural forms -if(!curacro) { -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -} -# Set Levenshtein distance depending on char length -if(curchars <= 4) { -curdistance <- 0 -} else { -curdistance <- 1 -} -# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) -tags_found <- NULL -# Match the tweet with each variation of tagexpand -for(e in 1:length(curtag)) { -tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) -} -tags_found <- any(tags_found) -curtag <- curtag[1] -if(tags_found == TRUE) { -# Raise number of findings on this day for this issue by 1 -issues[d,curissue] <- issues[d,curissue] + 1 -# Add issue and first matched tag of tweet to tweets-DF -oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] -tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") -oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] -tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") -# Add information to file for function viewPatternMatching -write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) -cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) -break -} -else { -#cat("Nothing found\n") -} -} # /for curtags -} # /for issuelist -} # /for tweets_curday -} # /for drange -#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found) -stopCluster(cl) -View(issues) -rm(data) -df -require(lubridate) -require(XML) -require(ggplot2) -require(reshape2) -require(stringr) -library(foreach) -library(doParallel) -View(issues) -# MATCH TWEETS ------------------------------------------------------------ -id_folder <- "matched-ids" -unlink(id_folder, recursive = TRUE) -dir.create(id_folder) -issues <- data.frame(date = drange) -issuelist <- readLines("issues.xml") -issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") -issuelist <- xmlToList(issuelist) -issueheads <- names(issuelist) -issues[issueheads] <- 0 -tweets$issue <- "" -tweets$tags <- "" -tagexpand <- c("", "s", "n", "en", "er") -# Parallelisation -writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(3) -registerDoParallel(cl) -df<-foreach(d = 1:3, .packages = c("stringr"), .combine=rbind) %dopar% { -#df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { -#for(d in 1:nrow(issues)) { -# Go through every day -curdate <- issues$date[d] -cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", 
append=TRUE) -# Put all tweets from specific day in a temporary DF -tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] -for(t in 1:25){ -#for(t in 1:nrow(tweets_curday)){ -cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Select tweet's text, make it lowercase and remove hashtag indicators (#) -curtext <- as.character(tweets_curday$text[t]) -curtext <- str_replace_all(curtext, "#", "") -curid <- as.character(tweets_curday$id_str[t]) -# Now test each single issue (not tag!) -for(i in 1:length(issueheads)) { -curissue <- issueheads[i] -curtags <- as.character(issuelist[[curissue]]) -curfile <- str_c(id_folder,"/",curissue,".csv") -# Now test all tags of a single issue -for(s in 1:length(curtags)) { -curtag <- curtags[s] -curchars <- nchar(curtag, type = "chars") -# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch -if(curchars <= 4) { -curacro <- checkAcronym(string = curtag, chars = curchars) -} else { -curacro <- FALSE -} -# Now expand the current tag by possible suffixes that may be plural forms -if(!curacro) { -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -} -# Set Levenshtein distance depending on char length -if(curchars <= 4) { -curdistance <- 0 -} else { -curdistance <- 1 -} -# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) -tags_found <- NULL -# Match the tweet with each variation of tagexpand -for(e in 1:length(curtag)) { -tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) -} -tags_found <- any(tags_found) -curtag <- curtag[1] -if(tags_found == TRUE) { -# Raise number of findings on this day for this issue by 1 -issues[d,curissue] <- issues[d,curissue] + 1 -# Add issue and first matched tag of tweet to tweets-DF -oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] -tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") -oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] -tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") -# Add information to file for function viewPatternMatching -write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) -cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) -data.frame(date=curdate, issue=curissue) -break -} -else { -#cat("Nothing found\n") -} -} # /for curtags -} # /for issuelist -} # /for tweets_curday -} # /for drange -#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found) -stopCluster(cl) -df -drange -nrow(tweets[tweets[, "id_str"] == curid, "issue"]) -curid -nrow(tweets[tweets[, "date"] == 2014-01-10, "issue"]) -nrow(tweets[tweets[, "created_at"] == 2014-01-10, "issue"]) -View(tweets) -tweets[tweets[, "created_at"] == 2014-01-10, "issue"] -tweets[tweets[, "created_at"] == 2014-01-10, ] -tweets[tweets[, "created_at"] == "2014-01-10", ] -tweets[tweets[, "created_at"] == "2014-01-10", "user"] -nrow(tweets[tweets[, "created_at"] == "2014-01-10", "user"]) -nrow(tweets[tweets[, "created_at"] == "2014-01-10", "id_str"]) -length(tweets[tweets[, "created_at"] == "2014-01-10", "id_str"]) -writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(3) -registerDoParallel(cl) -df<-foreach(d = 1:length(drange), .packages = c("stringr"), .combine=rbind) %dopar% { -curdate <- as.character(drange[d]) -curissue <- length(tweets[tweets[, 
"created_at"] == curdate, "id_str"]) -data.frame(date=curdate, tweets=curissue) -} -stopCluster(cl) -View(df) -writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(3) -registerDoParallel(cl) -df<-foreach(d = 1:length(drange), .packages = c("stringr"), .combine=rbind) %dopar% { -for(i in 1:5) { -curdate <- as.character(drange[d]) -curissue <- length(tweets[tweets[, "created_at"] == curdate, "id_str"]) -data.frame(date=curdate, tweets=curissue) -} -} -stopCluster(cl) -View(tweets) -curissue -curtag -# MATCH TWEETS ------------------------------------------------------------ -id_folder <- "matched-ids" -unlink(id_folder, recursive = TRUE) -dir.create(id_folder) -issues <- data.frame(date = drange) -issuelist <- readLines("issues.xml") -issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") -issuelist <- xmlToList(issuelist) -issueheads <- names(issuelist) +issueheads <- names(issuelist) issues[issueheads] <- 0 tweets$issue <- "" tweets$tags <- "" tagexpand <- c("", "s", "n", "en", "er", "e") # Parallelisation writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(3) -registerDoParallel(cl) -df<-foreach(d = 1:3, .packages = c("stringr"), .combine=rbind) %dopar% { -#df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { -#for(d in 1:nrow(issues)) { -# Go through every day -curdate <- issues$date[d] -cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Put all tweets from specific day in a temporary DF -tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] -for(t in 1:25){ -#for(t in 1:nrow(tweets_curday)){ -cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Select tweet's text, make it lowercase and remove hashtag indicators (#) -curtext <- as.character(tweets_curday$text[t]) -curtext <- str_replace_all(curtext, "#", "") -curid <- as.character(tweets_curday$id_str[t]) -# Now test each single issue (not tag!) -for(i in 1:length(issueheads)) { -curissue <- issueheads[i] -curtags <- as.character(issuelist[[curissue]]) -curfile <- str_c(id_folder,"/",curissue,".csv") -# Now test all tags of a single issue -for(s in 1:length(curtags)) { -curtag <- curtags[s] -curchars <- nchar(curtag, type = "chars") -# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch -if(curchars <= 4) { -curacro <- checkAcronym(string = curtag, chars = curchars) -} else { -curacro <- FALSE -} -# Now expand the current tag by possible suffixes that may be plural forms -if(!curacro) { -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -} -# Set Levenshtein distance depending on char length -if(curchars <= 4) { -curdistance <- 0 -} else { -curdistance <- 1 -} -# Match current tweet with tag. 
If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) -tags_found <- NULL -# Match the tweet with each variation of tagexpand -for(e in 1:length(curtag)) { -tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) -} -tags_found <- any(tags_found) -curtag <- curtag[1] -if(tags_found == TRUE) { -# # Raise number of findings on this day for this issue by 1 -# issues[d,curissue] <- issues[d,curissue] + 1 -# -# # Add issue and first matched tag of tweet to tweets-DF -# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] -# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") -# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] -# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") -# Add information to file for function viewPatternMatching -write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) -cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) -# data.frame(date=curdate, issue=curissue) -break # next issue, no more tags from same issue -} -else { -#cat("Nothing found\n") -} -} # /for curtags -} # /for issuelist -} # /for tweets_curday -} # /for drange -#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found) -stopCluster(cl) -# MATCH TWEETS ------------------------------------------------------------ -id_folder <- "matched-ids" -unlink(id_folder, recursive = TRUE) -dir.create(id_folder) -issues <- data.frame(date = drange) -issuelist <- readLines("issues.xml") -issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") -issuelist <- xmlToList(issuelist) -issueheads <- names(issuelist) -issues[issueheads] <- 0 -tweets$issue <- "" -tweets$tags <- "" -tagexpand <- c("", "s", "n", "en", "er", "e") -# Parallelisation -writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(3) +cl<-makeCluster(4) registerDoParallel(cl) df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { #for(d in 1:nrow(issues)) { @@ -510,3 +239,274 @@ else { } # /for issuelist } # /for tweets_curday } # /for drange +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- readLines("issues.xml") +issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") +issuelist <- xmlToList(issuelist) +issueheads <- names(issuelist) +issues[issueheads] <- 0 +tweets$issue <- "" +tweets$tags <- "" +tagexpand <- c("", "s", "n", "en", "er", "e") +# Parallelisation +writeLines(c(""), "issuecomp-analysis.log") +cl<-makeCluster(4) +registerDoParallel(cl) +df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { +#for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each 
+for(i in 1:length(issueheads)) {
+curissue <- issueheads[i]
+curtags <- as.character(issuelist[[curissue]])
+curfile <- str_c(id_folder,"/",curissue,".csv")
+# Now test all tags of a single issue
+for(s in 1:length(curtags)) {
+curtag <- curtags[s]
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+if(curchars <= 4) {
+curacro <- checkAcronym(string = curtag, chars = curchars)
+} else {
+curacro <- FALSE
+}
+# Now expand the current tag by possible suffixes that may be plural forms
+if(!curacro) {
+for(e in 1:length(tagexpand)) {
+curtag[e] <- str_c(curtag[1], tagexpand[e])
+}
+}
+# Set Levenshtein distance depending on char length
+if(curchars <= 4) {
+curdistance <- 0
+} else {
+curdistance <- 1
+}
+# Match current tweet with tag: tags of 4 or fewer letters must match exactly, longer tags allow a Levenshtein distance of 1
+tags_found <- NULL
+# Match the tweet with each variation of tagexpand
+for(e in 1:length(curtag)) {
+tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
+}
+tags_found <- any(tags_found)
+curtag <- curtag[1]
+if(tags_found == TRUE) {
+# # Raise number of findings on this day for this issue by 1
+# issues[d,curissue] <- issues[d,curissue] + 1
+#
+# # Add issue and first matched tag of tweet to tweets-DF
+# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
+# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
+# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
+# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
+# Add information to file for function viewPatternMatching
+write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE)
+cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE)
+# data.frame(date=curdate, issue=curissue)
+break # next issue, no more tags from same issue
+}
+else {
+#cat("Nothing found\n")
+}
+} # /for curtags
+} # /for issuelist
+} # /for tweets_curday
+} # /for drange
+df
+# MATCH TWEETS ------------------------------------------------------------
+id_folder <- "matched-ids"
+unlink(id_folder, recursive = TRUE)
+dir.create(id_folder)
+issues <- data.frame(date = drange)
+issuelist <- readLines("issues.xml")
+issuelist <- str_replace_all(string = issuelist, pattern = ".*", "")
+issuelist <- xmlToList(issuelist)
+issueheads <- names(issuelist)
+issues[issueheads] <- 0
+tweets$issue <- ""
+tweets$tags <- ""
+tagexpand <- c("", "s", "n", "en", "er", "e")
+# Parallelisation
+writeLines(c(""), "issuecomp-analysis.log")
+cl<-makeCluster(4)
+registerDoParallel(cl)
+foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
+#for(d in 1:nrow(issues)) {
+# Go through every day
+curdate <- issues$date[d]
+cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
+# Put all tweets from specific day in a temporary DF
+tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
+for(t in 1:nrow(tweets_curday)){
+# cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
+# Select tweet's text and remove hashtag indicators (#)
+curtext <- as.character(tweets_curday$text[t])
+curtext <- str_replace_all(curtext, "#", "")
+curid <- as.character(tweets_curday$id_str[t])
+# Now test each single issue (not tag!)
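# --- Two remarks on the block below. First, the suffix expansion: curtag starts
# as a length-1 vector and, because tagexpand[1] is "", the base form survives in
# curtag[1] while plural-like variants are appended; e.g. a (hypothetical) tag
# "Steuer" expands to c("Steuer", "Steuers", "Steuern", "Steueren", "Steuerer",
# "Steuere"), each variant is matched separately, and any() collapses the results.
# Second, the foreach() above is no longer assigned to df: its body ends in a
# for-loop, which evaluates to NULL, so every iteration returned NULL and
# .combine=rbind had nothing meaningful to bind.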
+for(i in 1:length(issueheads)) {
+curissue <- issueheads[i]
+curtags <- as.character(issuelist[[curissue]])
+curfile <- str_c(id_folder,"/",curissue,".csv")
+# Now test all tags of a single issue
+for(s in 1:length(curtags)) {
+curtag <- curtags[s]
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+if(curchars <= 4) {
+curacro <- checkAcronym(string = curtag, chars = curchars)
+} else {
+curacro <- FALSE
+}
+# Now expand the current tag by possible suffixes that may be plural forms
+if(!curacro) {
+for(e in 1:length(tagexpand)) {
+curtag[e] <- str_c(curtag[1], tagexpand[e])
+}
+}
+# Set Levenshtein distance depending on char length
+if(curchars <= 4) {
+curdistance <- 0
+} else {
+curdistance <- 1
+}
+# Match current tweet with tag: tags of 4 or fewer letters must match exactly, longer tags allow a Levenshtein distance of 1
+tags_found <- NULL
+# Match the tweet with each variation of tagexpand
+for(e in 1:length(curtag)) {
+tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
+}
+tags_found <- any(tags_found)
+curtag <- curtag[1]
+if(tags_found == TRUE) {
+# # Raise number of findings on this day for this issue by 1
+# issues[d,curissue] <- issues[d,curissue] + 1
+#
+# # Add issue and first matched tag of tweet to tweets-DF
+# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
+# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
+# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
+# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
+# Add information to file for function viewPatternMatching
+write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE)
+cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE)
+# data.frame(date=curdate, issue=curissue)
+break # next issue, no more tags from same issue
+}
+else {
+#cat("Nothing found\n")
+}
+} # /for curtags
+} # /for issuelist
+} # /for tweets_curday
+} # /for drange
+#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found)
+stopCluster(cl)
+require(lubridate)
+require(XML)
+require(ggplot2)
+require(reshape2)
+require(stringr)
+library(foreach)
+library(doParallel)
+# MATCH TWEETS ------------------------------------------------------------
+id_folder <- "matched-ids"
+unlink(id_folder, recursive = TRUE)
+dir.create(id_folder)
+issues <- data.frame(date = drange)
+issuelist <- readLines("issues.xml")
+issuelist <- str_replace_all(string = issuelist, pattern = ".*", "")
+issuelist <- xmlToList(issuelist)
+issueheads <- names(issuelist)
+issues[issueheads] <- 0
+tweets$issue <- ""
+tweets$tags <- ""
+tagexpand <- c("", "s", "n", "en", "er", "e")
+# Parallelisation
+writeLines(c(""), "issuecomp-analysis.log")
+cl<-makeCluster(4)
+registerDoParallel(cl)
+foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
+#for(d in 1:nrow(issues)) {
+# Go through every day
+curdate <- issues$date[d]
+cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
+# Put all tweets from specific day in a temporary DF
+tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
+for(t in 1:nrow(tweets_curday)){
+# cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
+# Select tweet's text and remove hashtag indicators (#)
+curtext <- as.character(tweets_curday$text[t])
+curtext <- str_replace_all(curtext, "#", "")
+curid <- as.character(tweets_curday$id_str[t])
+# Now test each single issue (not tag!)
+for(i in 1:length(issueheads)) {
+curissue <- issueheads[i]
+curtags <- as.character(issuelist[[curissue]])
+curfile <- str_c(id_folder,"/",curissue,".csv")
+# Now test all tags of a single issue
+for(s in 1:length(curtags)) {
+curtag <- curtags[s]
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+if(curchars <= 4) {
+curacro <- checkAcronym(string = curtag, chars = curchars)
+} else {
+curacro <- FALSE
+}
+# Now expand the current tag by possible suffixes that may be plural forms
+if(!curacro) {
+for(e in 1:length(tagexpand)) {
+curtag[e] <- str_c(curtag[1], tagexpand[e])
+}
+}
+# Set Levenshtein distance depending on char length
+if(curchars <= 4) {
+curdistance <- 0
+} else {
+curdistance <- 1
+}
+# Match current tweet with tag: tags of 4 or fewer letters must match exactly, longer tags allow a Levenshtein distance of 1
+tags_found <- NULL
+# Match the tweet with each variation of tagexpand
+for(e in 1:length(curtag)) {
+tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
+}
+tags_found <- any(tags_found)
+curtag <- curtag[1]
+if(tags_found == TRUE) {
+# # Raise number of findings on this day for this issue by 1
+# issues[d,curissue] <- issues[d,curissue] + 1
+#
+# # Add issue and first matched tag of tweet to tweets-DF
+# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
+# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
+# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
+# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
+# Add information to file for function viewPatternMatching
+write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE)
+cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE)
+# data.frame(date=curdate, issue=curissue)
+break # next issue, no more tags from same issue
+}
+else {
+#cat("Nothing found\n")
+}
+} # /for curtags
+} # /for issuelist
+} # /for tweets_curday
+} # /for drange
diff --git a/issuecomp-analysis.R b/issuecomp-analysis.R
index 822589e..5409a98 100644
--- a/issuecomp-analysis.R
+++ b/issuecomp-analysis.R
@@ -40,7 +40,7 @@ writeLines(c(""), "issuecomp-analysis.log")
cl<-makeCluster(4)
registerDoParallel(cl)
-df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
+foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
#for(d in 1:nrow(issues)) {
# Go through every day
curdate <- issues$date[d]
@@ -50,7 +50,7 @@ df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar
tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
for(t in 1:nrow(tweets_curday)){
- cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
+# cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
# Select tweet's text and remove hashtag indicators (#)
curtext <- as.character(tweets_curday$text[t])
curtext <- str_replace_all(curtext, "#", "")
@@ -110,7 +110,7 @@ df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar
# Add information to file for function viewPatternMatching
write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE)
- cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE)
+# cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE)
# data.frame(date=curdate, issue=curissue)
break # next issue, no more tags from same issue
}
diff --git a/issuecomp-analysis.log b/issuecomp-analysis.log
index ce909e3..21fb9db 100644
--- a/issuecomp-analysis.log
+++ b/issuecomp-analysis.log
@@ -1,76 +1,27 @@
-2014-01-02
2014-01-01
+2014-01-02
2014-01-03
-Starting tweet 1 of 2014-01-03
-Starting tweet 1 of 2014-01-01
-Starting tweet 1 of 2014-01-02
-Starting tweet 2 of 2014-01-01
-Starting tweet 2 of 2014-01-02
-Starting tweet 2 of 2014-01-03
-Starting tweet 3 of 2014-01-01
-Match!
-Starting tweet 3 of 2014-01-02
-Starting tweet 3 of 2014-01-03
-Starting tweet 4 of 2014-01-02
-Starting tweet 4 of 2014-01-01
-Starting tweet 4 of 2014-01-03
+2014-01-04
+Match!
+Match!
+Match!
+Match!
+Match!
+Match!
Match!
Match!
Match!
-Starting tweet 5 of 2014-01-02
-Starting tweet 5 of 2014-01-01
-Starting tweet 5 of 2014-01-03
-Starting tweet 6 of 2014-01-02
-Starting tweet 6 of 2014-01-01
-Starting tweet 6 of 2014-01-03
Match!
Match!
-Starting tweet 7 of 2014-01-02
-Starting tweet 7 of 2014-01-01
-Starting tweet 7 of 2014-01-03
-Starting tweet 8 of 2014-01-01
-Starting tweet 8 of 2014-01-02
-Starting tweet 8 of 2014-01-03
Match!
-Starting tweet 9 of 2014-01-01
-Starting tweet 9 of 2014-01-02
-Starting tweet 9 of 2014-01-03
-Starting tweet 10 of 2014-01-01
-Starting tweet 10 of 2014-01-02
-Starting tweet 10 of 2014-01-03
-Starting tweet 11 of 2014-01-01
-Starting tweet 11 of 2014-01-02
Match!
-Starting tweet 11 of 2014-01-03
-Starting tweet 12 of 2014-01-01
-Starting tweet 12 of 2014-01-02
-Starting tweet 12 of 2014-01-03
-Starting tweet 13 of 2014-01-01
Match!
-Starting tweet 13 of 2014-01-02
-Starting tweet 13 of 2014-01-03
-Starting tweet 14 of 2014-01-01
-Starting tweet 14 of 2014-01-02
-Starting tweet 14 of 2014-01-03
-Starting tweet 15 of 2014-01-01
Match!
-Starting tweet 15 of 2014-01-02
Match!
-Starting tweet 15 of 2014-01-03
-Starting tweet 16 of 2014-01-01
-Starting tweet 16 of 2014-01-02
Match!
-Starting tweet 16 of 2014-01-03
-Starting tweet 17 of 2014-01-01
Match!
-Starting tweet 17 of 2014-01-02
-Starting tweet 17 of 2014-01-03
Match!
-Starting tweet 18 of 2014-01-01
Match!
-Starting tweet 18 of 2014-01-03
Match!
-Starting tweet 18 of 2014-01-02
Match!
-Starting tweet 19 of 2014-01-01