diff --git a/.Rhistory b/.Rhistory index d55074d..e8b2772 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,317 +1,3 @@ -curacro <- checkAcronym(string = curtag, chars = curchars) -} else { -curacro <- FALSE -} -# Now expand the current tag by possible suffixes that may be plural forms -if(!curacro) { -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -} -# Set Levenshtein distance depending on char length -if(curchars <= 4) { -curdistance <- 0 -} else { -curdistance <- 1 -} -# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) -tags_found <- NULL -# Match the tweet with each variation of tagexpand -for(e in 1:length(curtag)) { -tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) -} -tags_found <- any(tags_found) -curtag <- curtag[1] -if(tags_found == TRUE) { -# Raise number of findings on this day for this issue by 1 -issues[d,curissue] <- issues[d,curissue] + 1 -# Add issue and first matched tag of tweet to tweets-DF -oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] -tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") -oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] -tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") -# Add information to file for function viewPatternMatching -write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) -cat("Match!\n") -break -} -else { -#cat("Nothing found\n") -} -} # /for curtags -} # /for issuelist -} # /for tweets_curday -} # /for drange -stopCluster(cl) -View(issues) -cl -df -View(data) -stopCluster(cl) -# MATCH TWEETS ------------------------------------------------------------ -id_folder <- "matched-ids" -unlink(id_folder, recursive = TRUE) -dir.create(id_folder) -issues <- data.frame(date = drange) -issuelist <- readLines("issues.xml") -issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") -issuelist <- xmlToList(issuelist) -issueheads <- names(issuelist) -issues[issueheads] <- 0 -tweets$issue <- "" -tweets$tags <- "" -tagexpand <- c("", "s", "n", "en", "er") -# Parallelisation -writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(3) -registerDoParallel(cl) -df<-foreach(d = 1:nrow(issues), .packages = c("stringr")) %dopar% { -#for(d in 1:nrow(issues)) { -# Go through every day -curdate <- issues$date[d] -cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Put all tweets from specific day in a temporary DF -tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] -for(t in 1:nrow(tweets_curday)){ -cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Select tweet's text, make it lowercase and remove hashtag indicators (#) -curtext <- as.character(tweets_curday$text[t]) -curtext <- str_replace_all(curtext, "#", "") -curid <- as.character(tweets_curday$id_str[t]) -# Now test each single issue (not tag!) -for(i in 1:length(issueheads)) { -curissue <- issueheads[i] -curtags <- as.character(issuelist[[curissue]]) -curfile <- str_c(id_folder,"/",curissue,".csv") -# Now test all tags of a single issue -for(s in 1:length(curtags)) { -curtag <- curtags[s] -curchars <- nchar(curtag, type = "chars") -# Check if tag is an acronym. 
If so, ignore.case will be deactivated in smartPatternMatch -if(curchars <= 4) { -curacro <- checkAcronym(string = curtag, chars = curchars) -} else { -curacro <- FALSE -} -# Now expand the current tag by possible suffixes that may be plural forms -if(!curacro) { -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -} -# Set Levenshtein distance depending on char length -if(curchars <= 4) { -curdistance <- 0 -} else { -curdistance <- 1 -} -# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) -tags_found <- NULL -# Match the tweet with each variation of tagexpand -for(e in 1:length(curtag)) { -tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) -} -tags_found <- any(tags_found) -curtag <- curtag[1] -if(tags_found == TRUE) { -# Raise number of findings on this day for this issue by 1 -issues[d,curissue] <- issues[d,curissue] + 1 -# Add issue and first matched tag of tweet to tweets-DF -oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] -tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") -oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] -tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") -# Add information to file for function viewPatternMatching -write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) -cat("Match!\n") -break -} -else { -#cat("Nothing found\n") -} -} # /for curtags -} # /for issuelist -} # /for tweets_curday -} # /for drange -#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found) -stopCluster(cl) -require(lubridate) -require(XML) -require(ggplot2) -require(reshape2) -require(stringr) -library(foreach) -library(doParallel) -# MATCH TWEETS ------------------------------------------------------------ -id_folder <- "matched-ids" -unlink(id_folder, recursive = TRUE) -dir.create(id_folder) -issues <- data.frame(date = drange) -issuelist <- readLines("issues.xml") -issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") -issuelist <- xmlToList(issuelist) -issueheads <- names(issuelist) -issues[issueheads] <- 0 -tweets$issue <- "" -tweets$tags <- "" -tagexpand <- c("", "s", "n", "en", "er") -# Parallelisation -writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(3) -registerDoParallel(cl) -df<-foreach(d = 1:nrow(issues), .packages = c("stringr")) %dopar% { -#for(d in 1:nrow(issues)) { -# Go through every day -curdate <- issues$date[d] -cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Put all tweets from specific day in a temporary DF -tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] -for(t in 1:nrow(tweets_curday)){ -cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Select tweet's text, make it lowercase and remove hashtag indicators (#) -curtext <- as.character(tweets_curday$text[t]) -curtext <- str_replace_all(curtext, "#", "") -curid <- as.character(tweets_curday$id_str[t]) -# Now test each single issue (not tag!) -for(i in 1:length(issueheads)) { -curissue <- issueheads[i] -curtags <- as.character(issuelist[[curissue]]) -curfile <- str_c(id_folder,"/",curissue,".csv") -# Now test all tags of a single issue -for(s in 1:length(curtags)) { -curtag <- curtags[s] -curchars <- nchar(curtag, type = "chars") -# Check if tag is an acronym. 
If so, ignore.case will be deactivated in smartPatternMatch -if(curchars <= 4) { -curacro <- checkAcronym(string = curtag, chars = curchars) -} else { -curacro <- FALSE -} -# Now expand the current tag by possible suffixes that may be plural forms -if(!curacro) { -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -} -# Set Levenshtein distance depending on char length -if(curchars <= 4) { -curdistance <- 0 -} else { -curdistance <- 1 -} -# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) -tags_found <- NULL -# Match the tweet with each variation of tagexpand -for(e in 1:length(curtag)) { -tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) -} -tags_found <- any(tags_found) -curtag <- curtag[1] -if(tags_found == TRUE) { -# Raise number of findings on this day for this issue by 1 -issues[d,curissue] <- issues[d,curissue] + 1 -# Add issue and first matched tag of tweet to tweets-DF -oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] -tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") -oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] -tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") -# Add information to file for function viewPatternMatching -write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) -cat("Match!\n") -break -} -else { -#cat("Nothing found\n") -} -} # /for curtags -} # /for issuelist -} # /for tweets_curday -} # /for drange -stopCluster(cl) -cl<-makeCluster(3) -registerDoParallel(cl) -stopCluster(cl) -# MATCH TWEETS ------------------------------------------------------------ -id_folder <- "matched-ids" -unlink(id_folder, recursive = TRUE) -dir.create(id_folder) -issues <- data.frame(date = drange) -issuelist <- readLines("issues.xml") -issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") -issuelist <- xmlToList(issuelist) -issueheads <- names(issuelist) -issues[issueheads] <- 0 -tweets$issue <- "" -tweets$tags <- "" -tagexpand <- c("", "s", "n", "en", "er") -# Parallelisation -writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(3) -registerDoParallel(cl) -df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { -#for(d in 1:nrow(issues)) { -# Go through every day -curdate <- issues$date[d] -cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Put all tweets from specific day in a temporary DF -tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] -for(t in 1:nrow(tweets_curday)){ -cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Select tweet's text, make it lowercase and remove hashtag indicators (#) -curtext <- as.character(tweets_curday$text[t]) -curtext <- str_replace_all(curtext, "#", "") -curid <- as.character(tweets_curday$id_str[t]) -# Now test each single issue (not tag!) -for(i in 1:length(issueheads)) { -curissue <- issueheads[i] -curtags <- as.character(issuelist[[curissue]]) -curfile <- str_c(id_folder,"/",curissue,".csv") -# Now test all tags of a single issue -for(s in 1:length(curtags)) { -curtag <- curtags[s] -curchars <- nchar(curtag, type = "chars") -# Check if tag is an acronym. 
If so, ignore.case will be deactivated in smartPatternMatch -if(curchars <= 4) { -curacro <- checkAcronym(string = curtag, chars = curchars) -} else { -curacro <- FALSE -} -# Now expand the current tag by possible suffixes that may be plural forms -if(!curacro) { -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -} -# Set Levenshtein distance depending on char length -if(curchars <= 4) { -curdistance <- 0 -} else { -curdistance <- 1 -} -# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) -tags_found <- NULL -# Match the tweet with each variation of tagexpand -for(e in 1:length(curtag)) { -tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) -} -tags_found <- any(tags_found) -curtag <- curtag[1] -if(tags_found == TRUE) { -# Raise number of findings on this day for this issue by 1 -issues[d,curissue] <- issues[d,curissue] + 1 -# Add issue and first matched tag of tweet to tweets-DF -oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] -tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") -oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] -tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") -# Add information to file for function viewPatternMatching -write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) -cat("Match!\n") -break -} -else { -#cat("Nothing found\n") -} } # /for curtags } # /for issuelist } # /for tweets_curday @@ -510,3 +196,317 @@ stopCluster(cl) View(issues) rm(data) df +require(lubridate) +require(XML) +require(ggplot2) +require(reshape2) +require(stringr) +library(foreach) +library(doParallel) +View(issues) +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- readLines("issues.xml") +issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") +issuelist <- xmlToList(issuelist) +issueheads <- names(issuelist) +issues[issueheads] <- 0 +tweets$issue <- "" +tweets$tags <- "" +tagexpand <- c("", "s", "n", "en", "er") +# Parallelisation +writeLines(c(""), "issuecomp-analysis.log") +cl<-makeCluster(3) +registerDoParallel(cl) +df<-foreach(d = 1:3, .packages = c("stringr"), .combine=rbind) %dopar% { +#df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { +#for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:25){ +#for(t in 1:nrow(tweets_curday)){ +cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issueheads)) { +curissue <- issueheads[i] +curtags <- as.character(issuelist[[curissue]]) +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. 
If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Now expand the current tag by possible suffixes that may be plural forms +if(!curacro) { +for(e in 1:length(tagexpand)) { +curtag[e] <- str_c(curtag[1], tagexpand[e]) +} +} +# Set Levenshtein distance depending on char length +if(curchars <= 4) { +curdistance <- 0 +} else { +curdistance <- 1 +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) +tags_found <- NULL +# Match the tweet with each variation of tagexpand +for(e in 1:length(curtag)) { +tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) +} +tags_found <- any(tags_found) +curtag <- curtag[1] +if(tags_found == TRUE) { +# Raise number of findings on this day for this issue by 1 +issues[d,curissue] <- issues[d,curissue] + 1 +# Add issue and first matched tag of tweet to tweets-DF +oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] +tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") +oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] +tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") +# Add information to file for function viewPatternMatching +write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) +cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) +data.frame(date=curdate, issue=curissue) +break +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found) +stopCluster(cl) +df +drange +nrow(tweets[tweets[, "id_str"] == curid, "issue"]) +curid +nrow(tweets[tweets[, "date"] == 2014-01-10, "issue"]) +nrow(tweets[tweets[, "created_at"] == 2014-01-10, "issue"]) +View(tweets) +tweets[tweets[, "created_at"] == 2014-01-10, "issue"] +tweets[tweets[, "created_at"] == 2014-01-10, ] +tweets[tweets[, "created_at"] == "2014-01-10", ] +tweets[tweets[, "created_at"] == "2014-01-10", "user"] +nrow(tweets[tweets[, "created_at"] == "2014-01-10", "user"]) +nrow(tweets[tweets[, "created_at"] == "2014-01-10", "id_str"]) +length(tweets[tweets[, "created_at"] == "2014-01-10", "id_str"]) +writeLines(c(""), "issuecomp-analysis.log") +cl<-makeCluster(3) +registerDoParallel(cl) +df<-foreach(d = 1:length(drange), .packages = c("stringr"), .combine=rbind) %dopar% { +curdate <- as.character(drange[d]) +curissue <- length(tweets[tweets[, "created_at"] == curdate, "id_str"]) +data.frame(date=curdate, tweets=curissue) +} +stopCluster(cl) +View(df) +writeLines(c(""), "issuecomp-analysis.log") +cl<-makeCluster(3) +registerDoParallel(cl) +df<-foreach(d = 1:length(drange), .packages = c("stringr"), .combine=rbind) %dopar% { +for(i in 1:5) { +curdate <- as.character(drange[d]) +curissue <- length(tweets[tweets[, "created_at"] == curdate, "id_str"]) +data.frame(date=curdate, tweets=curissue) +} +} +stopCluster(cl) +View(tweets) +curissue +curtag +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- readLines("issues.xml") +issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") +issuelist <- xmlToList(issuelist) +issueheads <- 
names(issuelist) +issues[issueheads] <- 0 +tweets$issue <- "" +tweets$tags <- "" +tagexpand <- c("", "s", "n", "en", "er", "e") +# Parallelisation +writeLines(c(""), "issuecomp-analysis.log") +cl<-makeCluster(3) +registerDoParallel(cl) +df<-foreach(d = 1:3, .packages = c("stringr"), .combine=rbind) %dopar% { +#df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { +#for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:25){ +#for(t in 1:nrow(tweets_curday)){ +cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issueheads)) { +curissue <- issueheads[i] +curtags <- as.character(issuelist[[curissue]]) +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Now expand the current tag by possible suffixes that may be plural forms +if(!curacro) { +for(e in 1:length(tagexpand)) { +curtag[e] <- str_c(curtag[1], tagexpand[e]) +} +} +# Set Levenshtein distance depending on char length +if(curchars <= 4) { +curdistance <- 0 +} else { +curdistance <- 1 +} +# Match current tweet with tag. 
If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) +tags_found <- NULL +# Match the tweet with each variation of tagexpand +for(e in 1:length(curtag)) { +tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) +} +tags_found <- any(tags_found) +curtag <- curtag[1] +if(tags_found == TRUE) { +# # Raise number of findings on this day for this issue by 1 +# issues[d,curissue] <- issues[d,curissue] + 1 +# +# # Add issue and first matched tag of tweet to tweets-DF +# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] +# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") +# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] +# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") +# Add information to file for function viewPatternMatching +write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) +cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) +# data.frame(date=curdate, issue=curissue) +break # next issue, no more tags from same issue +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found) +stopCluster(cl) +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- readLines("issues.xml") +issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") +issuelist <- xmlToList(issuelist) +issueheads <- names(issuelist) +issues[issueheads] <- 0 +tweets$issue <- "" +tweets$tags <- "" +tagexpand <- c("", "s", "n", "en", "er", "e") +# Parallelisation +writeLines(c(""), "issuecomp-analysis.log") +cl<-makeCluster(3) +registerDoParallel(cl) +df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { +#for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issueheads)) { +curissue <- issueheads[i] +curtags <- as.character(issuelist[[curissue]]) +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. 
If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Now expand the current tag by possible suffixes that may be plural forms +if(!curacro) { +for(e in 1:length(tagexpand)) { +curtag[e] <- str_c(curtag[1], tagexpand[e]) +} +} +# Set Levenshtein distance depending on char length +if(curchars <= 4) { +curdistance <- 0 +} else { +curdistance <- 1 +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) +tags_found <- NULL +# Match the tweet with each variation of tagexpand +for(e in 1:length(curtag)) { +tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) +} +tags_found <- any(tags_found) +curtag <- curtag[1] +if(tags_found == TRUE) { +# # Raise number of findings on this day for this issue by 1 +# issues[d,curissue] <- issues[d,curissue] + 1 +# +# # Add issue and first matched tag of tweet to tweets-DF +# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] +# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") +# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] +# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") +# Add information to file for function viewPatternMatching +write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) +cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) +# data.frame(date=curdate, issue=curissue) +break # next issue, no more tags from same issue +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange diff --git a/.gitignore b/.gitignore index 7238fcd..1e59548 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ tweets_untagged.csv tweets_untagged.RData .RData matched-ids +issuecomp-analysis.log diff --git a/issuecomp-analysis.R b/issuecomp-analysis.R index c33f18e..822589e 100644 --- a/issuecomp-analysis.R +++ b/issuecomp-analysis.R @@ -33,15 +33,14 @@ issues[issueheads] <- 0 tweets$issue <- "" tweets$tags <- "" -tagexpand <- c("", "s", "n", "en", "er") +tagexpand <- c("", "s", "n", "en", "er", "e") # Parallelisation writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(3) +cl<-makeCluster(4) registerDoParallel(cl) -df<-foreach(d = 1:3, .packages = c("stringr"), .combine=rbind) %dopar% { -#df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { +df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { #for(d in 1:nrow(issues)) { # Go through every day curdate <- issues$date[d] @@ -50,8 +49,7 @@ df<-foreach(d = 1:3, .packages = c("stringr"), .combine=rbind) %dopar% { # Put all tweets from specific day in a temporary DF tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] - for(t in 1:25){ - #for(t in 1:nrow(tweets_curday)){ + for(t in 1:nrow(tweets_curday)){ cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) # Select tweet's text, make it lowercase and remove hashtag indicators (#) curtext <- as.character(tweets_curday$text[t]) @@ -101,25 +99,25 @@ df<-foreach(d = 1:3, .packages = c("stringr"), .combine=rbind) %dopar% { curtag <- curtag[1] if(tags_found == TRUE) { - # Raise number of findings on this day for this issue by 1 - issues[d,curissue] <- issues[d,curissue] + 1 - - # Add issue and first matched tag of tweet to tweets-DF - oldissue <- tweets[tweets[, 
"id_str"] == curid, "issue"] - tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") - oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] - tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") +# # Raise number of findings on this day for this issue by 1 +# issues[d,curissue] <- issues[d,curissue] + 1 +# +# # Add issue and first matched tag of tweet to tweets-DF +# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] +# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") +# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] +# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") # Add information to file for function viewPatternMatching - write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) + write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) - break +# data.frame(date=curdate, issue=curissue) + break # next issue, no more tags from same issue } else { #cat("Nothing found\n") } } # /for curtags - } # /for issuelist } # /for tweets_curday } # /for drange diff --git a/issuecomp-analysis.log b/issuecomp-analysis.log index c073cf9..ce909e3 100644 --- a/issuecomp-analysis.log +++ b/issuecomp-analysis.log @@ -1,103 +1,76 @@ -2014-01-01 2014-01-02 +2014-01-01 2014-01-03 Starting tweet 1 of 2014-01-03 Starting tweet 1 of 2014-01-01 Starting tweet 1 of 2014-01-02 Starting tweet 2 of 2014-01-01 -Starting tweet 2 of 2014-01-03 Starting tweet 2 of 2014-01-02 -Starting tweet 3 of 2014-01-03 +Starting tweet 2 of 2014-01-03 Starting tweet 3 of 2014-01-01 Match! Starting tweet 3 of 2014-01-02 +Starting tweet 3 of 2014-01-03 +Starting tweet 4 of 2014-01-02 +Starting tweet 4 of 2014-01-01 Starting tweet 4 of 2014-01-03 Match! -Starting tweet 4 of 2014-01-01 Match! -Starting tweet 4 of 2014-01-02 -Starting tweet 5 of 2014-01-03 -Starting tweet 5 of 2014-01-01 -Starting tweet 6 of 2014-01-03 Match! Starting tweet 5 of 2014-01-02 -Match! -Starting tweet 6 of 2014-01-01 +Starting tweet 5 of 2014-01-01 +Starting tweet 5 of 2014-01-03 Starting tweet 6 of 2014-01-02 -Starting tweet 7 of 2014-01-03 -Starting tweet 7 of 2014-01-01 +Starting tweet 6 of 2014-01-01 +Starting tweet 6 of 2014-01-03 +Match! Match! Starting tweet 7 of 2014-01-02 -Starting tweet 8 of 2014-01-03 +Starting tweet 7 of 2014-01-01 +Starting tweet 7 of 2014-01-03 Starting tweet 8 of 2014-01-01 -Match! Starting tweet 8 of 2014-01-02 -Starting tweet 9 of 2014-01-03 +Starting tweet 8 of 2014-01-03 +Match! Starting tweet 9 of 2014-01-01 Starting tweet 9 of 2014-01-02 -Starting tweet 10 of 2014-01-03 +Starting tweet 9 of 2014-01-03 Starting tweet 10 of 2014-01-01 Starting tweet 10 of 2014-01-02 -Match! -Starting tweet 11 of 2014-01-03 +Starting tweet 10 of 2014-01-03 Starting tweet 11 of 2014-01-01 Starting tweet 11 of 2014-01-02 -Starting tweet 12 of 2014-01-03 +Match! +Starting tweet 11 of 2014-01-03 Starting tweet 12 of 2014-01-01 Starting tweet 12 of 2014-01-02 -Starting tweet 13 of 2014-01-03 -Match! +Starting tweet 12 of 2014-01-03 Starting tweet 13 of 2014-01-01 +Match! Starting tweet 13 of 2014-01-02 -Starting tweet 14 of 2014-01-03 +Starting tweet 13 of 2014-01-03 Starting tweet 14 of 2014-01-01 Starting tweet 14 of 2014-01-02 -Match! -Starting tweet 15 of 2014-01-03 +Starting tweet 14 of 2014-01-03 Starting tweet 15 of 2014-01-01 Match! Starting tweet 15 of 2014-01-02 Match! 
-Starting tweet 16 of 2014-01-03 +Starting tweet 15 of 2014-01-03 Starting tweet 16 of 2014-01-01 Starting tweet 16 of 2014-01-02 +Match! +Starting tweet 16 of 2014-01-03 +Starting tweet 17 of 2014-01-01 +Match! +Starting tweet 17 of 2014-01-02 Starting tweet 17 of 2014-01-03 Match! -Starting tweet 17 of 2014-01-01 -Starting tweet 17 of 2014-01-02 +Starting tweet 18 of 2014-01-01 Match! Starting tweet 18 of 2014-01-03 Match! -Match! Starting tweet 18 of 2014-01-02 Match! -Starting tweet 18 of 2014-01-01 -Starting tweet 19 of 2014-01-03 -Match! -Starting tweet 19 of 2014-01-02 Starting tweet 19 of 2014-01-01 -Match! -Starting tweet 20 of 2014-01-03 -Starting tweet 20 of 2014-01-02 -Match! -Starting tweet 20 of 2014-01-01 -Match! -Starting tweet 21 of 2014-01-03 -Starting tweet 21 of 2014-01-02 -Match! -Starting tweet 21 of 2014-01-01 -Starting tweet 22 of 2014-01-03 -Starting tweet 22 of 2014-01-02 -Starting tweet 22 of 2014-01-01 -Starting tweet 23 of 2014-01-03 -Starting tweet 23 of 2014-01-02 -Starting tweet 23 of 2014-01-01 -Starting tweet 24 of 2014-01-03 -Starting tweet 24 of 2014-01-02 -Starting tweet 24 of 2014-01-01 -Match! -Starting tweet 25 of 2014-01-03 -Starting tweet 25 of 2014-01-02 -Starting tweet 25 of 2014-01-01 -Match!
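# Note on the matching step changed above: the repeated comment "If >= 5 letters allow 1 changed letter (Levenshtein distance)" is easier to follow in isolation. The sketch below is a hedged approximation only -- smartPatternMatch() and checkAcronym() are not part of this diff, so agrepl() from base R stands in for them, and match_tag()/tag_hits() are hypothetical helper names. It assumes: short tags (<= 4 chars) are tested for being acronyms and matched exactly and case-sensitively; longer tags are expanded with the suffix variants from tagexpand and matched with one allowed edit.
#
# library() calls: none needed beyond base R (agrepl, vapply, paste0).
#
# match_tag: stand-in for smartPatternMatch; agrepl() does approximate substring matching.
# match_tag <- function(text, tag, max_dist, is_acronym) {
#   agrepl(tag, text,
#          max.distance = max_dist,
#          ignore.case  = !is_acronym,  # acronyms keep their case, as in the script's comment
#          fixed        = TRUE)
# }
#
# tagexpand <- c("", "s", "n", "en", "er", "e")   # suffix variants, as set in the script
#
# tag_hits <- function(text, tag) {
#   is_acronym <- nchar(tag) <= 4 && tag == toupper(tag)   # rough stand-in for checkAcronym()
#   max_dist   <- if (nchar(tag) <= 4) 0 else 1            # short tags: no edits allowed
#   variants   <- if (is_acronym) tag else paste0(tag, tagexpand)
#   any(vapply(variants, match_tag, logical(1),
#              text = text, max_dist = max_dist, is_acronym = is_acronym))
# }
#
# # Illustrative call (hypothetical data): a one-letter typo still matches a long tag.
# tag_hits("Die Energiwende kommt", "Energiewende")   # TRUE with Levenshtein distance 1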