diff --git a/.Rhistory b/.Rhistory index ed88fc5..e0e8af3 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,88 +1,3 @@ -issueheads <- names(issuelist) -issues[issueheads] <- 0 -tweets$issue <- "" -tweets$tags <- "" -tagexpand <- c("", "s", "n", "en", "er", "e") -# Parallelisation -writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(4) -registerDoParallel(cl) -foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { -#for(d in 1:nrow(issues)) { -# Go through every day -curdate <- issues$date[d] -cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Put all tweets from specific day in a temporary DF -tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] -for(t in 1:nrow(tweets_curday)){ -# cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Select tweet's text, make it lowercase and remove hashtag indicators (#) -curtext <- as.character(tweets_curday$text[t]) -curtext <- str_replace_all(curtext, "#", "") -curid <- as.character(tweets_curday$id_str[t]) -# Now test each single issue (not tag!) -for(i in 1:length(issueheads)) { -curissue <- issueheads[i] -curtags <- as.character(issuelist[[curissue]]) -curfile <- str_c(id_folder,"/",curissue,".csv") -# Now test all tags of a single issue -for(s in 1:length(curtags)) { -curtag <- curtags[s] -curchars <- nchar(curtag, type = "chars") -# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch -if(curchars <= 4) { -curacro <- checkAcronym(string = curtag, chars = curchars) -} else { -curacro <- FALSE -} -# Now expand the current tag by possible suffixes that may be plural forms -if(!curacro) { -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -} -# Set Levenshtein distance depending on char length -if(curchars <= 4) { -curdistance <- 0 -} else { -curdistance <- 1 -} -# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) -tags_found <- NULL -# Match the tweet with each variation of tagexpand -for(e in 1:length(curtag)) { -tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) -} -tags_found <- any(tags_found) -curtag <- curtag[1] -if(tags_found == TRUE) { -# # Raise number of findings on this day for this issue by 1 -# issues[d,curissue] <- issues[d,curissue] + 1 -# -# # Add issue and first matched tag of tweet to tweets-DF -# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] -# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") -# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] -# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") -# Add information to file for function viewPatternMatching -write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) -cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) -# data.frame(date=curdate, issue=curissue) -break # next issue, no more tags from same issue -} -else { -#cat("Nothing found\n") -} -} # /for curtags -} # /for issuelist -} # /for tweets_curday -} # /for drange -#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found) -stopCluster(cl) -require(lubridate) -require(XML) -require(ggplot2) -require(reshape2) require(stringr) library(foreach) library(doParallel) @@ -510,3 +425,88 @@ else { } # /for issuelist } # /for tweets_curday } # /for drange +require(lubridate) +require(XML) +require(ggplot2) +require(reshape2) +require(stringr) +library(foreach) +library(doParallel) +cl<-makeCluster(3) +registerDoParallel(cl) +foreach(d = 101:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { +#for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issueheads)) { +curissue <- issueheads[i] +curtags <- as.character(issuelist[[curissue]]) +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Now expand the current tag by possible suffixes that may be plural forms +if(!curacro) { +for(e in 1:length(tagexpand)) { +curtag[e] <- str_c(curtag[1], tagexpand[e]) +} +} +# Set Levenshtein distance depending on char length +if(curchars <= 4) { +curdistance <- 0 +} else { +curdistance <- 1 +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) +tags_found <- NULL +# Match the tweet with each variation of tagexpand +for(e in 1:length(curtag)) { +tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) +} +tags_found <- any(tags_found) +curtag <- curtag[1] +if(tags_found == TRUE) { +# # Raise number of findings on this day for this issue by 1 +# issues[d,curissue] <- issues[d,curissue] + 1 +# +# # Add issue and first matched tag of tweet to tweets-DF +# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] +# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") +# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] +# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") +# Add information to file for function viewPatternMatching +write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) +# cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) +# data.frame(date=curdate, issue=curissue) +break # next issue, no more tags from same issue +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +stopCluster(cl) +drange[200] +drange[300] +drange[280] +drange[270] +drange[259] diff --git a/issuecomp-analysis.R b/issuecomp-analysis.R index 2f9743a..5f655c1 100644 --- a/issuecomp-analysis.R +++ b/issuecomp-analysis.R @@ -37,10 +37,10 @@ tagexpand <- c("", "s", "n", "en", "er", "e") # Parallelisation writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(3) +cl<-makeCluster(4) registerDoParallel(cl) -foreach(d = 101:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { +foreach(d = 260:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { #for(d in 1:nrow(issues)) { # Go through every day curdate <- issues$date[d] diff --git a/issuecomp-analysis.log b/issuecomp-analysis.log index 0467926..e196e60 100644 --- a/issuecomp-analysis.log +++ b/issuecomp-analysis.log @@ -108,3 +108,271 @@ 2014-04-13 2014-04-14 2014-04-15 +2014-04-11 +2014-04-12 +2014-04-13 +2014-04-14 +2014-04-15 +2014-04-16 +2014-04-17 +2014-04-18 +2014-04-19 +2014-04-20 +2014-04-21 +2014-04-22 +2014-04-23 +2014-04-24 +2014-04-25 +2014-04-26 +2014-04-27 +2014-04-28 +2014-04-29 +2014-04-30 +2014-05-01 +2014-05-02 +2014-05-03 +2014-05-04 +2014-05-05 +2014-05-06 +2014-05-07 +2014-05-08 +2014-05-09 +2014-05-10 +2014-05-11 +2014-05-12 +2014-05-13 +2014-05-14 +2014-05-15 +2014-05-16 +2014-05-17 +2014-05-18 +2014-05-19 +2014-05-20 +2014-05-21 +2014-05-22 +2014-05-23 +2014-05-24 +2014-05-25 +2014-05-26 +2014-05-27 +2014-05-28 +2014-05-29 +2014-05-30 +2014-05-31 +2014-06-01 +2014-06-02 +2014-06-03 +2014-06-04 +2014-06-05 +2014-06-06 +2014-06-07 +2014-06-08 +2014-06-09 +2014-06-10 +2014-06-11 +2014-06-12 +2014-06-13 +2014-06-14 +2014-06-15 +2014-06-16 +2014-06-17 +2014-06-18 +2014-06-19 +2014-06-20 +2014-06-21 +2014-06-22 +2014-06-23 +2014-06-24 +2014-06-25 +2014-06-26 +2014-06-27 +2014-06-28 +2014-06-29 +2014-06-30 +2014-07-01 +2014-07-02 +2014-07-03 +2014-07-04 +2014-07-05 +2014-07-06 +2014-07-07 +2014-07-08 +2014-07-09 +2014-07-10 +2014-07-11 +2014-07-12 +2014-07-13 +2014-07-14 +2014-07-15 +2014-07-16 +2014-07-17 +2014-07-18 +2014-07-19 +2014-07-20 +2014-07-21 +2014-07-22 +2014-07-23 +2014-07-24 +2014-07-25 +2014-07-26 +2014-07-27 +2014-07-28 +2014-07-29 +2014-07-30 +2014-07-31 +2014-08-01 +2014-08-02 +2014-08-03 +2014-08-04 +2014-08-05 +2014-08-06 +2014-08-07 +2014-08-08 +2014-08-09 +2014-08-10 +2014-08-11 +2014-08-12 +2014-08-13 +2014-08-14 +2014-08-15 +2014-08-16 +2014-08-17 +2014-08-18 +2014-08-19 +2014-08-20 +2014-08-21 +2014-08-22 +2014-08-23 +2014-08-24 +2014-08-25 +2014-08-26 +2014-08-27 +2014-08-28 +2014-08-29 +2014-08-30 +2014-08-31 +2014-09-01 +2014-09-02 +2014-09-03 +2014-09-04 +2014-09-05 +2014-09-06 +2014-09-07 +2014-09-08 +2014-09-09 +2014-09-10 +2014-09-11 +2014-09-12 +2014-09-13 +2014-09-14 +2014-09-15 +2014-09-16 +2014-09-17 +2014-09-18 +2014-09-19 +2014-09-17 +2014-09-18 +2014-09-19 +2014-09-20 +2014-09-21 +2014-09-22 +2014-09-23 +2014-09-24 +2014-09-25 +2014-09-26 +2014-09-27 +2014-09-28 +2014-09-29 +2014-09-30 +2014-10-01 +2014-10-02 +2014-10-03 +2014-10-04 +2014-10-05 +2014-10-06 +2014-10-07 +2014-10-08 +2014-10-09 +2014-10-10 +2014-10-11 +2014-10-12 +2014-10-13 +2014-10-14 +2014-10-15 +2014-10-16 +2014-10-17 +2014-10-18 +2014-10-19 +2014-10-20 +2014-10-21 +2014-10-22 +2014-10-23 +2014-10-24 +2014-10-25 +2014-10-26 +2014-10-27 +2014-10-28 +2014-10-29 +2014-10-30 +2014-10-31 +2014-11-01 +2014-11-02 +2014-11-03 +2014-11-04 +2014-11-05 +2014-11-06 +2014-11-07 +2014-11-08 +2014-11-09 +2014-11-10 +2014-11-11 +2014-11-12 +2014-11-13 +2014-11-14 +2014-11-15 +2014-11-16 +2014-11-17 +2014-11-18 +2014-11-19 +2014-11-20 +2014-11-21 +2014-11-22 +2014-11-23 +2014-11-24 +2014-11-25 +2014-11-26 +2014-11-27 +2014-11-28 +2014-11-29 +2014-11-30 +2014-12-01 +2014-12-02 +2014-12-03 +2014-12-04 +2014-12-05 +2014-12-06 +2014-12-07 +2014-12-08 +2014-12-09 +2014-12-10 +2014-12-11 +2014-12-12 +2014-12-13 +2014-12-14 +2014-12-15 +2014-12-16 +2014-12-17 +2014-12-18 +2014-12-19 +2014-12-20 +2014-12-21 +2014-12-22 +2014-12-23 +2014-12-24 +2014-12-25 +2014-12-26 +2014-12-27 +2014-12-28 +2014-12-29 +2014-12-30 +2014-12-31 diff --git a/matched-ids.tar b/matched-ids.tar index 24dae10..bcf465a 100644 Binary files a/matched-ids.tar and b/matched-ids.tar differ