diff --git a/.Rhistory b/.Rhistory index 3664d88..ed88fc5 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,340 +1,3 @@ -for(s in 1:length(curtags)) { -curtag <- curtags[s] -curchars <- nchar(curtag, type = "chars") -# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch -if(curchars <= 4) { -curacro <- checkAcronym(string = curtag, chars = curchars) -} else { -curacro <- FALSE -} -# Now expand the current tag by possible suffixes that may be plural forms -if(!curacro) { -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -} -# Set Levenshtein distance depending on char length -if(curchars <= 4) { -curdistance <- 0 -} else { -curdistance <- 1 -} -# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) -tags_found <- NULL -# Match the tweet with each variation of tagexpand -for(e in 1:length(curtag)) { -tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) -} -tags_found <- any(tags_found) -curtag <- curtag[1] -if(tags_found == TRUE) { -# # Raise number of findings on this day for this issue by 1 -# issues[d,curissue] <- issues[d,curissue] + 1 -# -# # Add issue and first matched tag of tweet to tweets-DF -# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] -# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") -# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] -# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") -# Add information to file for function viewPatternMatching -write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) -cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) -# data.frame(date=curdate, issue=curissue) -break # next issue, no more tags from same issue -} -else { -#cat("Nothing found\n") -} -} # /for curtags -} # /for issuelist -} # /for tweets_curday -} # /for drange -# MATCH TWEETS ------------------------------------------------------------ -id_folder <- "matched-ids" -unlink(id_folder, recursive = TRUE) -dir.create(id_folder) -issues <- data.frame(date = drange) -issuelist <- readLines("issues.xml") -issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") -issuelist <- xmlToList(issuelist) -issueheads <- names(issuelist) -issues[issueheads] <- 0 -tweets$issue <- "" -tweets$tags <- "" -tagexpand <- c("", "s", "n", "en", "er", "e") -# Parallelisation -writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(4) -registerDoParallel(cl) -df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { -#for(d in 1:nrow(issues)) { -# Go through every day -curdate <- issues$date[d] -cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Put all tweets from specific day in a temporary DF -tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] -for(t in 1:nrow(tweets_curday)){ -cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Select tweet's text, make it lowercase and remove hashtag indicators (#) -curtext <- as.character(tweets_curday$text[t]) -curtext <- str_replace_all(curtext, "#", "") -curid <- as.character(tweets_curday$id_str[t]) -# Now test each single issue (not tag!) -for(i in 1:length(issueheads)) { -curissue <- issueheads[i] -curtags <- as.character(issuelist[[curissue]]) -curfile <- str_c(id_folder,"/",curissue,".csv") -# Now test all tags of a single issue -for(s in 1:length(curtags)) { -curtag <- curtags[s] -curchars <- nchar(curtag, type = "chars") -# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch -if(curchars <= 4) { -curacro <- checkAcronym(string = curtag, chars = curchars) -} else { -curacro <- FALSE -} -# Now expand the current tag by possible suffixes that may be plural forms -if(!curacro) { -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -} -# Set Levenshtein distance depending on char length -if(curchars <= 4) { -curdistance <- 0 -} else { -curdistance <- 1 -} -# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) -tags_found <- NULL -# Match the tweet with each variation of tagexpand -for(e in 1:length(curtag)) { -tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) -} -tags_found <- any(tags_found) -curtag <- curtag[1] -if(tags_found == TRUE) { -# # Raise number of findings on this day for this issue by 1 -# issues[d,curissue] <- issues[d,curissue] + 1 -# -# # Add issue and first matched tag of tweet to tweets-DF -# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] -# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") -# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] -# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") -# Add information to file for function viewPatternMatching -write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) -cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) -# data.frame(date=curdate, issue=curissue) -break # next issue, no more tags from same issue -} -else { -#cat("Nothing found\n") -} -} # /for curtags -} # /for issuelist -} # /for tweets_curday -} # /for drange -#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found) -stopCluster(cl) -require(lubridate) -require(XML) -require(ggplot2) -require(reshape2) -require(stringr) -library(foreach) -library(doParallel) -source("issuecomp-functions.R") -load(file = "tweets_untagged.RData") -# Create date range -date_start <- as.Date("2014-01-01") -date_end <- as.Date("2014-12-31") -drange <- as.integer(date_end - date_start) -drange <- date_start + days(0:drange) -# MATCH TWEETS ------------------------------------------------------------ -id_folder <- "matched-ids" -unlink(id_folder, recursive = TRUE) -dir.create(id_folder) -issues <- data.frame(date = drange) -issuelist <- readLines("issues.xml") -issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") -issuelist <- xmlToList(issuelist) -issueheads <- names(issuelist) -issues[issueheads] <- 0 -tweets$issue <- "" -tweets$tags <- "" -tagexpand <- c("", "s", "n", "en", "er", "e") -# Parallelisation -writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(4) -registerDoParallel(cl) -df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { -#for(d in 1:nrow(issues)) { -# Go through every day -curdate <- issues$date[d] -cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Put all tweets from specific day in a temporary DF -tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] -for(t in 1:nrow(tweets_curday)){ -cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Select tweet's text, make it lowercase and remove hashtag indicators (#) -curtext <- as.character(tweets_curday$text[t]) -curtext <- str_replace_all(curtext, "#", "") -curid <- as.character(tweets_curday$id_str[t]) -# Now test each single issue (not tag!) -for(i in 1:length(issueheads)) { -curissue <- issueheads[i] -curtags <- as.character(issuelist[[curissue]]) -curfile <- str_c(id_folder,"/",curissue,".csv") -# Now test all tags of a single issue -for(s in 1:length(curtags)) { -curtag <- curtags[s] -curchars <- nchar(curtag, type = "chars") -# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch -if(curchars <= 4) { -curacro <- checkAcronym(string = curtag, chars = curchars) -} else { -curacro <- FALSE -} -# Now expand the current tag by possible suffixes that may be plural forms -if(!curacro) { -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -} -# Set Levenshtein distance depending on char length -if(curchars <= 4) { -curdistance <- 0 -} else { -curdistance <- 1 -} -# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) -tags_found <- NULL -# Match the tweet with each variation of tagexpand -for(e in 1:length(curtag)) { -tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) -} -tags_found <- any(tags_found) -curtag <- curtag[1] -if(tags_found == TRUE) { -# # Raise number of findings on this day for this issue by 1 -# issues[d,curissue] <- issues[d,curissue] + 1 -# -# # Add issue and first matched tag of tweet to tweets-DF -# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] -# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") -# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] -# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") -# Add information to file for function viewPatternMatching -write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) -cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) -# data.frame(date=curdate, issue=curissue) -break # next issue, no more tags from same issue -} -else { -#cat("Nothing found\n") -} -} # /for curtags -} # /for issuelist -} # /for tweets_curday -} # /for drange -# MATCH TWEETS ------------------------------------------------------------ -id_folder <- "matched-ids" -unlink(id_folder, recursive = TRUE) -dir.create(id_folder) -issues <- data.frame(date = drange) -issuelist <- readLines("issues.xml") -issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") -issuelist <- xmlToList(issuelist) -issueheads <- names(issuelist) -issues[issueheads] <- 0 -tweets$issue <- "" -tweets$tags <- "" -tagexpand <- c("", "s", "n", "en", "er", "e") -# Parallelisation -writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(4) -registerDoParallel(cl) -df<-foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { -#for(d in 1:nrow(issues)) { -# Go through every day -curdate <- issues$date[d] -cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Put all tweets from specific day in a temporary DF -tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] -for(t in 1:nrow(tweets_curday)){ -cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) -# Select tweet's text, make it lowercase and remove hashtag indicators (#) -curtext <- as.character(tweets_curday$text[t]) -curtext <- str_replace_all(curtext, "#", "") -curid <- as.character(tweets_curday$id_str[t]) -# Now test each single issue (not tag!) -for(i in 1:length(issueheads)) { -curissue <- issueheads[i] -curtags <- as.character(issuelist[[curissue]]) -curfile <- str_c(id_folder,"/",curissue,".csv") -# Now test all tags of a single issue -for(s in 1:length(curtags)) { -curtag <- curtags[s] -curchars <- nchar(curtag, type = "chars") -# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch -if(curchars <= 4) { -curacro <- checkAcronym(string = curtag, chars = curchars) -} else { -curacro <- FALSE -} -# Now expand the current tag by possible suffixes that may be plural forms -if(!curacro) { -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -} -# Set Levenshtein distance depending on char length -if(curchars <= 4) { -curdistance <- 0 -} else { -curdistance <- 1 -} -# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) -tags_found <- NULL -# Match the tweet with each variation of tagexpand -for(e in 1:length(curtag)) { -tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) -} -tags_found <- any(tags_found) -curtag <- curtag[1] -if(tags_found == TRUE) { -# # Raise number of findings on this day for this issue by 1 -# issues[d,curissue] <- issues[d,curissue] + 1 -# -# # Add issue and first matched tag of tweet to tweets-DF -# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] -# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") -# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] -# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") -# Add information to file for function viewPatternMatching -write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) -cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) -# data.frame(date=curdate, issue=curissue) -break # next issue, no more tags from same issue -} -else { -#cat("Nothing found\n") -} -} # /for curtags -} # /for issuelist -} # /for tweets_curday -} # /for drange -df -# MATCH TWEETS ------------------------------------------------------------ -id_folder <- "matched-ids" -unlink(id_folder, recursive = TRUE) -dir.create(id_folder) -issues <- data.frame(date = drange) -issuelist <- readLines("issues.xml") -issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") -issuelist <- xmlToList(issuelist) issueheads <- names(issuelist) issues[issueheads] <- 0 tweets$issue <- "" @@ -510,3 +173,340 @@ else { } # /for issuelist } # /for tweets_curday } # /for drange +require(lubridate) +require(XML) +require(ggplot2) +require(reshape2) +require(stringr) +library(foreach) +library(doParallel) +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- readLines("issues.xml") +issuelist <- str_replace_all(string = issuelist, pattern = ".*", "") +issuelist <- xmlToList(issuelist) +issueheads <- names(issuelist) +issues[issueheads] <- 0 +tweets$issue <- "" +tweets$tags <- "" +tagexpand <- c("", "s", "n", "en", "er", "e") +# Parallelisation +writeLines(c(""), "issuecomp-analysis.log") +cl<-makeCluster(4) +registerDoParallel(cl) +foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { +#for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issueheads)) { +curissue <- issueheads[i] +curtags <- as.character(issuelist[[curissue]]) +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Now expand the current tag by possible suffixes that may be plural forms +if(!curacro) { +for(e in 1:length(tagexpand)) { +curtag[e] <- str_c(curtag[1], tagexpand[e]) +} +} +# Set Levenshtein distance depending on char length +if(curchars <= 4) { +curdistance <- 0 +} else { +curdistance <- 1 +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) +tags_found <- NULL +# Match the tweet with each variation of tagexpand +for(e in 1:length(curtag)) { +tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) +} +tags_found <- any(tags_found) +curtag <- curtag[1] +if(tags_found == TRUE) { +# # Raise number of findings on this day for this issue by 1 +# issues[d,curissue] <- issues[d,curissue] + 1 +# +# # Add issue and first matched tag of tweet to tweets-DF +# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] +# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") +# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] +# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") +# Add information to file for function viewPatternMatching +write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) +# cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) +# data.frame(date=curdate, issue=curissue) +break # next issue, no more tags from same issue +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +stopCluster(cl) +drange +drange[40] +drange[50] +View(issues) +require(lubridate) +require(XML) +require(ggplot2) +require(reshape2) +require(stringr) +library(foreach) +library(doParallel) +drange[70] +drange[80] +drange[90] +cl<-makeCluster(4) +registerDoParallel(cl) +foreach(d = 51:90, .packages = c("stringr"), .combine=rbind) %dopar% { +#for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issueheads)) { +curissue <- issueheads[i] +curtags <- as.character(issuelist[[curissue]]) +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Now expand the current tag by possible suffixes that may be plural forms +if(!curacro) { +for(e in 1:length(tagexpand)) { +curtag[e] <- str_c(curtag[1], tagexpand[e]) +} +} +# Set Levenshtein distance depending on char length +if(curchars <= 4) { +curdistance <- 0 +} else { +curdistance <- 1 +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) +tags_found <- NULL +# Match the tweet with each variation of tagexpand +for(e in 1:length(curtag)) { +tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) +} +tags_found <- any(tags_found) +curtag <- curtag[1] +if(tags_found == TRUE) { +# # Raise number of findings on this day for this issue by 1 +# issues[d,curissue] <- issues[d,curissue] + 1 +# +# # Add issue and first matched tag of tweet to tweets-DF +# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] +# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") +# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] +# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") +# Add information to file for function viewPatternMatching +write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) +# cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) +# data.frame(date=curdate, issue=curissue) +break # next issue, no more tags from same issue +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found) +stopCluster(cl) +drange[121] +cl<-makeCluster(4) +registerDoParallel(cl) +foreach(d = 91:120, .packages = c("stringr"), .combine=rbind) %dopar% { +#for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issueheads)) { +curissue <- issueheads[i] +curtags <- as.character(issuelist[[curissue]]) +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Now expand the current tag by possible suffixes that may be plural forms +if(!curacro) { +for(e in 1:length(tagexpand)) { +curtag[e] <- str_c(curtag[1], tagexpand[e]) +} +} +# Set Levenshtein distance depending on char length +if(curchars <= 4) { +curdistance <- 0 +} else { +curdistance <- 1 +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) +tags_found <- NULL +# Match the tweet with each variation of tagexpand +for(e in 1:length(curtag)) { +tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) +} +tags_found <- any(tags_found) +curtag <- curtag[1] +if(tags_found == TRUE) { +# # Raise number of findings on this day for this issue by 1 +# issues[d,curissue] <- issues[d,curissue] + 1 +# +# # Add issue and first matched tag of tweet to tweets-DF +# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] +# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") +# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] +# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") +# Add information to file for function viewPatternMatching +write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) +# cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) +# data.frame(date=curdate, issue=curissue) +break # next issue, no more tags from same issue +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +stopCluster(cl) +drange[102] +require(lubridate) +require(XML) +require(ggplot2) +require(reshape2) +require(stringr) +library(foreach) +library(doParallel) +cl<-makeCluster(4) +registerDoParallel(cl) +foreach(d = 101:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { +#for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issueheads)) { +curissue <- issueheads[i] +curtags <- as.character(issuelist[[curissue]]) +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Now expand the current tag by possible suffixes that may be plural forms +if(!curacro) { +for(e in 1:length(tagexpand)) { +curtag[e] <- str_c(curtag[1], tagexpand[e]) +} +} +# Set Levenshtein distance depending on char length +if(curchars <= 4) { +curdistance <- 0 +} else { +curdistance <- 1 +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) +tags_found <- NULL +# Match the tweet with each variation of tagexpand +for(e in 1:length(curtag)) { +tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) +} +tags_found <- any(tags_found) +curtag <- curtag[1] +if(tags_found == TRUE) { +# # Raise number of findings on this day for this issue by 1 +# issues[d,curissue] <- issues[d,curissue] + 1 +# +# # Add issue and first matched tag of tweet to tweets-DF +# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] +# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") +# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] +# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") +# Add information to file for function viewPatternMatching +write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) +# cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) +# data.frame(date=curdate, issue=curissue) +break # next issue, no more tags from same issue +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange diff --git a/issuecomp-analysis.R b/issuecomp-analysis.R index 5409a98..2f9743a 100644 --- a/issuecomp-analysis.R +++ b/issuecomp-analysis.R @@ -21,8 +21,8 @@ drange <- date_start + days(0:drange) # MATCH TWEETS ------------------------------------------------------------ id_folder <- "matched-ids" -unlink(id_folder, recursive = TRUE) -dir.create(id_folder) +#unlink(id_folder, recursive = TRUE) +#dir.create(id_folder) issues <- data.frame(date = drange) issuelist <- readLines("issues.xml") @@ -37,10 +37,10 @@ tagexpand <- c("", "s", "n", "en", "er", "e") # Parallelisation writeLines(c(""), "issuecomp-analysis.log") -cl<-makeCluster(4) +cl<-makeCluster(3) registerDoParallel(cl) -foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { +foreach(d = 101:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { #for(d in 1:nrow(issues)) { # Go through every day curdate <- issues$date[d] diff --git a/issuecomp-analysis.log b/issuecomp-analysis.log index 21fb9db..0467926 100644 --- a/issuecomp-analysis.log +++ b/issuecomp-analysis.log @@ -3,25 +3,108 @@ 2014-01-02 2014-01-03 2014-01-04 -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! -Match! +2014-01-05 +2014-01-06 +2014-01-07 +2014-01-08 +2014-01-09 +2014-01-10 +2014-01-11 +2014-01-12 +2014-01-13 +2014-01-14 +2014-01-15 +2014-01-16 +2014-01-17 +2014-01-18 +2014-01-19 +2014-01-20 +2014-01-21 +2014-01-22 +2014-01-23 +2014-01-24 +2014-01-25 +2014-01-26 +2014-01-27 +2014-01-28 +2014-01-29 +2014-01-30 +2014-01-31 +2014-02-01 +2014-02-02 +2014-02-03 +2014-02-04 +2014-02-05 +2014-02-06 +2014-02-07 +2014-02-08 +2014-02-09 +2014-02-10 +2014-02-11 +2014-02-12 +2014-02-13 +2014-02-14 +2014-02-15 +2014-02-16 +2014-02-17 +2014-02-18 +2014-02-19 +2014-02-20 +2014-02-21 +2014-02-22 +2014-02-23 +2014-02-20 +2014-02-21 +2014-02-22 +2014-02-23 +2014-02-24 +2014-02-25 +2014-02-26 +2014-02-27 +2014-02-28 +2014-03-01 +2014-03-02 +2014-03-03 +2014-03-04 +2014-03-05 +2014-03-06 +2014-03-07 +2014-03-08 +2014-03-09 +2014-03-10 +2014-03-11 +2014-03-12 +2014-03-13 +2014-03-14 +2014-03-15 +2014-03-16 +2014-03-17 +2014-03-18 +2014-03-19 +2014-03-20 +2014-03-21 +2014-03-22 +2014-03-23 +2014-03-24 +2014-03-25 +2014-03-26 +2014-03-27 +2014-03-28 +2014-03-29 +2014-03-30 +2014-03-31 +2014-04-01 +2014-04-02 +2014-04-03 +2014-04-04 +2014-04-05 +2014-04-06 +2014-04-07 +2014-04-08 +2014-04-09 +2014-04-10 +2014-04-11 +2014-04-12 +2014-04-13 +2014-04-14 +2014-04-15 diff --git a/matched-ids.tar b/matched-ids.tar new file mode 100644 index 0000000..24dae10 Binary files /dev/null and b/matched-ids.tar differ