diff --git a/.Rhistory b/.Rhistory index d6f3a60..f2fe296 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,512 +1,512 @@ +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE } -## END ERROR HANDLING ## -# Delete unnecessary columns and add username and real name to dataframe -tweets_temp <- tweets_temp[keep] -tweets_temp <- cbind(user=user, name=name, tweets_temp) -# Now sleep 3 second to dodge 300queries/15min limit -cat("[",a,"/",nrow(acc_df),"] ", sep = "") -cat("User: ",user," in loop: ",loop,". \n", sep = "") -Sys.sleep(2) -if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) { -cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance) +tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro) +if(tags_found == 1) { +#cat("Matched", curtag, "with", curtext,"\n") +issues[d,curissue] <- issues[d,curissue] + 1 +write(str_c(curdate,";\"",curid,"\""), curfile, append = TRUE) break } -## Last loop is reached. Now clear the data frame -# # Is the last tweet in tweets_temp from 2013? -# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$") -# Extract year of last tweet in tweets_temp -year <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$")) -status <- year < 2014 -if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...) -# # Delete all tweets other than from 2014 -# old <- 0 -# for(r in 1:nrow(tweets_temp)) { -# status <- str_detect(tweets_temp$created_at[r], "2014$") -# if(is.na(status)) { -# #status <- FALSE -# cat("[INFO] NA-Status in Tweet", r) -# } -# if(!status) { # Starting when tweet not from 2014 -# old <- old + 1 -# } -# } -# if(old > 0) { -# old <- old - 1 -# -# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug -# status <- str_detect(tweets_temp$created_at[1], "2014$") -# if(!status) { -# old <- nrow(tweets_temp) -# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") -# } -# -# # delete all lines which are older than 2014 -# tweets_temp <- head(tweets_temp, -old) -# } -# rm(old) -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -break # End loop because 2013 is reached -} -# The last tweet is newer or equal 2014, so we need another loop else { -# Setting max_id to gather next 200 tweets -max_id <- tweets_temp$id_str[nrow(tweets_temp)] -loop <- loop + 1 # just for stats -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) +#cat("Nothing found\n") } -} # /repeat -tweets_complete <- insertRow(tweets_complete, tweets_full) -tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full -cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n") -write.csv(tweets_complete, "tweets_complete.csv") -# Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop -} -api_url <- "https://api.twitter.com/1.1/statuses/user_timeline.json"; -max_count <- "200" -keep <- c("created_at", "id_str", "text", "retweet_count") -# tweets_complete: All tweets -# tweets_full: All tweets of current user -# tweets_temp: The current max 200 tweets of current user -tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character()) -tweets_complete <- tweets_full -for(a in 1:nrow(acc_df)) { -user <- as.character(acc_df$twitter_acc[a]) -name <- as.character(acc_df$name[a]) -max_id <- "999999999999999999" -loop <- 1 -error <- 0 -repeat { -# Define specific search query -query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false", -screen_name=user, -count=max_count, -max_id=max_id); -# At first, work with an temporary tweet-DB -current <- twitter_api_call(api_url, query, api_params) -rm(tweets_temp) -tweets_temp <- fromJSON(correctJSON(current)) -## START ERROR HANDLING ## -# Empty API output -status <- errorEmptyAPI(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Contains "error" column -status <- errorErrorColumn(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Check if error code exists -code <- errorCheckCode(tweets_temp) # 0 if no error -if(code == 34) { # page does not exist -status <- errorCode34() -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -} -if(code == 88) { # rate limit exceeded -wait <- errorCode88() -Sys.sleep(wait) -next -} -## END ERROR HANDLING ## -# Delete unnecessary columns and add username and real name to dataframe -tweets_temp <- tweets_temp[keep] -tweets_temp <- cbind(user=user, name=name, tweets_temp) -# Now sleep 3 second to dodge 300queries/15min limit -cat("[",a,"/",nrow(acc_df),"] ", sep = "") -cat("User: ",user," in loop: ",loop,". \n", sep = "") -Sys.sleep(2) -if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) { -cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") -break -} -## Last loop is reached. Now clear the data frame -# # Is the last tweet in tweets_temp from 2013? -# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$") -# Extract year of last tweet in tweets_temp -year <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$")) -status <- year < 2014 -if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...) -# # Delete all tweets other than from 2014 -# old <- 0 -# for(r in 1:nrow(tweets_temp)) { -# status <- str_detect(tweets_temp$created_at[r], "2014$") -# if(is.na(status)) { -# #status <- FALSE -# cat("[INFO] NA-Status in Tweet", r) -# } -# if(!status) { # Starting when tweet not from 2014 -# old <- old + 1 -# } -# } -# if(old > 0) { -# old <- old - 1 -# -# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug -# status <- str_detect(tweets_temp$created_at[1], "2014$") -# if(!status) { -# old <- nrow(tweets_temp) -# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") -# } -# -# # delete all lines which are older than 2014 -# tweets_temp <- head(tweets_temp, -old) -# } -# rm(old) -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -break # End loop because 2013 is reached -} -# The last tweet is newer or equal 2014, so we need another loop -else { -# Setting max_id to gather next 200 tweets -max_id <- tweets_temp$id_str[nrow(tweets_temp)] -loop <- loop + 1 # just for stats -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -} -} # /repeat -tweets_complete <- insertRow(tweets_complete, tweets_full) -tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full -cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n") -write.csv(tweets_complete, "tweets_complete.csv") -# Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop -} -tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character()) -tweets_complete <- tweets_full -for(a in 1:nrow(acc_df)) { -user <- as.character(acc_df$twitter_acc[a]) -name <- as.character(acc_df$name[a]) -max_id <- "999999999999999999" -loop <- 1 -error <- 0 -repeat { -# Define specific search query -query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false", -screen_name=user, -count=max_count, -max_id=max_id); -# At first, work with an temporary tweet-DB -current <- twitter_api_call(api_url, query, api_params) -rm(tweets_temp) -tweets_temp <- fromJSON(correctJSON(current)) -## START ERROR HANDLING ## -# Empty API output -status <- errorEmptyAPI(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Contains "error" column -status <- errorErrorColumn(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Check if error code exists -code <- errorCheckCode(tweets_temp) # 0 if no error -if(code == 34) { # page does not exist -status <- errorCode34() -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -} -if(code == 88) { # rate limit exceeded -wait <- errorCode88() -Sys.sleep(wait) -next -} -## END ERROR HANDLING ## -# Delete unnecessary columns and add username and real name to dataframe -tweets_temp <- tweets_temp[keep] -tweets_temp <- cbind(user=user, name=name, tweets_temp) -# Now sleep 3 second to dodge 300queries/15min limit -cat("[",a,"/",nrow(acc_df),"] ", sep = "") -cat("User: ",user," in loop: ",loop,". \n", sep = "") -Sys.sleep(2) -if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) { -cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") -break -} -## Last loop is reached. Now clear the data frame -# # Is the last tweet in tweets_temp from 2013? -# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$") -# Extract year of last tweet in tweets_temp -year_last <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$")) -status <- year_last < 2014 -if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...) -year_first <- as.numeric(str_extract(tweets_temp$created_at[1], "\\d{4}$")) -status <- year_first < 2014 -cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") -} -# # Delete all tweets other than from 2014 -# old <- 0 -# for(r in 1:nrow(tweets_temp)) { -# status <- str_detect(tweets_temp$created_at[r], "2014$") -# if(is.na(status)) { -# #status <- FALSE -# cat("[INFO] NA-Status in Tweet", r) -# } -# if(!status) { # Starting when tweet not from 2014 -# old <- old + 1 -# } -# } -# if(old > 0) { -# old <- old - 1 -# -# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug -# status <- str_detect(tweets_temp$created_at[1], "2014$") -# if(!status) { -# old <- nrow(tweets_temp) -# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") -# } -# -# # delete all lines which are older than 2014 -# tweets_temp <- head(tweets_temp, -old) -# } -# rm(old) -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -break # End loop because 2013 is reached -} -# The last tweet is newer or equal 2014, so we need another loop -else { -# Setting max_id to gather next 200 tweets -max_id <- tweets_temp$id_str[nrow(tweets_temp)] -loop <- loop + 1 # just for stats -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -} -} # /repeat -tweets_complete <- insertRow(tweets_complete, tweets_full) -tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full -cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n") -write.csv(tweets_complete, "tweets_complete.csv") -# Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop -} -tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character()) -tweets_complete <- tweets_full -for(a in 1:nrow(acc_df)) { -user <- as.character(acc_df$twitter_acc[a]) -name <- as.character(acc_df$name[a]) -max_id <- "999999999999999999" -loop <- 1 -error <- 0 -repeat { -# Define specific search query -query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false", -screen_name=user, -count=max_count, -max_id=max_id); -# At first, work with an temporary tweet-DB -current <- twitter_api_call(api_url, query, api_params) -rm(tweets_temp) -tweets_temp <- fromJSON(correctJSON(current)) -## START ERROR HANDLING ## -# Empty API output -status <- errorEmptyAPI(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Contains "error" column -status <- errorErrorColumn(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Check if error code exists -code <- errorCheckCode(tweets_temp) # 0 if no error -if(code == 34) { # page does not exist -status <- errorCode34() -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -} -if(code == 88) { # rate limit exceeded -wait <- errorCode88() -Sys.sleep(wait) -next -} -## END ERROR HANDLING ## -# Delete unnecessary columns and add username and real name to dataframe -tweets_temp <- tweets_temp[keep] -tweets_temp <- cbind(user=user, name=name, tweets_temp) -# Now sleep 3 second to dodge 300queries/15min limit -cat("[",a,"/",nrow(acc_df),"] ", sep = "") -cat("User: ",user," in loop: ",loop,". \n", sep = "") -Sys.sleep(2) -if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) { -cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") -break -} -## Last loop is reached. Now clear the data frame -# # Is the last tweet in tweets_temp from 2013? -# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$") -# Extract year of last tweet in tweets_temp -year_last <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$")) -status <- year_last < 2014 -if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...) -# Is even the first tweet older than 2014? -year_first <- as.numeric(str_extract(tweets_temp$created_at[1], "\\d{4}$")) -status <- year_first < 2014 -if(status) { -cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") -} -# # Delete all tweets other than from 2014 -# old <- 0 -# for(r in 1:nrow(tweets_temp)) { -# status <- str_detect(tweets_temp$created_at[r], "2014$") -# if(is.na(status)) { -# #status <- FALSE -# cat("[INFO] NA-Status in Tweet", r) -# } -# if(!status) { # Starting when tweet not from 2014 -# old <- old + 1 -# } -# } -# if(old > 0) { -# old <- old - 1 -# -# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug -# status <- str_detect(tweets_temp$created_at[1], "2014$") -# if(!status) { -# old <- nrow(tweets_temp) -# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") -# } -# -# # delete all lines which are older than 2014 -# tweets_temp <- head(tweets_temp, -old) -# } -# rm(old) -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -break # End loop because 2013 is reached -} -# The last tweet is newer or equal 2014, so we need another loop -else { -# Setting max_id to gather next 200 tweets -max_id <- tweets_temp$id_str[nrow(tweets_temp)] -loop <- loop + 1 # just for stats -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -} -} # /repeat -tweets_complete <- insertRow(tweets_complete, tweets_full) -tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full -cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n") -write.csv(tweets_complete, "tweets_complete.csv") -# Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop -} -rm(a, code, current, error, loop, max_id, name, query, r, status, user, wait, tweets_full, tweets_temp) -rm(a, code, current, error, loop, max_id, max_count, year_first, year_last, name, query, status, user, wait, tweets_full, tweets_temp) -rm(year) -save(tweets_complete, file="tweets_complete.RData") -tweets_complete2 <- tweets_complete -View(tweets_complete2) -tweets <- tweets_complete[!duplicated(tweets_complete), ] -tweets <- na.omit(tweets) -save(tweets, file="tweets.RData") -rm(tweets_complete2) -Sys.setlocale("LC_TIME", "C") -tweets$created_at <- as.POSIXct(tweets$created_at, format = "%a %b %d %H:%M:%S %z %Y") -tweets <- tweets[order(tweets$created_at), ] -head(tweets) -delrow <- NULL -for(r in 1:nrow(tweets)) { -if(format(tweets$created_at[r], "%Y") != "2014") { -delrow <- c(delrow, r) -} -if(format(tweets$created_at[r], "%Y") == "2014") { -break -} -} -delrow <- NULL -pb <- txtProgressBar(min = 0, max = total, style = 3) -for(r in 1:nrow(tweets)) { -setTxtProgressBar(pb, r) -if(format(tweets$created_at[r], "%Y") != "2014") { -delrow <- c(delrow, r) -} -if(format(tweets$created_at[r], "%Y") == "2014") { -break -} -} -pb <- txtProgressBar(min = 0, max = total, style = 3) -pb <- txtProgressBar(min = 0, max = nrow(tweets), style = 3) -for(r in 1:nrow(tweets)) { -setTxtProgressBar(pb, r) -if(format(tweets$created_at[r], "%Y") != "2014") { -delrow <- c(delrow, r) -} -if(format(tweets$created_at[r], "%Y") == "2014") { -break -} -} -tweets <- tweets[-delrow, ] -rm(delrow, r) -summary(tweets) -tweets$created_at[140000] -tweets$created_at[130000] -tweets$created_at[1] -tweets$created_at[2] -tweets$created_at[100] -tweets$created_at[141086] -delrow -delrow <- NULL -for(r in 1:nrow(tweets)) { -if(format(tweets$created_at[r], "%Y") != "2014") { -delrow <- c(delrow, r) -} -if(format(tweets$created_at[r], "%Y") == "2014") { -break -} -} -pb <- txtProgressBar(min = 0, max = nrow(tweets), style = 3) -for(r in 1:nrow(tweets)) { -if(format(tweets$created_at[r], "%Y") != "2014") { -delrow <- c(delrow, r) -} -setTxtProgressBar(pb, r) -} -tweets <- tweets[-delrow, ] -tweets$created_at[137800] -tweets$created_at[137876] -rm(delrow, r) -rm(pb) -tweets$created_at <- format(tweets$created_at, "%Y-%m-%d") -tweets$created_at[137876] -save(tweets, file="tweets.RData") -readLines("twitter-api-consumerkey.txt") -api_params2 <- api_params -readLines("twitter-api-consumerkey.txt") -readLines("twitter-api-consumerkey.txt")[2] -readLines("twitter-api-credentials.txt")[2] -readLines("twitter-api-credentials.txt")[1] -api_params <- c( -"oauth_consumer_key" = readLines("twitter-api-credentials.txt")[2], -"oauth_nonce" = NA, -"oauth_signature_method" = "HMAC-SHA1", -"oauth_timestamp" = NA, -"oauth_token" = readLines("twitter-api-credentials.txt")[4], -"oauth_version" = "1.0", -"consumer_secret" = readLines("twitter-api-credentials.txt")[3], -"oauth_token_secret" = readLines("twitter-api-credentials.txt")[5] -) -api_params -api_params2 -rm(tweets_complete) -rm(api_params2) -source("issuecomp-functions.R") -source("issuecomp-functions.R") -setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp") -require(lubridate) -require(XML) -require(ggplot2) -require(reshape2) -date_start <- as.Date("2014-01-01") -date_end <- as.Date("2014-12-01") -drange <- as.integer(date_end - date_start) -drange <- date_start + days(0:drange) +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +smartPatternMatch("kerTips: Riker workplace tip: Flirt when no one else is looking. http", "IS", 2, TRUE) +smartPatternMatch("kerTips: Riker workplace tip: Flirt when no one else is looking. http", "is", 2, TRUE) +viewMatchingTweets("2014-01-06", "issue.iraq", id_folder) +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) issues <- data.frame(date = drange) -View(issues) -date_start <- as.Date("2014-01-01") -date_end <- as.Date("2014-12-31") -drange <- as.integer(date_end - date_start) -drange <- date_start + days(0:drange) -issues <- data.frame(date = drange) -View(issues) issuelist <- xmlToList("issues.xml") issueheads <- names(issuelist) issues[issueheads] <- 0 +for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(as.character(curdate),"\n") +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance) +tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro) +if(tags_found == 1) { +#cat("Matched", curtag, "with", curtext,"\n") +issues[d,curissue] <- issues[d,curissue] + 1 +write(str_c(curdate,";\"",curid,"\""), curfile, append = TRUE) +break +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +source("issuecomp-functions.R") +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(as.character(curdate),"\n") +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance) +tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro) +if(tags_found == 1) { +#cat("Matched", curtag, "with", curtext,"\n") +issues[d,curissue] <- issues[d,curissue] + 1 +write(str_c(curdate,";\"",curid,"\";"curtag), curfile, append = TRUE) +break +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(as.character(curdate),"\n") +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance) +tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro) +if(tags_found == 1) { +#cat("Matched", curtag, "with", curtext,"\n") +issues[d,curissue] <- issues[d,curissue] + 1 +write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) +break +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +source("issuecomp-functions.R") +viewMatchingTweets("2014-01-06", "issue.iraq", id_folder) +viewMatchingTweets("2014-01-07", "issue.iraq", id_folder) +viewMatchingTweets("2014-01-09", "issue.iraq", id_folder) +curtext <- "Willkürlich Menschen an ihrer #Versammlungsfreiheit zu hindern ist eindeutig rechtswidrig. http://t.co/A7IQfISIhP #Gefahrengebiet #Hamburg" +str_replace_all(curtext, "http://.+\\W", "") +str_replace_all(curtext, "http://.+?\\W", "") +str_replace_all(curtext, "http://.+?\\s", "") +str_replace_all(curtext, "http://.+?\\s", "") +curtext <- "test http://google.de haha http://nsa.gov eqiuhe" +str_replace_all(curtext, "http://.+?\\s", "") +str_replace_all(curtext, "http://.+?\\s", "URL") +str_replace_all(curtext, "http://.+?\\s", "URL ") +viewMatchingTweets("2014-01-09", "issue.iraq", id_folder) +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(as.character(curdate),"\n") +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance) +tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro) +if(tags_found == 1) { +#cat("Matched", curtag, "with", curtext,"\n") +issues[d,curissue] <- issues[d,curissue] + 1 +write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) +break +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +viewMatchingTweets("2014-01-09", "issue.iraq", id_folder) +viewMatchingTweets("2014-01-08", "issue.iraq", id_folder) +viewMatchingTweets("2014-01-10", "issue.iraq", id_folder) +curtext +str_replace_all(curtext, "http://.+?\\>", "URL ") +str_replace_all(curtext, "http://.+?\\<", "URL ") +curtext <- str_replace_all(curtext, "http://.+?\\b", "URL ") +str_replace_all(curtext, "http://.+?\\b", "URL ") +str_replace_all(curtext, "http://.+?\\s", "URL ") +curtext +curtext <- as.character(tweets_curday$text[t]) +curtext +str_replace_all(curtext, "http://.+?\\s", "URL ") +str_replace_all(curtext, "http://.+?\\b", "URL ") +str_replace_all(curtext, "http://.+?\\<", "URL ") +str_replace_all(curtext, "http://.+?\\>", "URL ") +str_replace_all(curtext, "http://.+?\\s", "URL ") +str_replace_all(curtext, "$", " ") +curtext <- str_replace_all(curtext, "$", " ") +curtext +str_replace_all(curtext, "http://.+?\\s", "URL ") +viewMatchingTweets("2014-01-10", "issue.iraq", id_folder) +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(as.character(curdate),"\n") +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curtext <- str_replace_all(curtext, "$", " ") +curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance) +tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro) +if(tags_found == 1) { +#cat("Matched", curtag, "with", curtext,"\n") +issues[d,curissue] <- issues[d,curissue] + 1 +write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) +break +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +viewMatchingTweets("2014-01-10", "issue.iraq", id_folder) +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(as.character(curdate),"\n") +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curtext <- str_replace_all(curtext, "$", " ") +curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance) +tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro) +if(tags_found == 1) { +#cat("Matched", curtag, "with", curtext,"\n") +issues[d,curissue] <- issues[d,curissue] + 1 +write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) +break +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange View(issues) +viewMatchingTweets("2014-12-18", "issue.edathy", id_folder) +issues_melt <- melt(issues,id="date") +ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line(size=1) +ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE) +ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line(size=1) +ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line(size=1) +ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE) +viewMatchingTweets("2014-12-18", "issue.conservative", id_folder) +agrep("christ", "Jungparlamentarier gleich Schriftführerdienst hat", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE) +agrep("\\bchrist\\b", "Jungparlamentarier gleich Schriftführerdienst hat", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE) +agrep("\\bchrist\\b", "Bla Christ bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE) +agrep("\\bchrist\\b", "Bla Christus bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE) +agrep("\\bchrist\\b", "Bla Christu bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE) +agrep("\\bchrist\\b", "Bla Christus bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE) +agrep("\\bchrist\\b", "Bla Christus bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE) +agrep("\\bchrist\\b", "Bla christus bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE) +agrep("\\bchrist\\b", "Bla christus bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE) +agrep("\\bchrist\\b", "Bla christen bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE) +agrep("\\bchrist\\b", "Bla Antichrist bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE) +agrep("\\bchrist\\b", "Bla Christian bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE) +agrep("\\bchrist\\b", "Bla Christian bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE, value=TRUE) +agrep("\\bchrist\\b", "Bla Christi bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE, value=TRUE) +agrep("\\bchrist\\b", "Bla Christi bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE) +agrep("\\bIS\\b", "Wir sind bei ISN Network", max.distance = list(all = 0), ignore.case = TRUE, fixed = FALSE) +agrep("\\bIS\\b", "Wir sind bei ISN Network", max.distance = list(all = 0), ignore.case = F, fixed = FALSE) +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(as.character(curdate),"\n") +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curtext <- str_replace_all(curtext, "$", " ") +curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance) +tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro) +if(tags_found == 1) { +#cat("Matched", curtag, "with", curtext,"\n") +issues[d,curissue] <- issues[d,curissue] + 1 +write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) +break +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +issues_melt <- melt(issues,id="date") +ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE) +viewMatchingTweets("2014-12-18", "issue.conservative", id_folder) +pattern +agrep("\\bchrist\\b", "RT @christophheyes: Morgen in der Presse: Oppermann - Briefkasten gestohlen! Gabriel - Poesiealbum nicht mehr auffindbar! #edathy #hartmann", max.distance = list(all = 1), ignore.case = TRUE, fixed = FALSE) +smartPatternMatch +source("issuecomp-functions.R") +smartPatternMatch +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(as.character(curdate),"\n") +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curtext <- str_replace_all(curtext, "$", " ") +curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance) +tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro) +if(tags_found == 1) { +#cat("Matched", curtag, "with", curtext,"\n") +issues[d,curissue] <- issues[d,curissue] + 1 +write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) +break +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +issues_melt <- melt(issues,id="date") +ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE) +viewMatchingTweets("2014-12-18", "issue.conservative", id_folder) +viewMatchingTweets("2014-05-18", "issue.conservative", id_folder) +viewMatchingTweets("2014-05-1", "issue.conservative", id_folder) +viewMatchingTweets("2014-05-01", "issue.conservative", id_folder) +viewMatchingTweets("2014-05-02", "issue.conservative", id_folder) +viewMatchingTweets("2014-05-10", "issue.conservative", id_folder) +viewMatchingTweets("2014-05-10", "issue.middleeast", id_folder) +viewMatchingTweets("2014-05-10", "issue.iraw", id_folder) +viewMatchingTweets("2014-05-10", "issue.iraq", id_folder) +viewMatchingTweets("2014-08-10", "issue.iraq", id_folder) +viewMatchingTweets("2014-11-10", "issue.iraq", id_folder) +viewMatchingTweets("2014-12-10", "issue.iraq", id_folder) +View(issues) +viewMatchingTweets("2014-09-19", "issue.control", id_folder) diff --git a/.gitignore b/.gitignore index d6a0d18..34c589d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ tweets_complete.csv current.txt .RData +matched-ids diff --git a/issuecomp-analysis.R b/issuecomp-analysis.R index 2e41b01..e5e7611 100644 --- a/issuecomp-analysis.R +++ b/issuecomp-analysis.R @@ -4,6 +4,8 @@ require(ggplot2) require(reshape2) require(stringr) +source("issuecomp-functions.R") + # Create date range date_start <- as.Date("2014-01-01") date_end <- as.Date("2014-12-31") @@ -13,6 +15,10 @@ drange <- date_start + days(0:drange) # MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) + issues <- data.frame(date = drange) issuelist <- xmlToList("issues.xml") issueheads <- names(issuelist) @@ -30,22 +36,34 @@ for(d in 1:nrow(issues)) { # Select tweet's text, make it lowercase and remove hashtag indicators (#) curtext <- as.character(tweets_curday$text[t]) curtext <- str_replace_all(curtext, "#", "") + curtext <- str_replace_all(curtext, "$", " ") + curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ") + curid <- as.character(tweets_curday$id_str[t]) # Now test each single issue (not tag!) for(i in 1:length(issuelist)) { curtags <- as.character(issuelist[[i]]) curissue <- names(issuelist)[i] + curfile <- str_c(id_folder,"/",curissue,".csv") # Now test all tags of a single issue - for(t in 1:length(curtags)) { - curtag <- str_c("\\W", curtags[t], "\\W") - curchars <- nchar(curtag, type = "chars") - 4 + for(s in 1:length(curtags)) { + curtag <- curtags[s] + curchars <- nchar(curtag, type = "chars") + # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch + if(curchars <= 4) { + curacro <- checkAcronym(string = curtag, chars = curchars) + } else { + curacro <- FALSE + } + # Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance) - tags_found <- smartPatternMatch(curtext, curtag, curchars) + tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro) if(tags_found == 1) { #cat("Matched", curtag, "with", curtext,"\n") issues[d,curissue] <- issues[d,curissue] + 1 + write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) break } else { @@ -59,44 +77,6 @@ for(d in 1:nrow(issues)) { -# WEEKLY INTERVALS -------------------------------------------------------- - - - -## Do not use days but week intervals - -wrange <- (as.integer(date_end - date_start) / 7) -wrange <- floor(wrange) - 1 -wrange <- date_start + weeks(0:wrange) -issues_week <- data.frame(week = wrange) -issues_week[issueheads] <- 0 - - -for(w in 1:nrow(issues_week)) { - curweek <- issues_week$week[w] - currange <- curweek + days(0:6) - - day <- 1 - - - for(d in 1:nrow(issues)) { - curday <- issues$date[d] - - if(curweek == curday) { - for(c in 2:ncol(issues)) { - curissue <- names(issues)[c] - d2 <- d + 6 - curvalue <- sum(issues[d:d2,curissue]) - issues_week[w, curissue] <- curvalue - - } # /for issues columns - } # /if day matches first day of week - - } # /for issues rows -} # /for issues_week - - - # VISUALS ----------------------------------------------------------------- @@ -105,10 +85,6 @@ issues_melt <- melt(issues,id="date") ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line(size=1) ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE) -# Level: weeks -issues_week_melt <- melt(issues_week,id="week") -ggplot(issues_week_melt,aes(x=week,y=value,colour=variable,group=variable)) + geom_line(size=1) -ggplot(issues_week_melt,aes(x=week,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE) # POSSIBLY USEFUL CODE ---------------------------------------------------- diff --git a/issuecomp-functions.R b/issuecomp-functions.R index 3c915bd..c7ff670 100644 --- a/issuecomp-functions.R +++ b/issuecomp-functions.R @@ -26,20 +26,47 @@ convertLogical0 <- function(var) { return(var) } -smartPatternMatch <- function(string, pattern, chars) { - if(chars < 5) { - found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = TRUE, fixed = FALSE) +smartPatternMatch <- function(string, pattern, chars, acronym) { + pattern <- str_c("\\b", pattern, "\\b") + + if(chars <= 4) { + found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE) } - else if(chars > 7) { - found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE) + else if(chars >= 8) { + found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE) } else { - found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = TRUE, fixed = FALSE) + found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) } found <- convertLogical0(found) return(found) } +viewMatchingTweets <- function(date, issue, folder) { + file <- str_c(folder,"/",issue,".csv") + df <- read.csv(file, sep = ";", colClasses="character", header = FALSE) + for(r in 1:nrow(df)) { + curdate <- as.character(df[r,1]) + if(curdate == date) { + curid <- as.character(df[r,2]) + curtag <- as.character(df[r,3]) + cat(tweets$text[tweets$id_str == curid]," - ",curtag,"\n") + } + } +} + + +checkAcronym <- function(string, chars) { + curtag_up <- str_replace_all(string = curtag, pattern = "[[:lower:]]", replacement = "") + curchars_up <- nchar(curtag_up, type = "chars") + if(curchars_up == curchars) { + return(TRUE) + } + else { + return(FALSE) + } +} + ## ERROR HANDLING # Check for empty API returns (0 or 1 or 2) diff --git a/issues.xml b/issues.xml index c791322..31559cd 100644 --- a/issues.xml +++ b/issues.xml @@ -62,9 +62,9 @@ irak - isis - is - kalifat + ISIS + IS + Kalifat