diff --git a/.Rhistory b/.Rhistory
index d6f3a60..f2fe296 100644
--- a/.Rhistory
+++ b/.Rhistory
@@ -1,512 +1,512 @@
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+if(curchars <= 4) {
+curacro <- checkAcronym(string = curtag, chars = curchars)
+} else {
+curacro <- FALSE
}
-## END ERROR HANDLING ##
-# Delete unnecessary columns and add username and real name to dataframe
-tweets_temp <- tweets_temp[keep]
-tweets_temp <- cbind(user=user, name=name, tweets_temp)
-# Now sleep 3 second to dodge 300queries/15min limit
-cat("[",a,"/",nrow(acc_df),"] ", sep = "")
-cat("User: ",user," in loop: ",loop,". \n", sep = "")
-Sys.sleep(2)
-if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) {
-cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n")
+# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
+tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
+if(tags_found == 1) {
+#cat("Matched", curtag, "with", curtext,"\n")
+issues[d,curissue] <- issues[d,curissue] + 1
+write(str_c(curdate,";\"",curid,"\""), curfile, append = TRUE)
break
}
-## Last loop is reached. Now clear the data frame
-# # Is the last tweet in tweets_temp from 2013?
-# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$")
-# Extract year of last tweet in tweets_temp
-year <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$"))
-status <- year < 2014
-if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...)
-# # Delete all tweets other than from 2014
-# old <- 0
-# for(r in 1:nrow(tweets_temp)) {
-# status <- str_detect(tweets_temp$created_at[r], "2014$")
-# if(is.na(status)) {
-# #status <- FALSE
-# cat("[INFO] NA-Status in Tweet", r)
-# }
-# if(!status) { # Starting when tweet not from 2014
-# old <- old + 1
-# }
-# }
-# if(old > 0) {
-# old <- old - 1
-#
-# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug
-# status <- str_detect(tweets_temp$created_at[1], "2014$")
-# if(!status) {
-# old <- nrow(tweets_temp)
-# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n")
-# }
-#
-# # delete all lines which are older than 2014
-# tweets_temp <- head(tweets_temp, -old)
-# }
-# rm(old)
-tweets_full <- insertRow(tweets_full, tweets_temp)
-#rm(tweets_temp)
-break # End loop because 2013 is reached
-}
-# The last tweet is newer or equal 2014, so we need another loop
else {
-# Setting max_id to gather next 200 tweets
-max_id <- tweets_temp$id_str[nrow(tweets_temp)]
-loop <- loop + 1 # just for stats
-tweets_full <- insertRow(tweets_full, tweets_temp)
-#rm(tweets_temp)
+#cat("Nothing found\n")
}
-} # /repeat
-tweets_complete <- insertRow(tweets_complete, tweets_full)
-tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full
-cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n")
-write.csv(tweets_complete, "tweets_complete.csv")
-# Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop
-}
-api_url <- "https://api.twitter.com/1.1/statuses/user_timeline.json";
-max_count <- "200"
-keep <- c("created_at", "id_str", "text", "retweet_count")
-# tweets_complete: All tweets
-# tweets_full: All tweets of current user
-# tweets_temp: The current max 200 tweets of current user
-tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character())
-tweets_complete <- tweets_full
-for(a in 1:nrow(acc_df)) {
-user <- as.character(acc_df$twitter_acc[a])
-name <- as.character(acc_df$name[a])
-max_id <- "999999999999999999"
-loop <- 1
-error <- 0
-repeat {
-# Define specific search query
-query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false",
-screen_name=user,
-count=max_count,
-max_id=max_id);
-# At first, work with an temporary tweet-DB
-current <- twitter_api_call(api_url, query, api_params)
-rm(tweets_temp)
-tweets_temp <- fromJSON(correctJSON(current))
-## START ERROR HANDLING ##
-# Empty API output
-status <- errorEmptyAPI(tweets_temp)
-if(status == 1) { Sys.sleep(3);error <- error + 1;next}
-if(status == 2) {break}
-# Contains "error" column
-status <- errorErrorColumn(tweets_temp)
-if(status == 1) { Sys.sleep(3);error <- error + 1;next}
-if(status == 2) {break}
-# Check if error code exists
-code <- errorCheckCode(tweets_temp) # 0 if no error
-if(code == 34) { # page does not exist
-status <- errorCode34()
-if(status == 1) { Sys.sleep(3);error <- error + 1;next}
-if(status == 2) {break}
-}
-if(code == 88) { # rate limit exceeded
-wait <- errorCode88()
-Sys.sleep(wait)
-next
-}
-## END ERROR HANDLING ##
-# Delete unnecessary columns and add username and real name to dataframe
-tweets_temp <- tweets_temp[keep]
-tweets_temp <- cbind(user=user, name=name, tweets_temp)
-# Now sleep 3 second to dodge 300queries/15min limit
-cat("[",a,"/",nrow(acc_df),"] ", sep = "")
-cat("User: ",user," in loop: ",loop,". \n", sep = "")
-Sys.sleep(2)
-if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) {
-cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n")
-break
-}
-## Last loop is reached. Now clear the data frame
-# # Is the last tweet in tweets_temp from 2013?
-# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$")
-# Extract year of last tweet in tweets_temp
-year <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$"))
-status <- year < 2014
-if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...)
-# # Delete all tweets other than from 2014
-# old <- 0
-# for(r in 1:nrow(tweets_temp)) {
-# status <- str_detect(tweets_temp$created_at[r], "2014$")
-# if(is.na(status)) {
-# #status <- FALSE
-# cat("[INFO] NA-Status in Tweet", r)
-# }
-# if(!status) { # Starting when tweet not from 2014
-# old <- old + 1
-# }
-# }
-# if(old > 0) {
-# old <- old - 1
-#
-# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug
-# status <- str_detect(tweets_temp$created_at[1], "2014$")
-# if(!status) {
-# old <- nrow(tweets_temp)
-# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n")
-# }
-#
-# # delete all lines which are older than 2014
-# tweets_temp <- head(tweets_temp, -old)
-# }
-# rm(old)
-tweets_full <- insertRow(tweets_full, tweets_temp)
-#rm(tweets_temp)
-break # End loop because 2013 is reached
-}
-# The last tweet is newer or equal 2014, so we need another loop
-else {
-# Setting max_id to gather next 200 tweets
-max_id <- tweets_temp$id_str[nrow(tweets_temp)]
-loop <- loop + 1 # just for stats
-tweets_full <- insertRow(tweets_full, tweets_temp)
-#rm(tweets_temp)
-}
-} # /repeat
-tweets_complete <- insertRow(tweets_complete, tweets_full)
-tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full
-cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n")
-write.csv(tweets_complete, "tweets_complete.csv")
-# Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop
-}
-tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character())
-tweets_complete <- tweets_full
-for(a in 1:nrow(acc_df)) {
-user <- as.character(acc_df$twitter_acc[a])
-name <- as.character(acc_df$name[a])
-max_id <- "999999999999999999"
-loop <- 1
-error <- 0
-repeat {
-# Define specific search query
-query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false",
-screen_name=user,
-count=max_count,
-max_id=max_id);
-# At first, work with an temporary tweet-DB
-current <- twitter_api_call(api_url, query, api_params)
-rm(tweets_temp)
-tweets_temp <- fromJSON(correctJSON(current))
-## START ERROR HANDLING ##
-# Empty API output
-status <- errorEmptyAPI(tweets_temp)
-if(status == 1) { Sys.sleep(3);error <- error + 1;next}
-if(status == 2) {break}
-# Contains "error" column
-status <- errorErrorColumn(tweets_temp)
-if(status == 1) { Sys.sleep(3);error <- error + 1;next}
-if(status == 2) {break}
-# Check if error code exists
-code <- errorCheckCode(tweets_temp) # 0 if no error
-if(code == 34) { # page does not exist
-status <- errorCode34()
-if(status == 1) { Sys.sleep(3);error <- error + 1;next}
-if(status == 2) {break}
-}
-if(code == 88) { # rate limit exceeded
-wait <- errorCode88()
-Sys.sleep(wait)
-next
-}
-## END ERROR HANDLING ##
-# Delete unnecessary columns and add username and real name to dataframe
-tweets_temp <- tweets_temp[keep]
-tweets_temp <- cbind(user=user, name=name, tweets_temp)
-# Now sleep 3 second to dodge 300queries/15min limit
-cat("[",a,"/",nrow(acc_df),"] ", sep = "")
-cat("User: ",user," in loop: ",loop,". \n", sep = "")
-Sys.sleep(2)
-if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) {
-cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n")
-break
-}
-## Last loop is reached. Now clear the data frame
-# # Is the last tweet in tweets_temp from 2013?
-# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$")
-# Extract year of last tweet in tweets_temp
-year_last <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$"))
-status <- year_last < 2014
-if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...)
-year_first <- as.numeric(str_extract(tweets_temp$created_at[1], "\\d{4}$"))
-status <- year_first < 2014
-cat("[INFO] Timeline enhält keinen einzigen aus 2014\n")
-}
-# # Delete all tweets other than from 2014
-# old <- 0
-# for(r in 1:nrow(tweets_temp)) {
-# status <- str_detect(tweets_temp$created_at[r], "2014$")
-# if(is.na(status)) {
-# #status <- FALSE
-# cat("[INFO] NA-Status in Tweet", r)
-# }
-# if(!status) { # Starting when tweet not from 2014
-# old <- old + 1
-# }
-# }
-# if(old > 0) {
-# old <- old - 1
-#
-# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug
-# status <- str_detect(tweets_temp$created_at[1], "2014$")
-# if(!status) {
-# old <- nrow(tweets_temp)
-# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n")
-# }
-#
-# # delete all lines which are older than 2014
-# tweets_temp <- head(tweets_temp, -old)
-# }
-# rm(old)
-tweets_full <- insertRow(tweets_full, tweets_temp)
-#rm(tweets_temp)
-break # End loop because 2013 is reached
-}
-# The last tweet is newer or equal 2014, so we need another loop
-else {
-# Setting max_id to gather next 200 tweets
-max_id <- tweets_temp$id_str[nrow(tweets_temp)]
-loop <- loop + 1 # just for stats
-tweets_full <- insertRow(tweets_full, tweets_temp)
-#rm(tweets_temp)
-}
-} # /repeat
-tweets_complete <- insertRow(tweets_complete, tweets_full)
-tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full
-cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n")
-write.csv(tweets_complete, "tweets_complete.csv")
-# Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop
-}
-tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character())
-tweets_complete <- tweets_full
-for(a in 1:nrow(acc_df)) {
-user <- as.character(acc_df$twitter_acc[a])
-name <- as.character(acc_df$name[a])
-max_id <- "999999999999999999"
-loop <- 1
-error <- 0
-repeat {
-# Define specific search query
-query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false",
-screen_name=user,
-count=max_count,
-max_id=max_id);
-# At first, work with an temporary tweet-DB
-current <- twitter_api_call(api_url, query, api_params)
-rm(tweets_temp)
-tweets_temp <- fromJSON(correctJSON(current))
-## START ERROR HANDLING ##
-# Empty API output
-status <- errorEmptyAPI(tweets_temp)
-if(status == 1) { Sys.sleep(3);error <- error + 1;next}
-if(status == 2) {break}
-# Contains "error" column
-status <- errorErrorColumn(tweets_temp)
-if(status == 1) { Sys.sleep(3);error <- error + 1;next}
-if(status == 2) {break}
-# Check if error code exists
-code <- errorCheckCode(tweets_temp) # 0 if no error
-if(code == 34) { # page does not exist
-status <- errorCode34()
-if(status == 1) { Sys.sleep(3);error <- error + 1;next}
-if(status == 2) {break}
-}
-if(code == 88) { # rate limit exceeded
-wait <- errorCode88()
-Sys.sleep(wait)
-next
-}
-## END ERROR HANDLING ##
-# Delete unnecessary columns and add username and real name to dataframe
-tweets_temp <- tweets_temp[keep]
-tweets_temp <- cbind(user=user, name=name, tweets_temp)
-# Now sleep 3 second to dodge 300queries/15min limit
-cat("[",a,"/",nrow(acc_df),"] ", sep = "")
-cat("User: ",user," in loop: ",loop,". \n", sep = "")
-Sys.sleep(2)
-if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) {
-cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n")
-break
-}
-## Last loop is reached. Now clear the data frame
-# # Is the last tweet in tweets_temp from 2013?
-# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$")
-# Extract year of last tweet in tweets_temp
-year_last <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$"))
-status <- year_last < 2014
-if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...)
-# Is even the first tweet older than 2014?
-year_first <- as.numeric(str_extract(tweets_temp$created_at[1], "\\d{4}$"))
-status <- year_first < 2014
-if(status) {
-cat("[INFO] Timeline enhält keinen einzigen aus 2014\n")
-}
-# # Delete all tweets other than from 2014
-# old <- 0
-# for(r in 1:nrow(tweets_temp)) {
-# status <- str_detect(tweets_temp$created_at[r], "2014$")
-# if(is.na(status)) {
-# #status <- FALSE
-# cat("[INFO] NA-Status in Tweet", r)
-# }
-# if(!status) { # Starting when tweet not from 2014
-# old <- old + 1
-# }
-# }
-# if(old > 0) {
-# old <- old - 1
-#
-# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug
-# status <- str_detect(tweets_temp$created_at[1], "2014$")
-# if(!status) {
-# old <- nrow(tweets_temp)
-# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n")
-# }
-#
-# # delete all lines which are older than 2014
-# tweets_temp <- head(tweets_temp, -old)
-# }
-# rm(old)
-tweets_full <- insertRow(tweets_full, tweets_temp)
-#rm(tweets_temp)
-break # End loop because 2013 is reached
-}
-# The last tweet is newer or equal 2014, so we need another loop
-else {
-# Setting max_id to gather next 200 tweets
-max_id <- tweets_temp$id_str[nrow(tweets_temp)]
-loop <- loop + 1 # just for stats
-tweets_full <- insertRow(tweets_full, tweets_temp)
-#rm(tweets_temp)
-}
-} # /repeat
-tweets_complete <- insertRow(tweets_complete, tweets_full)
-tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full
-cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n")
-write.csv(tweets_complete, "tweets_complete.csv")
-# Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop
-}
-rm(a, code, current, error, loop, max_id, name, query, r, status, user, wait, tweets_full, tweets_temp)
-rm(a, code, current, error, loop, max_id, max_count, year_first, year_last, name, query, status, user, wait, tweets_full, tweets_temp)
-rm(year)
-save(tweets_complete, file="tweets_complete.RData")
-tweets_complete2 <- tweets_complete
-View(tweets_complete2)
-tweets <- tweets_complete[!duplicated(tweets_complete), ]
-tweets <- na.omit(tweets)
-save(tweets, file="tweets.RData")
-rm(tweets_complete2)
-Sys.setlocale("LC_TIME", "C")
-tweets$created_at <- as.POSIXct(tweets$created_at, format = "%a %b %d %H:%M:%S %z %Y")
-tweets <- tweets[order(tweets$created_at), ]
-head(tweets)
-delrow <- NULL
-for(r in 1:nrow(tweets)) {
-if(format(tweets$created_at[r], "%Y") != "2014") {
-delrow <- c(delrow, r)
-}
-if(format(tweets$created_at[r], "%Y") == "2014") {
-break
-}
-}
-delrow <- NULL
-pb <- txtProgressBar(min = 0, max = total, style = 3)
-for(r in 1:nrow(tweets)) {
-setTxtProgressBar(pb, r)
-if(format(tweets$created_at[r], "%Y") != "2014") {
-delrow <- c(delrow, r)
-}
-if(format(tweets$created_at[r], "%Y") == "2014") {
-break
-}
-}
-pb <- txtProgressBar(min = 0, max = total, style = 3)
-pb <- txtProgressBar(min = 0, max = nrow(tweets), style = 3)
-for(r in 1:nrow(tweets)) {
-setTxtProgressBar(pb, r)
-if(format(tweets$created_at[r], "%Y") != "2014") {
-delrow <- c(delrow, r)
-}
-if(format(tweets$created_at[r], "%Y") == "2014") {
-break
-}
-}
-tweets <- tweets[-delrow, ]
-rm(delrow, r)
-summary(tweets)
-tweets$created_at[140000]
-tweets$created_at[130000]
-tweets$created_at[1]
-tweets$created_at[2]
-tweets$created_at[100]
-tweets$created_at[141086]
-delrow
-delrow <- NULL
-for(r in 1:nrow(tweets)) {
-if(format(tweets$created_at[r], "%Y") != "2014") {
-delrow <- c(delrow, r)
-}
-if(format(tweets$created_at[r], "%Y") == "2014") {
-break
-}
-}
-pb <- txtProgressBar(min = 0, max = nrow(tweets), style = 3)
-for(r in 1:nrow(tweets)) {
-if(format(tweets$created_at[r], "%Y") != "2014") {
-delrow <- c(delrow, r)
-}
-setTxtProgressBar(pb, r)
-}
-tweets <- tweets[-delrow, ]
-tweets$created_at[137800]
-tweets$created_at[137876]
-rm(delrow, r)
-rm(pb)
-tweets$created_at <- format(tweets$created_at, "%Y-%m-%d")
-tweets$created_at[137876]
-save(tweets, file="tweets.RData")
-readLines("twitter-api-consumerkey.txt")
-api_params2 <- api_params
-readLines("twitter-api-consumerkey.txt")
-readLines("twitter-api-consumerkey.txt")[2]
-readLines("twitter-api-credentials.txt")[2]
-readLines("twitter-api-credentials.txt")[1]
-api_params <- c(
-"oauth_consumer_key" = readLines("twitter-api-credentials.txt")[2],
-"oauth_nonce" = NA,
-"oauth_signature_method" = "HMAC-SHA1",
-"oauth_timestamp" = NA,
-"oauth_token" = readLines("twitter-api-credentials.txt")[4],
-"oauth_version" = "1.0",
-"consumer_secret" = readLines("twitter-api-credentials.txt")[3],
-"oauth_token_secret" = readLines("twitter-api-credentials.txt")[5]
-)
-api_params
-api_params2
-rm(tweets_complete)
-rm(api_params2)
-source("issuecomp-functions.R")
-source("issuecomp-functions.R")
-setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
-require(lubridate)
-require(XML)
-require(ggplot2)
-require(reshape2)
-date_start <- as.Date("2014-01-01")
-date_end <- as.Date("2014-12-01")
-drange <- as.integer(date_end - date_start)
-drange <- date_start + days(0:drange)
+} # /for curtags
+} # /for issuelist
+} # /for tweets_curday
+} # /for drange
+smartPatternMatch("kerTips: Riker workplace tip: Flirt when no one else is looking. http", "IS", 2, TRUE)
+smartPatternMatch("kerTips: Riker workplace tip: Flirt when no one else is looking. http", "is", 2, TRUE)
+viewMatchingTweets("2014-01-06", "issue.iraq", id_folder)
+# MATCH TWEETS ------------------------------------------------------------
+id_folder <- "matched-ids"
+unlink(id_folder, recursive = TRUE)
+dir.create(id_folder)
issues <- data.frame(date = drange)
-View(issues)
-date_start <- as.Date("2014-01-01")
-date_end <- as.Date("2014-12-31")
-drange <- as.integer(date_end - date_start)
-drange <- date_start + days(0:drange)
-issues <- data.frame(date = drange)
-View(issues)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
+for(d in 1:nrow(issues)) {
+# Go through every day
+curdate <- issues$date[d]
+cat(as.character(curdate),"\n")
+# Put all tweets from specific day in a temporary DF
+tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
+for(t in 1:nrow(tweets_curday)){
+# Select tweet's text, make it lowercase and remove hashtag indicators (#)
+curtext <- as.character(tweets_curday$text[t])
+curtext <- str_replace_all(curtext, "#", "")
+curid <- as.character(tweets_curday$id_str[t])
+# Now test each single issue (not tag!)
+for(i in 1:length(issuelist)) {
+curtags <- as.character(issuelist[[i]])
+curissue <- names(issuelist)[i]
+curfile <- str_c(id_folder,"/",curissue,".csv")
+# Now test all tags of a single issue
+for(s in 1:length(curtags)) {
+curtag <- curtags[s]
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+if(curchars <= 4) {
+curacro <- checkAcronym(string = curtag, chars = curchars)
+} else {
+curacro <- FALSE
+}
+# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
+tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
+if(tags_found == 1) {
+#cat("Matched", curtag, "with", curtext,"\n")
+issues[d,curissue] <- issues[d,curissue] + 1
+write(str_c(curdate,";\"",curid,"\""), curfile, append = TRUE)
+break
+}
+else {
+#cat("Nothing found\n")
+}
+} # /for curtags
+} # /for issuelist
+} # /for tweets_curday
+} # /for drange
+source("issuecomp-functions.R")
+# MATCH TWEETS ------------------------------------------------------------
+id_folder <- "matched-ids"
+unlink(id_folder, recursive = TRUE)
+dir.create(id_folder)
+issues <- data.frame(date = drange)
+issuelist <- xmlToList("issues.xml")
+issueheads <- names(issuelist)
+issues[issueheads] <- 0
+for(d in 1:nrow(issues)) {
+# Go through every day
+curdate <- issues$date[d]
+cat(as.character(curdate),"\n")
+# Put all tweets from specific day in a temporary DF
+tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
+for(t in 1:nrow(tweets_curday)){
+# Select tweet's text, make it lowercase and remove hashtag indicators (#)
+curtext <- as.character(tweets_curday$text[t])
+curtext <- str_replace_all(curtext, "#", "")
+curid <- as.character(tweets_curday$id_str[t])
+# Now test each single issue (not tag!)
+for(i in 1:length(issuelist)) {
+curtags <- as.character(issuelist[[i]])
+curissue <- names(issuelist)[i]
+curfile <- str_c(id_folder,"/",curissue,".csv")
+# Now test all tags of a single issue
+for(s in 1:length(curtags)) {
+curtag <- curtags[s]
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+if(curchars <= 4) {
+curacro <- checkAcronym(string = curtag, chars = curchars)
+} else {
+curacro <- FALSE
+}
+# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
+tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
+if(tags_found == 1) {
+#cat("Matched", curtag, "with", curtext,"\n")
+issues[d,curissue] <- issues[d,curissue] + 1
+write(str_c(curdate,";\"",curid,"\";"curtag), curfile, append = TRUE)
+break
+}
+else {
+#cat("Nothing found\n")
+}
+} # /for curtags
+} # /for issuelist
+} # /for tweets_curday
+} # /for drange
+# MATCH TWEETS ------------------------------------------------------------
+id_folder <- "matched-ids"
+unlink(id_folder, recursive = TRUE)
+dir.create(id_folder)
+issues <- data.frame(date = drange)
+issuelist <- xmlToList("issues.xml")
+issueheads <- names(issuelist)
+issues[issueheads] <- 0
+for(d in 1:nrow(issues)) {
+# Go through every day
+curdate <- issues$date[d]
+cat(as.character(curdate),"\n")
+# Put all tweets from specific day in a temporary DF
+tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
+for(t in 1:nrow(tweets_curday)){
+# Select tweet's text, make it lowercase and remove hashtag indicators (#)
+curtext <- as.character(tweets_curday$text[t])
+curtext <- str_replace_all(curtext, "#", "")
+curid <- as.character(tweets_curday$id_str[t])
+# Now test each single issue (not tag!)
+for(i in 1:length(issuelist)) {
+curtags <- as.character(issuelist[[i]])
+curissue <- names(issuelist)[i]
+curfile <- str_c(id_folder,"/",curissue,".csv")
+# Now test all tags of a single issue
+for(s in 1:length(curtags)) {
+curtag <- curtags[s]
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+if(curchars <= 4) {
+curacro <- checkAcronym(string = curtag, chars = curchars)
+} else {
+curacro <- FALSE
+}
+# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
+tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
+if(tags_found == 1) {
+#cat("Matched", curtag, "with", curtext,"\n")
+issues[d,curissue] <- issues[d,curissue] + 1
+write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
+break
+}
+else {
+#cat("Nothing found\n")
+}
+} # /for curtags
+} # /for issuelist
+} # /for tweets_curday
+} # /for drange
+source("issuecomp-functions.R")
+viewMatchingTweets("2014-01-06", "issue.iraq", id_folder)
+viewMatchingTweets("2014-01-07", "issue.iraq", id_folder)
+viewMatchingTweets("2014-01-09", "issue.iraq", id_folder)
+curtext <- "Willkürlich Menschen an ihrer #Versammlungsfreiheit zu hindern ist eindeutig rechtswidrig. http://t.co/A7IQfISIhP #Gefahrengebiet #Hamburg"
+str_replace_all(curtext, "http://.+\\W", "")
+str_replace_all(curtext, "http://.+?\\W", "")
+str_replace_all(curtext, "http://.+?\\s", "")
+str_replace_all(curtext, "http://.+?\\s", "")
+curtext <- "test http://google.de haha http://nsa.gov eqiuhe"
+str_replace_all(curtext, "http://.+?\\s", "")
+str_replace_all(curtext, "http://.+?\\s", "URL")
+str_replace_all(curtext, "http://.+?\\s", "URL ")
+viewMatchingTweets("2014-01-09", "issue.iraq", id_folder)
+# MATCH TWEETS ------------------------------------------------------------
+id_folder <- "matched-ids"
+unlink(id_folder, recursive = TRUE)
+dir.create(id_folder)
+issues <- data.frame(date = drange)
+issuelist <- xmlToList("issues.xml")
+issueheads <- names(issuelist)
+issues[issueheads] <- 0
+for(d in 1:nrow(issues)) {
+# Go through every day
+curdate <- issues$date[d]
+cat(as.character(curdate),"\n")
+# Put all tweets from specific day in a temporary DF
+tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
+for(t in 1:nrow(tweets_curday)){
+# Select tweet's text, make it lowercase and remove hashtag indicators (#)
+curtext <- as.character(tweets_curday$text[t])
+curtext <- str_replace_all(curtext, "#", "")
+curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
+curid <- as.character(tweets_curday$id_str[t])
+# Now test each single issue (not tag!)
+for(i in 1:length(issuelist)) {
+curtags <- as.character(issuelist[[i]])
+curissue <- names(issuelist)[i]
+curfile <- str_c(id_folder,"/",curissue,".csv")
+# Now test all tags of a single issue
+for(s in 1:length(curtags)) {
+curtag <- curtags[s]
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+if(curchars <= 4) {
+curacro <- checkAcronym(string = curtag, chars = curchars)
+} else {
+curacro <- FALSE
+}
+# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
+tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
+if(tags_found == 1) {
+#cat("Matched", curtag, "with", curtext,"\n")
+issues[d,curissue] <- issues[d,curissue] + 1
+write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
+break
+}
+else {
+#cat("Nothing found\n")
+}
+} # /for curtags
+} # /for issuelist
+} # /for tweets_curday
+} # /for drange
+viewMatchingTweets("2014-01-09", "issue.iraq", id_folder)
+viewMatchingTweets("2014-01-08", "issue.iraq", id_folder)
+viewMatchingTweets("2014-01-10", "issue.iraq", id_folder)
+curtext
+str_replace_all(curtext, "http://.+?\\>", "URL ")
+str_replace_all(curtext, "http://.+?\\<", "URL ")
+curtext <- str_replace_all(curtext, "http://.+?\\b", "URL ")
+str_replace_all(curtext, "http://.+?\\b", "URL ")
+str_replace_all(curtext, "http://.+?\\s", "URL ")
+curtext
+curtext <- as.character(tweets_curday$text[t])
+curtext
+str_replace_all(curtext, "http://.+?\\s", "URL ")
+str_replace_all(curtext, "http://.+?\\b", "URL ")
+str_replace_all(curtext, "http://.+?\\<", "URL ")
+str_replace_all(curtext, "http://.+?\\>", "URL ")
+str_replace_all(curtext, "http://.+?\\s", "URL ")
+str_replace_all(curtext, "$", " ")
+curtext <- str_replace_all(curtext, "$", " ")
+curtext
+str_replace_all(curtext, "http://.+?\\s", "URL ")
+viewMatchingTweets("2014-01-10", "issue.iraq", id_folder)
+# MATCH TWEETS ------------------------------------------------------------
+id_folder <- "matched-ids"
+unlink(id_folder, recursive = TRUE)
+dir.create(id_folder)
+issues <- data.frame(date = drange)
+issuelist <- xmlToList("issues.xml")
+issueheads <- names(issuelist)
+issues[issueheads] <- 0
+for(d in 1:nrow(issues)) {
+# Go through every day
+curdate <- issues$date[d]
+cat(as.character(curdate),"\n")
+# Put all tweets from specific day in a temporary DF
+tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
+for(t in 1:nrow(tweets_curday)){
+# Select tweet's text, make it lowercase and remove hashtag indicators (#)
+curtext <- as.character(tweets_curday$text[t])
+curtext <- str_replace_all(curtext, "#", "")
+curtext <- str_replace_all(curtext, "$", " ")
+curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
+curid <- as.character(tweets_curday$id_str[t])
+# Now test each single issue (not tag!)
+for(i in 1:length(issuelist)) {
+curtags <- as.character(issuelist[[i]])
+curissue <- names(issuelist)[i]
+curfile <- str_c(id_folder,"/",curissue,".csv")
+# Now test all tags of a single issue
+for(s in 1:length(curtags)) {
+curtag <- curtags[s]
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+if(curchars <= 4) {
+curacro <- checkAcronym(string = curtag, chars = curchars)
+} else {
+curacro <- FALSE
+}
+# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
+tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
+if(tags_found == 1) {
+#cat("Matched", curtag, "with", curtext,"\n")
+issues[d,curissue] <- issues[d,curissue] + 1
+write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
+break
+}
+else {
+#cat("Nothing found\n")
+}
+} # /for curtags
+} # /for issuelist
+} # /for tweets_curday
+} # /for drange
+viewMatchingTweets("2014-01-10", "issue.iraq", id_folder)
+# MATCH TWEETS ------------------------------------------------------------
+id_folder <- "matched-ids"
+unlink(id_folder, recursive = TRUE)
+dir.create(id_folder)
+issues <- data.frame(date = drange)
+issuelist <- xmlToList("issues.xml")
+issueheads <- names(issuelist)
+issues[issueheads] <- 0
+for(d in 1:nrow(issues)) {
+# Go through every day
+curdate <- issues$date[d]
+cat(as.character(curdate),"\n")
+# Put all tweets from specific day in a temporary DF
+tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
+for(t in 1:nrow(tweets_curday)){
+# Select tweet's text, make it lowercase and remove hashtag indicators (#)
+curtext <- as.character(tweets_curday$text[t])
+curtext <- str_replace_all(curtext, "#", "")
+curtext <- str_replace_all(curtext, "$", " ")
+curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
+curid <- as.character(tweets_curday$id_str[t])
+# Now test each single issue (not tag!)
+for(i in 1:length(issuelist)) {
+curtags <- as.character(issuelist[[i]])
+curissue <- names(issuelist)[i]
+curfile <- str_c(id_folder,"/",curissue,".csv")
+# Now test all tags of a single issue
+for(s in 1:length(curtags)) {
+curtag <- curtags[s]
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+if(curchars <= 4) {
+curacro <- checkAcronym(string = curtag, chars = curchars)
+} else {
+curacro <- FALSE
+}
+# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
+tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
+if(tags_found == 1) {
+#cat("Matched", curtag, "with", curtext,"\n")
+issues[d,curissue] <- issues[d,curissue] + 1
+write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
+break
+}
+else {
+#cat("Nothing found\n")
+}
+} # /for curtags
+} # /for issuelist
+} # /for tweets_curday
+} # /for drange
View(issues)
+viewMatchingTweets("2014-12-18", "issue.edathy", id_folder)
+issues_melt <- melt(issues,id="date")
+ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line(size=1)
+ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE)
+ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line(size=1)
+ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line(size=1)
+ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE)
+viewMatchingTweets("2014-12-18", "issue.conservative", id_folder)
+agrep("christ", "Jungparlamentarier gleich Schriftführerdienst hat", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
+agrep("\\bchrist\\b", "Jungparlamentarier gleich Schriftführerdienst hat", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
+agrep("\\bchrist\\b", "Bla Christ bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
+agrep("\\bchrist\\b", "Bla Christus bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
+agrep("\\bchrist\\b", "Bla Christu bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
+agrep("\\bchrist\\b", "Bla Christus bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
+agrep("\\bchrist\\b", "Bla Christus bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE)
+agrep("\\bchrist\\b", "Bla christus bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE)
+agrep("\\bchrist\\b", "Bla christus bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
+agrep("\\bchrist\\b", "Bla christen bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE)
+agrep("\\bchrist\\b", "Bla Antichrist bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE)
+agrep("\\bchrist\\b", "Bla Christian bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE)
+agrep("\\bchrist\\b", "Bla Christian bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE, value=TRUE)
+agrep("\\bchrist\\b", "Bla Christi bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE, value=TRUE)
+agrep("\\bchrist\\b", "Bla Christi bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE)
+agrep("\\bIS\\b", "Wir sind bei ISN Network", max.distance = list(all = 0), ignore.case = TRUE, fixed = FALSE)
+agrep("\\bIS\\b", "Wir sind bei ISN Network", max.distance = list(all = 0), ignore.case = F, fixed = FALSE)
+id_folder <- "matched-ids"
+unlink(id_folder, recursive = TRUE)
+dir.create(id_folder)
+issues <- data.frame(date = drange)
+issuelist <- xmlToList("issues.xml")
+issueheads <- names(issuelist)
+issues[issueheads] <- 0
+for(d in 1:nrow(issues)) {
+# Go through every day
+curdate <- issues$date[d]
+cat(as.character(curdate),"\n")
+# Put all tweets from specific day in a temporary DF
+tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
+for(t in 1:nrow(tweets_curday)){
+# Select tweet's text, make it lowercase and remove hashtag indicators (#)
+curtext <- as.character(tweets_curday$text[t])
+curtext <- str_replace_all(curtext, "#", "")
+curtext <- str_replace_all(curtext, "$", " ")
+curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
+curid <- as.character(tweets_curday$id_str[t])
+# Now test each single issue (not tag!)
+for(i in 1:length(issuelist)) {
+curtags <- as.character(issuelist[[i]])
+curissue <- names(issuelist)[i]
+curfile <- str_c(id_folder,"/",curissue,".csv")
+# Now test all tags of a single issue
+for(s in 1:length(curtags)) {
+curtag <- curtags[s]
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+if(curchars <= 4) {
+curacro <- checkAcronym(string = curtag, chars = curchars)
+} else {
+curacro <- FALSE
+}
+# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
+tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
+if(tags_found == 1) {
+#cat("Matched", curtag, "with", curtext,"\n")
+issues[d,curissue] <- issues[d,curissue] + 1
+write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
+break
+}
+else {
+#cat("Nothing found\n")
+}
+} # /for curtags
+} # /for issuelist
+} # /for tweets_curday
+} # /for drange
+issues_melt <- melt(issues,id="date")
+ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE)
+viewMatchingTweets("2014-12-18", "issue.conservative", id_folder)
+pattern
+agrep("\\bchrist\\b", "RT @christophheyes: Morgen in der Presse: Oppermann - Briefkasten gestohlen! Gabriel - Poesiealbum nicht mehr auffindbar! #edathy #hartmann", max.distance = list(all = 1), ignore.case = TRUE, fixed = FALSE)
+smartPatternMatch
+source("issuecomp-functions.R")
+smartPatternMatch
+# MATCH TWEETS ------------------------------------------------------------
+id_folder <- "matched-ids"
+unlink(id_folder, recursive = TRUE)
+dir.create(id_folder)
+issues <- data.frame(date = drange)
+issuelist <- xmlToList("issues.xml")
+issueheads <- names(issuelist)
+issues[issueheads] <- 0
+for(d in 1:nrow(issues)) {
+# Go through every day
+curdate <- issues$date[d]
+cat(as.character(curdate),"\n")
+# Put all tweets from specific day in a temporary DF
+tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
+for(t in 1:nrow(tweets_curday)){
+# Select tweet's text, make it lowercase and remove hashtag indicators (#)
+curtext <- as.character(tweets_curday$text[t])
+curtext <- str_replace_all(curtext, "#", "")
+curtext <- str_replace_all(curtext, "$", " ")
+curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
+curid <- as.character(tweets_curday$id_str[t])
+# Now test each single issue (not tag!)
+for(i in 1:length(issuelist)) {
+curtags <- as.character(issuelist[[i]])
+curissue <- names(issuelist)[i]
+curfile <- str_c(id_folder,"/",curissue,".csv")
+# Now test all tags of a single issue
+for(s in 1:length(curtags)) {
+curtag <- curtags[s]
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+if(curchars <= 4) {
+curacro <- checkAcronym(string = curtag, chars = curchars)
+} else {
+curacro <- FALSE
+}
+# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
+tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
+if(tags_found == 1) {
+#cat("Matched", curtag, "with", curtext,"\n")
+issues[d,curissue] <- issues[d,curissue] + 1
+write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
+break
+}
+else {
+#cat("Nothing found\n")
+}
+} # /for curtags
+} # /for issuelist
+} # /for tweets_curday
+} # /for drange
+issues_melt <- melt(issues,id="date")
+ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE)
+viewMatchingTweets("2014-12-18", "issue.conservative", id_folder)
+viewMatchingTweets("2014-05-18", "issue.conservative", id_folder)
+viewMatchingTweets("2014-05-1", "issue.conservative", id_folder)
+viewMatchingTweets("2014-05-01", "issue.conservative", id_folder)
+viewMatchingTweets("2014-05-02", "issue.conservative", id_folder)
+viewMatchingTweets("2014-05-10", "issue.conservative", id_folder)
+viewMatchingTweets("2014-05-10", "issue.middleeast", id_folder)
+viewMatchingTweets("2014-05-10", "issue.iraw", id_folder)
+viewMatchingTweets("2014-05-10", "issue.iraq", id_folder)
+viewMatchingTweets("2014-08-10", "issue.iraq", id_folder)
+viewMatchingTweets("2014-11-10", "issue.iraq", id_folder)
+viewMatchingTweets("2014-12-10", "issue.iraq", id_folder)
+View(issues)
+viewMatchingTweets("2014-09-19", "issue.control", id_folder)
diff --git a/.gitignore b/.gitignore
index d6a0d18..34c589d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
tweets_complete.csv
current.txt
.RData
+matched-ids
diff --git a/issuecomp-analysis.R b/issuecomp-analysis.R
index 2e41b01..e5e7611 100644
--- a/issuecomp-analysis.R
+++ b/issuecomp-analysis.R
@@ -4,6 +4,8 @@ require(ggplot2)
require(reshape2)
require(stringr)
+source("issuecomp-functions.R")
+
# Create date range
date_start <- as.Date("2014-01-01")
date_end <- as.Date("2014-12-31")
@@ -13,6 +15,10 @@ drange <- date_start + days(0:drange)
# MATCH TWEETS ------------------------------------------------------------
+id_folder <- "matched-ids"
+unlink(id_folder, recursive = TRUE)
+dir.create(id_folder)
+
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
@@ -30,22 +36,34 @@ for(d in 1:nrow(issues)) {
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character(tweets_curday$text[t])
curtext <- str_replace_all(curtext, "#", "")
+ curtext <- str_replace_all(curtext, "$", " ")
+ curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
+ curid <- as.character(tweets_curday$id_str[t])
# Now test each single issue (not tag!)
for(i in 1:length(issuelist)) {
curtags <- as.character(issuelist[[i]])
curissue <- names(issuelist)[i]
+ curfile <- str_c(id_folder,"/",curissue,".csv")
# Now test all tags of a single issue
- for(t in 1:length(curtags)) {
- curtag <- str_c("\\W", curtags[t], "\\W")
- curchars <- nchar(curtag, type = "chars") - 4
+ for(s in 1:length(curtags)) {
+ curtag <- curtags[s]
+ curchars <- nchar(curtag, type = "chars")
+ # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+ if(curchars <= 4) {
+ curacro <- checkAcronym(string = curtag, chars = curchars)
+ } else {
+ curacro <- FALSE
+ }
+
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
- tags_found <- smartPatternMatch(curtext, curtag, curchars)
+ tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
if(tags_found == 1) {
#cat("Matched", curtag, "with", curtext,"\n")
issues[d,curissue] <- issues[d,curissue] + 1
+ write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
break
}
else {
@@ -59,44 +77,6 @@ for(d in 1:nrow(issues)) {
-# WEEKLY INTERVALS --------------------------------------------------------
-
-
-
-## Do not use days but week intervals
-
-wrange <- (as.integer(date_end - date_start) / 7)
-wrange <- floor(wrange) - 1
-wrange <- date_start + weeks(0:wrange)
-issues_week <- data.frame(week = wrange)
-issues_week[issueheads] <- 0
-
-
-for(w in 1:nrow(issues_week)) {
- curweek <- issues_week$week[w]
- currange <- curweek + days(0:6)
-
- day <- 1
-
-
- for(d in 1:nrow(issues)) {
- curday <- issues$date[d]
-
- if(curweek == curday) {
- for(c in 2:ncol(issues)) {
- curissue <- names(issues)[c]
- d2 <- d + 6
- curvalue <- sum(issues[d:d2,curissue])
- issues_week[w, curissue] <- curvalue
-
- } # /for issues columns
- } # /if day matches first day of week
-
- } # /for issues rows
-} # /for issues_week
-
-
-
# VISUALS -----------------------------------------------------------------
@@ -105,10 +85,6 @@ issues_melt <- melt(issues,id="date")
ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line(size=1)
ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE)
-# Level: weeks
-issues_week_melt <- melt(issues_week,id="week")
-ggplot(issues_week_melt,aes(x=week,y=value,colour=variable,group=variable)) + geom_line(size=1)
-ggplot(issues_week_melt,aes(x=week,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE)
# POSSIBLY USEFUL CODE ----------------------------------------------------
diff --git a/issuecomp-functions.R b/issuecomp-functions.R
index 3c915bd..c7ff670 100644
--- a/issuecomp-functions.R
+++ b/issuecomp-functions.R
@@ -26,20 +26,47 @@ convertLogical0 <- function(var) {
return(var)
}
-smartPatternMatch <- function(string, pattern, chars) {
- if(chars < 5) {
- found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = TRUE, fixed = FALSE)
+smartPatternMatch <- function(string, pattern, chars, acronym) {
+ pattern <- str_c("\\b", pattern, "\\b")
+ # Longer tags tolerate more edits: distance 0 for <= 4 chars, 1 for 5-7, 2 for >= 8; ignore.case is switched off when `acronym` is TRUE.
+ if(chars <= 4) {
+ found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
- else if(chars > 7) {
- found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
+ else if(chars >= 8) {
+ found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
}
else {
- found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = TRUE, fixed = FALSE)
+ found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
}
+# Print the text of every tweet recorded for `issue` on `date`, each followed by the tag that matched it.
+# `date`: date string as stored in the CSV (e.g. "2014-12-18"); `issue`: issue name, i.e. the CSV basename; `folder`: directory with the CSVs.
+# Reads <folder>/<issue>.csv (columns date;id;tag, no header) and looks the ids up in the global `tweets` data frame.
+viewMatchingTweets <- function(date, issue, folder) {
+  file <- str_c(folder,"/",issue,".csv")
+  # The CSV is only created once an issue has matched a tweet, so report a missing file (e.g. a mistyped issue) explicitly.
+  if(!file.exists(file)) stop("No match file for issue '", issue, "' in '", folder, "'", call. = FALSE)
+  df <- read.csv(file, sep = ";", colClasses="character", header = FALSE)
+  for(r in which(df[[1]] == date)) {
+    cat(tweets$text[tweets$id_str == df[r,2]]," - ",df[r,3],"\n")
+  }
+}
+
+
+# Report whether `string` is an acronym, i.e. contains no lower-case letters.
+# `string`: the tag to test; `chars`: its length in characters, as computed by the caller with nchar().
+# Returns TRUE when deleting all lower-case letters leaves the length unchanged, FALSE otherwise.
+# Works on its arguments only; it no longer reads the caller's globals `curtag` / `curchars`.
+checkAcronym <- function(string, chars) {
+  # Strip lower-case letters; a tag without any keeps every character.
+  upper_only <- str_replace_all(string = string, pattern = "[[:lower:]]", replacement = "")
+  # Same length as before stripping means nothing was removed.
+  nchar(upper_only, type = "chars") == chars
+}
+
## ERROR HANDLING
# Check for empty API returns (0 or 1 or 2)
diff --git a/issues.xml b/issues.xml
index c791322..31559cd 100644
--- a/issues.xml
+++ b/issues.xml
@@ -62,9 +62,9 @@
irak
- isis
- is
- kalifat
+ ISIS
+ IS
+ Kalifat