diff --git a/issuecomp-scraping.R b/issuecomp-1-scraping.R similarity index 100% rename from issuecomp-scraping.R rename to issuecomp-1-scraping.R diff --git a/issuecomp-analysis.R b/issuecomp-2-analysis.R similarity index 67% rename from issuecomp-analysis.R rename to issuecomp-2-analysis.R index b19c48b..4033d27 100644 --- a/issuecomp-analysis.R +++ b/issuecomp-2-analysis.R @@ -55,7 +55,7 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] for(t in 1:nrow(tweets_curday)){ -# cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) + # cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE) # Select tweet's text, make it lowercase and remove hashtag indicators (#) curtext <- as.character(tweets_curday$text[t]) curtext <- str_replace_all(curtext, "#", "") @@ -67,7 +67,7 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { curissue <- issueheads[i] curtags <- as.character(issuelist[[curissue]]) curfile <- str_c(id_folder,"/",curissue,".csv") - + # Now test all tags of a single issue for(s in 1:length(curtags)) { curtag <- curtags[s] @@ -93,7 +93,7 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { } else { curdistance <- 1 } - + # Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) tags_found <- NULL # Match the tweet with each variation of tagexpand @@ -104,19 +104,19 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { curtag <- curtag[1] if(tags_found == TRUE) { -# # Raise number of findings on this day for this issue by 1 -# issues[d,curissue] <- issues[d,curissue] + 1 -# -# # Add issue and first matched tag of tweet to tweets-DF -# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] -# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") -# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] -# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") + # # Raise number of findings on this day for this issue by 1 + # issues[d,curissue] <- issues[d,curissue] + 1 + # + # # Add issue and first matched tag of tweet to tweets-DF + # oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] + # tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") + # oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] + # tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") # Add information to file for function viewPatternMatching write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE) -# cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) -# data.frame(date=curdate, issue=curissue) + # cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE) + # data.frame(date=curdate, issue=curissue) break # next issue, no more tags from same issue } else { @@ -182,60 +182,6 @@ for(r in 1:nrow(results)) { # SAVING ------------------------------------------------------------------ save(tweets, file="tweets_tagged.RData") +save(issues, file="issues.RData") - -# SOME TESTS -------------------------------------------------------------- - -stats <- data.frame(date=drange) -stats$tpd <- 0 - -# Total number of tweets per day over time -for(r in 1:length(drange)) { - stats$tpd[r] <- length(tweets[tweets[, "created_at"] == drange[r], "id_str"]) -} - -stats_melt <- melt(stats, id="date") -g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour=variable, group=variable)) + - geom_line() + - geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color=1) -g1 - -rm(g1, r) - - -# Show party percentage of twitter users -acc_parties <- data.frame(party = c("cducsu", "spd", "linke", "gruene")) -acc_parties$btw13 <- c(49.3, 30.6, 10.1, 10.0) # seats of party / 631 seats -acc_parties$twitter <- 0 -for(p in 1:nrow(acc_parties)) { - acc_parties$twitter[p] <- round(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100) -} -pie(acc_parties$btw13, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = T, - main = "Seats of parties in the parliament") -pie(acc_parties$twitter, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = T, - main = "Percentage of parties' MdBs of all Twitter accounts") - -rm(acc_parties, p) - - -# VISUALS ----------------------------------------------------------------- - - -# Level: days -issues_melt <- melt(issues,id="date") -ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line(size=1) -ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE) - - - -# POSSIBLY USEFUL CODE ---------------------------------------------------- - -# Limits of list -length(issuelist) -length(issuelist[[2]]) - -# Select all tweets from current day in drange -tweets_curday <- tweets[tweets[, "created_at"] == drange[5], ] -# Is column a issue counting column? -str_detect(names(issues[2]), "^issue") \ No newline at end of file diff --git a/issuecomp-3-calc.R b/issuecomp-3-calc.R new file mode 100644 index 0000000..c6535ed --- /dev/null +++ b/issuecomp-3-calc.R @@ -0,0 +1,130 @@ +require(stringr) +require(reshape2) +require(ggplot2) +require(vars) + +# Create dataframes with only non-sensational (i) and sensational (s) issue columns +drop_s <- which(str_detect(names(issues), "^s")) +drop_i <- which(str_detect(names(issues), "^i")) +issues_i <- issues[,-drop_s] +issues_s <- issues[,-drop_i] + +# # +# ENTROPY +# # +# Entropy non-sensational issues +issues_i$total <- rowSums(issues_i[2:ncol(issues_i)]) +issues_i$entropy <- 0 + +for(r in 1:nrow(issues_i)) { + curtotal <- as.numeric(issues_i$total[r]) + curp <- 0 + for(c in 2:ncol(issues_i)) { + curcount <- as.numeric(issues_i[r,c]) + curp[c] <- curcount / curtotal + } + curp <- curp [2:length(curp)-2] + curdrop <- which(curp==0) + curp <- curp[-curdrop] + issues_i$entropy[r] <- sum(-1 * curp * log(curp)) +} + +# Entropy sensational issues +issues_s$total <- rowSums(issues_s[2:ncol(issues_s)]) +issues_s$entropy <- 0 + +for(r in 1:nrow(issues_s)) { + curtotal <- as.numeric(issues_s$total[r]) + curp <- 0 + for(c in 2:ncol(issues_s)) { + curcount <- as.numeric(issues_s[r,c]) + curp[c] <- curcount / curtotal + } + curp <- curp [2:length(curp)-2] + curdrop <- which(curp==0) + curp <- curp[-curdrop] + issues_s$entropy[r] <- sum(-1 * curp * log(curp)) +} + + +# Compare total tweets vs. total issue findings +stats_total <- data.frame(date=drange) +stats_total$tpd <- 0 +stats_total$ipd <- issues_i$total +stats_total$spd <- issues_s$total +# Total number of tweets per day over time +for(r in 1:length(drange)) { + stats_total$tpd[r] <- length(tweets[tweets[, "created_at"] == drange[r], "id_str"]) +} + +stats_melt <- melt(stats_total, id="date") +g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour=variable, group=variable)) + + geom_line()+ + geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color=1) +g1 + +# Visuals for entropy in time series +stats_entropy <- data.frame(date=drange) +stats_entropy$entropy <- issues_i$entropy + +stats_entropy <- melt(stats_entropy, id="date") + +g1 <- ggplot(data = stats_entropy, aes(x=date,y=value,colour=variable, group=variable)) + + geom_line() + + geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color=1) +g1 + +# SOME TESTS -------------------------------------------------------------- + +stats <- data.frame(date=drange) +stats$tpd <- 0 + +# Total number of tweets per day over time +for(r in 1:length(drange)) { + stats$tpd[r] <- length(tweets[tweets[, "created_at"] == drange[r], "id_str"]) +} + +stats_melt <- melt(stats, id="date") +g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour=variable, group=variable)) + + geom_line() + + geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color=1) +g1 + +rm(g1, r) + + +# Show party percentage of twitter users +acc_parties <- data.frame(party = c("cducsu", "spd", "linke", "gruene")) +acc_parties$btw13 <- c(49.3, 30.6, 10.1, 10.0) # seats of party / 631 seats +acc_parties$twitter <- 0 +for(p in 1:nrow(acc_parties)) { + acc_parties$twitter[p] <- round(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100) +} +pie(acc_parties$btw13, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = T, + main = "Seats of parties in the parliament") +pie(acc_parties$twitter, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = T, + main = "Percentage of parties' MdBs of all Twitter accounts") + +rm(acc_parties, p) + + +# VISUALS ----------------------------------------------------------------- + + +# Level: days +issues_melt <- melt(issues,id="date") +ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line(size=1) +ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE) + + + +# POSSIBLY USEFUL CODE ---------------------------------------------------- + +# Limits of list +length(issuelist) +length(issuelist[[2]]) + +# Select all tweets from current day in drange +tweets_curday <- tweets[tweets[, "created_at"] == drange[5], ] +# Is column a issue counting column? +str_detect(names(issues[2]), "^issue") \ No newline at end of file diff --git a/issues.RData b/issues.RData index 309866c..41909d3 100644 Binary files a/issues.RData and b/issues.RData differ