library(stringr)
library(reshape2)
library(ggplot2)
library(vars)

# HELPERS -----------------------------------------------------------------

# Shannon entropy (natural logarithm) of every row of a table of counts.
#
# counts: data frame or matrix of numeric counts, one row per observation
#         and one column per category.
# Returns an unnamed numeric vector with one entropy value per row.
# Categories with a zero count contribute nothing to the sum. A row whose
# counts sum to zero yields NaN (0 / 0) and a row containing NA yields NA.
row_entropy <- function(counts) {
  counts <- as.matrix(counts)
  if (ncol(counts) == 0L) {
    # No categories at all: every total is zero, so no share is defined.
    return(rep(NaN, nrow(counts)))
  }
  # The row totals are recycled down the columns, so each cell is divided
  # by the total of its own row.
  shares <- counts / rowSums(counts)
  terms <- shares * log(shares)
  # 0 * log(0) evaluates to NaN in R; such categories must add 0 instead.
  # which() skips NA / NaN shares, so undefined rows stay undefined.
  terms[which(shares == 0)] <- 0
  unname(-rowSums(terms))
}

# Append the row total and the row entropy of the count columns.
#
# df: data frame whose first column is skipped (presumably the date column;
#     verify against the code that builds `issues`) and whose remaining
#     columns are numeric counts.
# Returns `df` with two extra columns, `total` and `entropy`.
add_entropy_columns <- function(df) {
  counts <- df[-1]
  df$total <- rowSums(counts)
  df$entropy <- row_entropy(counts)
  df
}

# Number of tweets created on each of the given days.
#
# tweets: table of tweets with a "created_at" column that can be compared
#         to the elements of `days` with `==`.
# days:   vector of days to count tweets for.
# Returns a numeric vector aligned with `days`. Tweets whose creation date
# is NA are not counted for any day.
count_tweets_per_day <- function(tweets, days) {
  created <- tweets[, "created_at"]
  counts <- vapply(
    seq_along(days),
    # Index into `days` rather than iterating over it, so that each element
    # keeps its class for the comparison.
    function(k) sum(created == days[k], na.rm = TRUE),
    integer(1)
  )
  as.numeric(counts)
}

# Line chart of one or more daily series with a black loess trend per series.
#
# long_df: long-format data frame with the columns `date`, `variable` and
#          `value`, as returned by melt(..., id = "date").
# Returns the ggplot object.
plot_daily_series <- function(long_df) {
  ggplot(
    data = long_df,
    aes(x = date, y = value, colour = variable, group = variable)
  ) +
    geom_line() +
    geom_smooth(
      size = 1, formula = y ~ x, method = "loess", se = FALSE, color = 1
    )
}

# Create dataframes with only non-sensational (i) and sensational (s) issue
# columns. The columns to keep are selected with a positive index: a negative
# index built from an empty which() result would select no columns at all.
drop_s <- which(str_detect(names(issues), "^s"))
drop_i <- which(str_detect(names(issues), "^i"))
issues_i <- issues[, setdiff(seq_along(issues), drop_s), drop = FALSE]
issues_s <- issues[, setdiff(seq_along(issues), drop_i), drop = FALSE]

# ENTROPY -----------------------------------------------------------------

# Entropy non-sensational issues
issues_i <- add_entropy_columns(issues_i)

# Entropy sensational issues
issues_s <- add_entropy_columns(issues_s)

# Compare total tweets vs. total issue findings
stats_total <- data.frame(date = drange)
# Total number of tweets per day over time
stats_total$tpd <- count_tweets_per_day(tweets, drange)
stats_total$ipd <- issues_i$total
stats_total$spd <- issues_s$total

stats_melt <- melt(stats_total, id = "date")
g1 <- plot_daily_series(stats_melt)
g1

# Visuals for entropy in time series
# Only the entropy of the non-sensational issues is plotted here.
stats_entropy <- data.frame(date = drange)
stats_entropy$entropy <- issues_i$entropy
stats_entropy <- melt(stats_entropy, id = "date")
g1 <- plot_daily_series(stats_entropy)
g1

# SOME TESTS --------------------------------------------------------------

stats <- data.frame(date = drange)
# Total number of tweets per day over time
stats$tpd <- count_tweets_per_day(tweets, drange)

stats_melt <- melt(stats, id = "date")
g1 <- plot_daily_series(stats_melt)
g1
rm(g1)

# Show party percentage of twitter users
party_labels <- c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne")
party_colours <- c("black", "red", "purple", "green")

acc_parties <- data.frame(party = c("cducsu", "spd", "linke", "gruene"))
acc_parties$btw13 <- c(49.3, 30.6, 10.1, 10.0) # seats of party / 631 seats
# NOTE(review): 280 is presumably the total number of accounts in acc_df;
# confirm, and consider deriving it from the data instead.
# Accounts with a missing party are not counted for any party.
acc_parties$twitter <- vapply(
  as.character(acc_parties$party),
  function(party) {
    round(sum(acc_df$party == party, na.rm = TRUE) / 280 * 100)
  },
  numeric(1),
  USE.NAMES = FALSE
)

pie(
  acc_parties$btw13,
  col = party_colours,
  labels = party_labels,
  clockwise = TRUE,
  main = "Seats of parties in the parliament"
)
pie(
  acc_parties$twitter,
  col = party_colours,
  labels = party_labels,
  clockwise = TRUE,
  main = "Percentage of parties' MdBs of all Twitter accounts"
)
rm(acc_parties, party_labels, party_colours)

# VISUALS -----------------------------------------------------------------

# Level: days
issues_melt <- melt(issues, id = "date")
ggplot(
  issues_melt,
  aes(x = date, y = value, colour = variable, group = variable)
) +
  geom_line(size = 1)
ggplot(
  issues_melt,
  aes(x = date, y = value, colour = variable, group = variable)
) +
  geom_smooth(size = 1, method = "loess", formula = y ~ x, se = FALSE)

# POSSIBLY USEFUL CODE ----------------------------------------------------

# Limits of list
length(issuelist)
length(issuelist[[2]])

# Select all tweets from current day in drange
tweets_curday <- tweets[tweets[, "created_at"] == drange[5], ]

# Is the column an issue-counting column?
str_detect(names(issues[2]), "^issue")