diff --git a/.Rhistory b/.Rhistory index 489ff74..1de9c10 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,512 +1,512 @@ -cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") -#break +cat("Positive in", curissue,"\n") } -tweets_full$id_str[nrow(tweets_full)] -tweets_temp$id_str[nrow(tweets_temp)] -tweets_complete$user[20674] -a -current -tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character()) -for(a in 1:nrow(acc_df)) { -user <- as.character(acc_df$screenname[a]) -name <- as.character(acc_df$name[a]) -max_id <- "999999999999999999" -loop <- 1 -error <- 0 -repeat { -# Define specific search query -query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false", -screen_name=user, -count=max_count, -max_id=max_id); -# At first, work with an temporary tweet-DB -current <- twitter_api_call(api_url, query, api_params) -rm(tweets_temp) -tweets_temp <- fromJSON(correctJSON(current)) -## START ERROR HANDLING ## -# Empty API output -status <- errorEmptyAPI(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Contains "error" column -status <- errorEmptyAPI(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Check if error code exists -code <- errorCheckCode(tweets_temp) # 0 if no error -if(code == 34) { # page does not exist -status <- errorCode34 -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} +} # /for issuelist +} # /for tweets_curday +} # /for drange +d +for(d in 1:length(drange)) { +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == drange[d], ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- tolower(as.character(tweets_curday$text[t])) +curtext <- str_replace_all(text, "#", "") +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +tags_found <- str_detect(curtext, sprintf("%s", tags)) +tags_found <- any(tags_found) +if(tags_found) { +cat("Positive in", curissue,"from",drange[d],"\n") } -if(code == 88) { # rate limit exceeded -wait <- errorCode88() -Sys.sleep(wait) -next +} # /for issuelist +} # /for tweets_curday +} # /for drange +View(tweets_curday) +tags_found +drange[d] +as.character(drange[d]) +cat(as.character(drange[d])) +cat(as.character(drange[d]) +cat(drange[d]) +for(d in 1:length(drange)) { +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == drange[d], ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- tolower(as.character(tweets_curday$text[t])) +curtext <- str_replace_all(curtext, "#", "") +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +tags_found <- str_detect(curtext, sprintf("%s", tags)) +tags_found <- any(tags_found) +if(tags_found) { +cat("Positive in", curissue,"from",as.character(drange[d]),"\n") } -## END ERROR HANDLING ## -# Delete unnecessary columns and add username and real name to dataframe -tweets_temp <- tweets_temp[keep] -tweets_temp <- cbind(user=user, name=name, tweets_temp) -# Now sleep 3 second to dodge 300queries/15min limit -cat("[",a,"/",nrow(acc_df),"] ", sep = "") -cat("User: ",user," in loop: ",loop,". \n", sep = "") -Sys.sleep(2) -if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) { -cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") -#break -} -## Last loop is reached. Now clear the data frame -# Is the last tweet in tweets_temp from 2013? -status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$") -if (!status) { # Starting when tweet not from 2014 -# Delete all tweets other than from 2014 -old <- 0 -for(r in 1:nrow(tweets_temp)) { -status <- str_detect(tweets_temp$created_at[r], "2014$") -if(is.na(status)) { -#status <- FALSE -cat("[INFO] NA-Status in Tweet", r) -} -if(!status) { # Starting when tweet not from 2014 -old <- old + 1 +} # /for issuelist +} # /for tweets_curday +} # /for drange +View(tweets_curday) +curtext +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +tags_found <- str_detect(curtext, sprintf("%s", tags)) +tags_found <- any(tags_found) +if(tags_found) { +cat("Positive in", curissue,"from",as.character(drange[d]),"\n") } } -if(old > 0) { -old <- old - 1 -# If even the first entry isn't from 2014, we have to set "old" manually because of a bug -status <- str_detect(tweets_temp$created_at[1], "2014$") -if(!status) { -old <- nrow(tweets_temp) -cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") +tags_found +for(d in 1:length(drange)) { +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == drange[d], ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- tolower(as.character(tweets_curday$text[t])) +curtext <- str_replace_all(curtext, "#", "") +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +tags_found <- str_detect(curtext, sprintf("%s", tags)) +tags_found <- any(tags_found) +if(tags_found) { +cat("Positive in", curissue,"from",as.character(drange[d]),"\n") } -# delete all lines which are older than 2014 -tweets_temp <- head(tweets_temp, -old) -} -rm(old) -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -break # End loop because 2013 is reached -} -# The last tweet is still from 2014, so we need another loop else { -# Setting max_id to gather next 200 tweets -max_id <- tweets_temp$id_str[nrow(tweets_temp)] -loop <- loop + 1 # just for stats -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) +cat("Nothing found\n") } -} # /repeat -tweets_complete <- insertRow(tweets_complete, tweets_full) -tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full -cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n") -write.csv(tweets_complete, "tweets_complete.csv") -# Every tweet from 2014 from user[r] is downloaded. Now next user in for-loop +} # /for issuelist +} # /for tweets_curday +} # /for drange +View(issues) +curissue +issues[1,2] +issues[1,] +issues[1,curissue] +issues[2,curissue] +issues[t,curissue] +drange[d] +issues$date[d] +for(d in 1:nrow(issues)) { +curdate <- issues$date[d] +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- tolower(as.character(tweets_curday$text[t])) +curtext <- str_replace_all(curtext, "#", "") +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +tags_found <- str_detect(curtext, sprintf("%s", tags)) +tags_found <- any(tags_found) +if(tags_found) { +#cat("Positive in", curissue,"from",as.character(drange[d]),"\n") +issues[d,curissue] <- issues[d,curissue] + 1 } -tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character()) -for(a in 66:nrow(acc_df)) { -user <- as.character(acc_df$screenname[a]) -name <- as.character(acc_df$name[a]) -max_id <- "999999999999999999" -loop <- 1 -error <- 0 -repeat { -# Define specific search query -query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false", -screen_name=user, -count=max_count, -max_id=max_id); -# At first, work with an temporary tweet-DB -current <- twitter_api_call(api_url, query, api_params) -rm(tweets_temp) -tweets_temp <- fromJSON(correctJSON(current)) -## START ERROR HANDLING ## -# Empty API output -status <- errorEmptyAPI(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Contains "error" column -status <- errorEmptyAPI(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Check if error code exists -code <- errorCheckCode(tweets_temp) # 0 if no error -if(code == 34) { # page does not exist -status <- errorCode34 -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -} -if(code == 88) { # rate limit exceeded -wait <- errorCode88() -Sys.sleep(wait) -next -} -## END ERROR HANDLING ## -# Delete unnecessary columns and add username and real name to dataframe -tweets_temp <- tweets_temp[keep] -tweets_temp <- cbind(user=user, name=name, tweets_temp) -# Now sleep 3 second to dodge 300queries/15min limit -cat("[",a,"/",nrow(acc_df),"] ", sep = "") -cat("User: ",user," in loop: ",loop,". \n", sep = "") -Sys.sleep(2) -if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) { -cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") -#break -} -## Last loop is reached. Now clear the data frame -# Is the last tweet in tweets_temp from 2013? -status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$") -if (!status) { # Starting when tweet not from 2014 -# Delete all tweets other than from 2014 -old <- 0 -for(r in 1:nrow(tweets_temp)) { -status <- str_detect(tweets_temp$created_at[r], "2014$") -if(is.na(status)) { -#status <- FALSE -cat("[INFO] NA-Status in Tweet", r) -} -if(!status) { # Starting when tweet not from 2014 -old <- old + 1 -} -} -if(old > 0) { -old <- old - 1 -# If even the first entry isn't from 2014, we have to set "old" manually because of a bug -status <- str_detect(tweets_temp$created_at[1], "2014$") -if(!status) { -old <- nrow(tweets_temp) -cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") -} -# delete all lines which are older than 2014 -tweets_temp <- head(tweets_temp, -old) -} -rm(old) -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -break # End loop because 2013 is reached -} -# The last tweet is still from 2014, so we need another loop else { -# Setting max_id to gather next 200 tweets -max_id <- tweets_temp$id_str[nrow(tweets_temp)] -loop <- loop + 1 # just for stats -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) +#cat("Nothing found\n") } -} # /repeat -tweets_complete <- insertRow(tweets_complete, tweets_full) -tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full -cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n") -write.csv(tweets_complete, "tweets_complete.csv") -# Every tweet from 2014 from user[r] is downloaded. Now next user in for-loop +} # /for issuelist +} # /for tweets_curday +} # /for drange +View(issues) +tags_found +curtags +curissue +curtext +curdate +tags +issues$issue.edathy <- 0 +issues$issue.ttip <- 0 +View(issues) +for(d in 1:nrow(issues)) { +curdate <- issues$date[d] +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- tolower(as.character(tweets_curday$text[t])) +curtext <- str_replace_all(curtext, "#", "") +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +tags_found <- str_detect(curtext, sprintf("%s", curtags)) +tags_found <- any(tags_found) +if(tags_found) { +#cat("Positive in", curissue,"from",as.character(drange[d]),"\n") +issues[d,curissue] <- issues[d,curissue] + 1 } -a -tweets_complete$user[22982] -tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character()) -for(a in 68:nrow(acc_df)) { -user <- as.character(acc_df$screenname[a]) -name <- as.character(acc_df$name[a]) -max_id <- "999999999999999999" -loop <- 1 -error <- 0 -repeat { -# Define specific search query -query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false", -screen_name=user, -count=max_count, -max_id=max_id); -# At first, work with an temporary tweet-DB -current <- twitter_api_call(api_url, query, api_params) -rm(tweets_temp) -tweets_temp <- fromJSON(correctJSON(current)) -## START ERROR HANDLING ## -# Empty API output -status <- errorEmptyAPI(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Contains "error" column -status <- errorEmptyAPI(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Check if error code exists -code <- errorCheckCode(tweets_temp) # 0 if no error -if(code == 34) { # page does not exist -status <- errorCode34 -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} +else { +#cat("Nothing found\n") } -if(code == 88) { # rate limit exceeded -wait <- errorCode88() -Sys.sleep(wait) -next +} # /for issuelist +} # /for tweets_curday +} # /for drange +View(issues) +View(tweets_curday) +write.csv(tweets_curday, "tweets_curday.csv") +curtags +curtext <- "Dies ist ein toller Text zum Testen mit Spacko" +curtags <- c("toller", "testen", "pack") +str_detect(curtext, sprintf("%s", curtags)) +curtags +str_c("", curtags, "") +str_c(" ", curtags, " ") +curtags <- str_c(" ", curtags, " ") +str_detect(curtext, sprintf("%s", curtags)) +curtags <- c("toller", "testen", "pack") +curtext <- tolower(curtext) +str_detect(curtext, sprintf("%s", curtags)) +curtext +curtext <- "ein toller text testen(haha) spacko" +"bla" +str_detect(curtext, sprintf("%s", curtags)) +str_detect(curtext, "\\Wtesten\\W") +str_detect(curtext, "\\Wtesten\\w") +str_detect(curtext, "\\Wtesten\\W") +str_detect(curtext, "\\Wpack\\W") +str_detect(curtext, "\\Wpacko\\W") +curtags <- str_c("\\W", curtags, "\\W") +str_detect(curtext, sprintf("%s", curtags)) +curtags +curtext <- "ein toller text testen-mit spacko" +str_detect(curtext, sprintf("%s", curtags)) +curtags +issues$issue.edathy <- 0 +issues$issue.ttip <- 0 +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +View(issues) +for(d in 1:nrow(issues)) { +curdate <- issues$date[d] +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- tolower(as.character(tweets_curday$text[t])) +curtext <- str_replace_all(curtext, "#", "") +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +curtags <- str_c("\\W", curtags, "\\W") +tags_found <- str_detect(curtext, sprintf("%s", curtags)) +tags_found <- any(tags_found) +if(tags_found) { +#cat("Positive in", curissue,"from",as.character(drange[d]),"\n") +issues[d,curissue] <- issues[d,curissue] + 1 } -## END ERROR HANDLING ## -# Delete unnecessary columns and add username and real name to dataframe -tweets_temp <- tweets_temp[keep] -tweets_temp <- cbind(user=user, name=name, tweets_temp) -# Now sleep 3 second to dodge 300queries/15min limit -cat("[",a,"/",nrow(acc_df),"] ", sep = "") -cat("User: ",user," in loop: ",loop,". \n", sep = "") -Sys.sleep(2) -if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) { -cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") +else { +#cat("Nothing found\n") +} +} # /for issuelist +} # /for tweets_curday +} # /for drange +View(issues) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +for(d in 1:nrow(issues)) { +curdate <- issues$date[d] +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- tolower(as.character(tweets_curday$text[t])) +curtext <- str_replace_all(curtext, "#", "") +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +curtags <- str_c("\\W", curtags, "\\W") +tags_found <- str_detect(curtext, sprintf("%s", curtags)) +tags_found <- any(tags_found) +if(tags_found) { +#cat("Positive in", curissue,"from",as.character(drange[d]),"\n") +issues[d,curissue] <- issues[d,curissue] + 1 +} +else { +#cat("Nothing found\n") +} +} # /for issuelist +} # /for tweets_curday +} # /for drange +View(issues) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +View(issues) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +View(issues) +issues <- data.frame(date = drange) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +View(issues) +for(d in 1:nrow(issues)) { +curdate <- issues$date[d] +cat(as.character(curdate),"\n") +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- tolower(as.character(tweets_curday$text[t])) +curtext <- str_replace_all(curtext, "#", "") +for(i in 1:length(issuelist)) { +curtags <- as.character(issuelist[[i]]) +curissue <- names(issuelist)[i] +curtags <- str_c("\\W", curtags, "\\W") +tags_found <- str_detect(curtext, sprintf("%s", curtags)) +tags_found <- any(tags_found) +if(tags_found) { +#cat("Positive in", curissue,"from",as.character(drange[d]),"\n") +issues[d,curissue] <- issues[d,curissue] + 1 +} +else { +#cat("Nothing found\n") +} +} # /for issuelist +} # /for tweets_curday +} # /for drange +View(issues) +plot(x = issues$date, y=issues$issue.ttip) +plot(x = issues$date, y=issues$issue.ttip, type="l") +test <- c("issues$issue.ttip", "issues$issue.nsa") +plot(x = issues$date, y=test, type="l") +test +melt +library(ggplot2) +library(reshape2) +df <- melt(issues,id="date") +View(df) +ggplot(df,aes(x=Year,y=value,colour=variable,group=variable)) + geom_line() +ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_line() +ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth() +ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_point() +ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,colour="red",method="loess", se=FALSE) +ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess", se=FALSE) +ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_line() +ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="lm", se=FALSE) +ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="gam", se=FALSE) +ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="gam",formula = y ~ s(x, k = 3), se=FALSE) +ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="gam",formula = y ~ x, se=FALSE) +ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE) +ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=3,method="loess",formula = y ~ x, se=FALSE) +ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=0.5,method="loess",formula = y ~ x, se=FALSE) +opts(legend.position = "none") +theme(legend.position = "none") +View(issues) +require(lubridate) +require(XML) +require(ggplot2) +require(reshape2) +date_start +save(issues, "issues.RData") +save(issues, file = "issues.RData") +weeks(1) +weeks(354) +drange +date_start + weeks(0:7) +date_start <- as.Date("2014-01-01") +date_end <- as.Date("2014-12-01") +drange <- as.integer(date_end - date_start) +drange / 7 +round(drange/7) +round(drange/7,0) +signif(drange/7) +signif(drange/7, 0) +issues_bak <- issues +View(df) +issues_melt <- melt(issues,id="date") +ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line() +ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=0.5,method="loess",formula = y ~ x, se=FALSE) +drange <- as.integer(date_end - date_start) +d +wrange <- as.integer(date_end - date_start) +wrange +wrange <- (as.integer(date_end - date_start) / 7) +wrange +format(round(wrange, 1), nsmall = 1). +format(round(wrange, 1), nsmall = 1) +sprintf("%.1f",wrange) +sprintf("%f",wrange) +sprintf("%.1",wrange) +sprintf("%1f",wrange) +sprintf("%.0f",wrange) +floor(wrange) +wrange <- (as.integer(date_end - date_start) / 7) +wrange <- floor(wrange) +wrange <- date_start + weeks(0:wdrange) +wrange <- date_start + weeks(0:wrange) +wrange +wrange[3] +wrange[3] + 1800 +wrange[3] + 1 +wrange[3] + 7 +View(issues) +issues$dates[3] - wrange [2] +issues$dates[3] - wrange[2] +wrange +class(wrange[2]) +class(issues$dates[3]) +class(issues$date[3]) +issues$date[3] - wrange[2] +issues$date[1] +issues$date[2] +wrange[1] +wrange[2] +wrange[1:2] +days(wrange[1:2]) +ddays(wrange[1:2]) +days(1:2) +wrange[1] + days(0:6) +wrange +issues_week <- data.frame(week = wrange) +View(issues_week) +wrange <- floor(wrange) +wrange <- (as.integer(date_end - date_start) / 7) +wrange <- floor(wrange) +wrange +wrange <- date_start + weeks(0:wrange) +issues_week <- data.frame(week = wrange) +View(issues_week) +wrange <- (as.integer(date_end - date_start) / 7) +wrange <- floor(wrange) - 1 +wrange <- date_start + weeks(0:wrange) +issues_week <- data.frame(week = wrange) +View(issues_week) +wrange[1] + days(0:6) +issues_week$week[2] +issues_week$week[2] + 1 +currange <- issues_week$week[w] + days(0:6) +w <- 1 +currange <- issues_week$week[w] + days(0:6) +currange +currange[7] +str_detect(names(issues), "^issue") +str_extract(names(issues), "^issue\.+") +str_extract(names(issues), "^issue\\.+") +str_extract(names(issues), "^issue\\..+") +issueheads +issues[issueheads[2],] +issues[,issueheads[2]] +issues[,issueheads[1]] +View(issues_week) +View(issues) +issues[issues[, "date"] == wrange[1]] +issues[issues[, "date"] == wrange[1], ] +issues[issues[, "date"] == wrange[3], ] +issues[issues[, "date"] == currange, ] +currange +issues[issues[, "date"] = currange, ] +issues[issues[, "date"] == currange, ] +View(issues) +warning() +issues[issues[, "date"] == currange, ] +warning() +issues[issues[, "date"] == sprintf("%s", currange), ] +issues[issues[, "date"] == 2014-01-02, ] +issues[issues[, "date"] == "2014-01-02", ] +issues[issues[, "date"] == "2014-01-03", ] +issues[issues[, "date"] == "2014-01-07", ] +issues[issues[, "date"] == "2014-01-08", ] +issues[issues[, "date"] == "2014-01-01:2014-01-08", ] +issues[issues[, "date"] == "2014-01-01:2014-01-07", ] +test <-issues[issues[, "date"] == currange, ] +test +sum(testz) +sum(test) +curweek <- issues_week$week[w] +currange <- curweek + days(0:6) +curweek +currange +d=1 +curday <- issues$date[d] +curday +names(issues)[2] +curvalue <- issues[d,c] +d +c +c <- 2 +curvalue <- issues[d,c] +vurv +curvalue +c=56 +curvalue +curvalue <- issues[d,c] +curvalue +c +d +View(issues) +issues[2,7] +issues[7,2] +issues[7,3] +curissue +curissue <- names(issues)[c] +c +c = 7 +curissue <- names(issues)[c] +curvalue <- issues[d,c] +curvalue +issues[d,curissue] +issues[d,7] +issues[d,curissue] +curissue +d +issues[d:d+6,curissue] +issues[(d:d+6),curissue] +d2 <- d+6 +d2 +issues[(d:d2),curissue] +issues[d:d2,curissue] +sum(issues[d:d2,curissue]) +issues_week[issueheads] <- 0 +View(issues_week) +issues_week[w,curissue] +View(issues) +for(w in 1:nrow(issues_week)) { +curweek <- issues_week$week[w] +currange <- curweek + days(0:6) +day <- 1 +for(d in 1:nrow(issues)) { +curday <- issues$date[d] +if(curweek == curday) { +for(c in 2:ncol(issues)) { +curissue <- names(issues)[c] +d2 <- d + 6 +curvalue <- sum(issues[d:d2,curissue]) +issues_week[w, curissue] <- curvalue +} # /for issues columns +} # /if day matches first day of week break -} -## Last loop is reached. Now clear the data frame -# Is the last tweet in tweets_temp from 2013? -status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$") -if (!status) { # Starting when tweet not from 2014 -# Delete all tweets other than from 2014 -old <- 0 -for(r in 1:nrow(tweets_temp)) { -status <- str_detect(tweets_temp$created_at[r], "2014$") -if(is.na(status)) { -#status <- FALSE -cat("[INFO] NA-Status in Tweet", r) -} -if(!status) { # Starting when tweet not from 2014 -old <- old + 1 -} -} -if(old > 0) { -old <- old - 1 -# If even the first entry isn't from 2014, we have to set "old" manually because of a bug -status <- str_detect(tweets_temp$created_at[1], "2014$") -if(!status) { -old <- nrow(tweets_temp) -cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") -} -# delete all lines which are older than 2014 -tweets_temp <- head(tweets_temp, -old) -} -rm(old) -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -break # End loop because 2013 is reached -} -# The last tweet is still from 2014, so we need another loop -else { -# Setting max_id to gather next 200 tweets -max_id <- tweets_temp$id_str[nrow(tweets_temp)] -loop <- loop + 1 # just for stats -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -} -} # /repeat -tweets_complete <- insertRow(tweets_complete, tweets_full) -tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full -cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n") -write.csv(tweets_complete, "tweets_complete.csv") -# Every tweet from 2014 from user[r] is downloaded. Now next user in for-loop -} -status -current -tweets_temp -status -fromJSON(current) -tweets_temp <- fromJSON(correctJSON(current)) -tweets_temp -status <- errorErrorColumn(tweets_temp) -a -View(acc_df) -tweets_complete$user[32539] -tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character()) -for(a in 94:nrow(acc_df)) { -user <- as.character(acc_df$screenname[a]) -name <- as.character(acc_df$name[a]) -max_id <- "999999999999999999" -loop <- 1 -error <- 0 -repeat { -# Define specific search query -query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false", -screen_name=user, -count=max_count, -max_id=max_id); -# At first, work with an temporary tweet-DB -current <- twitter_api_call(api_url, query, api_params) -rm(tweets_temp) -tweets_temp <- fromJSON(correctJSON(current)) -## START ERROR HANDLING ## -# Empty API output -status <- errorEmptyAPI(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Contains "error" column -status <- errorErrorColumn(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Check if error code exists -code <- errorCheckCode(tweets_temp) # 0 if no error -if(code == 34) { # page does not exist -status <- errorCode34 -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -} -if(code == 88) { # rate limit exceeded -wait <- errorCode88() -Sys.sleep(wait) -next -} -## END ERROR HANDLING ## -# Delete unnecessary columns and add username and real name to dataframe -tweets_temp <- tweets_temp[keep] -tweets_temp <- cbind(user=user, name=name, tweets_temp) -# Now sleep 3 second to dodge 300queries/15min limit -cat("[",a,"/",nrow(acc_df),"] ", sep = "") -cat("User: ",user," in loop: ",loop,". \n", sep = "") -Sys.sleep(2) -if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) { -cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") -break -} -## Last loop is reached. Now clear the data frame -# Is the last tweet in tweets_temp from 2013? -status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$") -if (!status) { # Starting when tweet not from 2014 -# Delete all tweets other than from 2014 -old <- 0 -for(r in 1:nrow(tweets_temp)) { -status <- str_detect(tweets_temp$created_at[r], "2014$") -if(is.na(status)) { -#status <- FALSE -cat("[INFO] NA-Status in Tweet", r) -} -if(!status) { # Starting when tweet not from 2014 -old <- old + 1 -} -} -if(old > 0) { -old <- old - 1 -# If even the first entry isn't from 2014, we have to set "old" manually because of a bug -status <- str_detect(tweets_temp$created_at[1], "2014$") -if(!status) { -old <- nrow(tweets_temp) -cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") -} -# delete all lines which are older than 2014 -tweets_temp <- head(tweets_temp, -old) -} -rm(old) -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -break # End loop because 2013 is reached -} -# The last tweet is still from 2014, so we need another loop -else { -# Setting max_id to gather next 200 tweets -max_id <- tweets_temp$id_str[nrow(tweets_temp)] -loop <- loop + 1 # just for stats -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -} -} # /repeat -tweets_complete <- insertRow(tweets_complete, tweets_full) -tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full -cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n") -write.csv(tweets_complete, "tweets_complete.csv") -# Every tweet from 2014 from user[r] is downloaded. Now next user in for-loop -} -status -tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character()) -a -for(a in 346:nrow(acc_df)) { -user <- as.character(acc_df$screenname[a]) -name <- as.character(acc_df$name[a]) -max_id <- "999999999999999999" -loop <- 1 -error <- 0 -repeat { -# Define specific search query -query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false", -screen_name=user, -count=max_count, -max_id=max_id); -# At first, work with an temporary tweet-DB -current <- twitter_api_call(api_url, query, api_params) -rm(tweets_temp) -tweets_temp <- fromJSON(correctJSON(current)) -## START ERROR HANDLING ## -# Empty API output -status <- errorEmptyAPI(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Contains "error" column -status <- errorErrorColumn(tweets_temp) -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -# Check if error code exists -code <- errorCheckCode(tweets_temp) # 0 if no error -if(code == 34) { # page does not exist -status <- errorCode34() -if(status == 1) { Sys.sleep(3);error <- error + 1;next} -if(status == 2) {break} -} -if(code == 88) { # rate limit exceeded -wait <- errorCode88() -Sys.sleep(wait) -next -} -## END ERROR HANDLING ## -# Delete unnecessary columns and add username and real name to dataframe -tweets_temp <- tweets_temp[keep] -tweets_temp <- cbind(user=user, name=name, tweets_temp) -# Now sleep 3 second to dodge 300queries/15min limit -cat("[",a,"/",nrow(acc_df),"] ", sep = "") -cat("User: ",user," in loop: ",loop,". \n", sep = "") -Sys.sleep(2) -if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) { -cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") -break -} -## Last loop is reached. Now clear the data frame -# Is the last tweet in tweets_temp from 2013? -status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$") -if (!status) { # Starting when tweet not from 2014 -# Delete all tweets other than from 2014 -old <- 0 -for(r in 1:nrow(tweets_temp)) { -status <- str_detect(tweets_temp$created_at[r], "2014$") -if(is.na(status)) { -#status <- FALSE -cat("[INFO] NA-Status in Tweet", r) -} -if(!status) { # Starting when tweet not from 2014 -old <- old + 1 -} -} -if(old > 0) { -old <- old - 1 -# If even the first entry isn't from 2014, we have to set "old" manually because of a bug -status <- str_detect(tweets_temp$created_at[1], "2014$") -if(!status) { -old <- nrow(tweets_temp) -cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") -} -# delete all lines which are older than 2014 -tweets_temp <- head(tweets_temp, -old) -} -rm(old) -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -break # End loop because 2013 is reached -} -# The last tweet is still from 2014, so we need another loop -else { -# Setting max_id to gather next 200 tweets -max_id <- tweets_temp$id_str[nrow(tweets_temp)] -loop <- loop + 1 # just for stats -tweets_full <- insertRow(tweets_full, tweets_temp) -#rm(tweets_temp) -} -} # /repeat -tweets_complete <- insertRow(tweets_complete, tweets_full) -tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full -cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n") -write.csv(tweets_complete, "tweets_complete.csv") -# Every tweet from 2014 from user[a] is downloaded. Now next user in for-loop -} -save(tweets_complete, file="tweets_complete.RData") -tweets_complete$id_str[146982] -class(tweets_complete$id_str[146982]) -tweets_complete$id_str[1] +} # /for issues rows +} # /for issues_week +View(issues_week) +for(w in 1:nrow(issues_week)) { +curweek <- issues_week$week[w] +currange <- curweek + days(0:6) +day <- 1 +for(d in 1:nrow(issues)) { +curday <- issues$date[d] +if(curweek == curday) { +for(c in 2:ncol(issues)) { +curissue <- names(issues)[c] +d2 <- d + 6 +curvalue <- sum(issues[d:d2,curissue]) +issues_week[w, curissue] <- curvalue +} # /for issues columns +} # /if day matches first day of week +} # /for issues rows +} # /for issues_week +View(issues_week) +View(issues) +View(issues_week) +View(issues) +View(issues_week) +issues_week_melt <- melt(issues_week,id="week") +ggplot(issues_week_melt,aes(x=week,y=value,colour=variable,group=variable)) + geom_line() +ggplot(issues_week_melt,aes(x=week,y=value,colour=variable,group=variable)) + geom_smooth(size=0.5,method="loess",formula = y ~ x, se=FALSE) diff --git a/issuecomp.R b/issuecomp.R index 6e963b8..355b5ac 100644 --- a/issuecomp.R +++ b/issuecomp.R @@ -1,10 +1,13 @@ require(lubridate) +require(XML) +require(ggplot2) +require(reshape2) # Create date range date_start <- as.Date("2014-01-01") date_end <- as.Date("2014-12-01") drange <- as.integer(date_end - date_start) -drange <- date_start + days(0:d) +drange <- date_start + days(0:drange) issues <- data.frame(date = drange) issuelist <- xmlToList("issues.xml") @@ -42,13 +45,53 @@ for(d in 1:nrow(issues)) { } # /for drange +## Do not use days but week intervals + +wrange <- (as.integer(date_end - date_start) / 7) +wrange <- floor(wrange) - 1 +wrange <- date_start + weeks(0:wrange) +issues_week <- data.frame(week = wrange) +issues_week[issueheads] <- 0 + + +for(w in 1:nrow(issues_week)) { + curweek <- issues_week$week[w] + currange <- curweek + days(0:6) + + day <- 1 + + + for(d in 1:nrow(issues)) { + curday <- issues$date[d] + + if(curweek == curday) { + for(c in 2:ncol(issues)) { + curissue <- names(issues)[c] + d2 <- d + 6 + curvalue <- sum(issues[d:d2,curissue]) + issues_week[w, curissue] <- curvalue + + } # /for issues columns + } # /if day matches first day of week + + } # /for issues rows +} # /for issues_week + + + + # VISUALS ----------------------------------------------------------------- -library(ggplot2) -library(reshape2) -df <- melt(issues,id="date") -ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_line() -ggplot(df,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=0.5,method="loess",formula = y ~ x, se=FALSE) +# Level: days +issues_melt <- melt(issues,id="date") +ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line() +ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=0.5,method="loess",formula = y ~ x, se=FALSE) + +# Level: weeks +issues_week_melt <- melt(issues_week,id="week") +ggplot(issues_week_melt,aes(x=week,y=value,colour=variable,group=variable)) + geom_line() +ggplot(issues_week_melt,aes(x=week,y=value,colour=variable,group=variable)) + geom_smooth(size=0.5,method="loess",formula = y ~ x, se=FALSE) + # POSSIBLY USEFUL CODE ----------------------------------------------------