better year handling, removed early ssh pfadiskn clean, added better twitter-acc-list

This commit is contained in:
2015-01-10 01:50:01 +01:00
parent 34807191b9
commit b85be742d4
11 changed files with 2778 additions and 577 deletions
+62 -35
View File
@@ -10,10 +10,22 @@ setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
source("functions.R")
acc_url <- "http://www.bundestwitter.de/api/politiker"
acc_df <- fromJSON(acc_url)
#acc_url <- "http://www.bundestwitter.de/api/politiker"
#acc_df <- fromJSON(acc_url)
acc_df <- read.csv("politiker2.csv")
# Drop every row of acc_df that has no usable Twitter account, i.e. whose
# twitter_acc is empty or NA.
# A logical mask is used instead of negative row indices on purpose:
# with no rows to delete the index would be NULL, and acc_df[-NULL, ]
# selects zero rows, silently emptying the whole table.
has_acc <- as.character(acc_df$twitter_acc)
has_acc <- !is.na(has_acc) & nzchar(has_acc)
acc_df <- acc_df[has_acc, , drop = FALSE]
rm(has_acc)
# Remove a column literally named "row.names" (if present) and renumber
# the remaining rows consecutively from 1.
acc_df$row.names <- NULL
row.names(acc_df) <- NULL
# COLLECT ALL TWEETS ------------------------------------------------------
@@ -42,11 +54,15 @@ api_params <- c(
# Request settings for the user-timeline endpoint
api_url <- "https://api.twitter.com/1.1/statuses/user_timeline.json"
max_count <- "200"
# Names of the tweet fields listed for keeping
keep <- c("created_at", "id_str", "text", "retweet_count")

# Three data frames are used while collecting:
#   tweets_complete: all tweets of all users
#   tweets_full:     all tweets of the current user
#   tweets_temp:     the current batch (max 200 tweets) of the current user
# Both accumulators start out as the same empty, six-column frame.
tweets_complete <- tweets_full <- data.frame(
  user = character(),
  name = character(),
  created_at = character(),
  id_str = character(),
  text = character(),
  retweet_count = character()
)
for(a in 346:nrow(acc_df)) {
user <- as.character(acc_df$screenname[a])
for(a in 1:nrow(acc_df)) {
user <- as.character(acc_df$twitter_acc[a])
name <- as.character(acc_df$name[a])
max_id <- "999999999999999999"
loop <- 1
@@ -104,43 +120,54 @@ for(a in 346:nrow(acc_df)) {
}
## Last loop is reached. Now clear the data frame
# Is the last tweet in tweets_temp from 2013?
status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$")
if (!status) { # Starting when tweet not from 2014
# # Is the last tweet in tweets_temp from 2013?
# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$")
# Extract year of last tweet in tweets_temp
year_last <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$"))
status <- year_last < 2014
if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...)
# Delete all tweets other than from 2014
old <- 0
for(r in 1:nrow(tweets_temp)) {
status <- str_detect(tweets_temp$created_at[r], "2014$")
if(is.na(status)) {
#status <- FALSE
cat("[INFO] NA-Status in Tweet", r)
}
if(!status) { # Starting when tweet not from 2014
old <- old + 1
}
# Is even the first tweet older than 2014?
year_first <- as.numeric(str_extract(tweets_temp$created_at[1], "\\d{4}$"))
status <- year_first < 2014
if(status) {
cat("[INFO] Timeline enhält keinen einzigen aus 2014\n")
}
if(old > 0) {
old <- old - 1
# If even the first entry isn't from 2014, we have to set "old" manually because of a bug
status <- str_detect(tweets_temp$created_at[1], "2014$")
if(!status) {
old <- nrow(tweets_temp)
cat("[INFO] Timeline enhält keinen einzigen aus 2014\n")
}
# delete all lines which are older than 2014
tweets_temp <- head(tweets_temp, -old)
}
rm(old)
# # Delete all tweets other than from 2014
# old <- 0
# for(r in 1:nrow(tweets_temp)) {
# status <- str_detect(tweets_temp$created_at[r], "2014$")
# if(is.na(status)) {
# #status <- FALSE
# cat("[INFO] NA-Status in Tweet", r)
# }
# if(!status) { # Starting when tweet not from 2014
# old <- old + 1
# }
# }
# if(old > 0) {
# old <- old - 1
#
# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug
# status <- str_detect(tweets_temp$created_at[1], "2014$")
# if(!status) {
# old <- nrow(tweets_temp)
# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n")
# }
#
# # delete all lines which are older than 2014
# tweets_temp <- head(tweets_temp, -old)
# }
# rm(old)
tweets_full <- insertRow(tweets_full, tweets_temp)
#rm(tweets_temp)
break # End loop because 2013 is reached
}
# The last tweet is still from 2014, so we need another loop
# The last tweet is from 2014 or newer, so we need another loop
else {
# Setting max_id to gather next 200 tweets
max_id <- tweets_temp$id_str[nrow(tweets_temp)]
@@ -157,7 +184,7 @@ for(a in 346:nrow(acc_df)) {
write.csv(tweets_complete, "tweets_complete.csv")
# Every tweet from 2014 from user[a] is downloaded. Now next user in for-loop
# Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop
}
rm(a, code, current, error, loop, max_id, name, query, r, status, user, wait, tweets_full, tweets_temp)
@@ -178,7 +205,7 @@ Sys.setlocale("LC_TIME", "C")
tweets$created_at <- as.POSIXct(tweets$created_at, format = "%a %b %d %H:%M:%S %z %Y")
tweets <- tweets[order(tweets$created_at), ]
# Finally delete every tweet not from 2014
# Finally delete every tweet not from 2014 (so also tweets newer than 2014)
delrow <- NULL
for(r in 1:nrow(tweets)) {
if(format(tweets$created_at[r], "%Y") != "2014") {