better year handling, removed early ssh pfadiskn clean, added better twitter-acc-list
This commit is contained in:
+62
-35
@@ -10,10 +10,22 @@ setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
|
||||
source("functions.R")
|
||||
|
||||
|
||||
acc_url <- "http://www.bundestwitter.de/api/politiker"
|
||||
acc_df <- fromJSON(acc_url)
|
||||
#acc_url <- "http://www.bundestwitter.de/api/politiker"
|
||||
#acc_df <- fromJSON(acc_url)
|
||||
|
||||
acc_df <- read.csv("politiker2.csv")
|
||||
|
||||
delrow <- NULL
|
||||
for(r in 1:nrow(acc_df)) {
|
||||
acc <- as.character(acc_df$twitter_acc[r])
|
||||
if(!nzchar(acc)) {
|
||||
delrow <- c(delrow, r)
|
||||
}
|
||||
}
|
||||
acc_df <- acc_df[-delrow, ]
|
||||
rm(delrow, r, acc)
|
||||
acc_df$row.names <- NULL
|
||||
row.names(acc_df) <- NULL
|
||||
|
||||
|
||||
# COLLECT ALL TWEETS ------------------------------------------------------
|
||||
@@ -42,11 +54,15 @@ api_params <- c(
|
||||
api_url <- "https://api.twitter.com/1.1/statuses/user_timeline.json";
|
||||
max_count <- "200"
|
||||
keep <- c("created_at", "id_str", "text", "retweet_count")
|
||||
|
||||
# tweets_complete: All tweets
|
||||
# tweets_full: All tweets of current user
|
||||
# tweets_temp: The current max 200 tweets of current user
|
||||
tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character())
|
||||
tweets_complete <- tweets_full
|
||||
|
||||
for(a in 346:nrow(acc_df)) {
|
||||
user <- as.character(acc_df$screenname[a])
|
||||
for(a in 1:nrow(acc_df)) {
|
||||
user <- as.character(acc_df$twitter_acc[a])
|
||||
name <- as.character(acc_df$name[a])
|
||||
max_id <- "999999999999999999"
|
||||
loop <- 1
|
||||
@@ -104,43 +120,54 @@ for(a in 346:nrow(acc_df)) {
|
||||
}
|
||||
|
||||
## Last loop is reached. Now clear the data frame
|
||||
# Is the last tweet in tweets_temp from 2013?
|
||||
status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$")
|
||||
if (!status) { # Starting when tweet not from 2014
|
||||
# # Is the last tweet in tweets_temp from 2013?
|
||||
# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$")
|
||||
|
||||
# Extract year of last tweet in tweets_temp
|
||||
year_last <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$"))
|
||||
status <- year_last < 2014
|
||||
if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...)
|
||||
|
||||
# Delete all tweets other than from 2014
|
||||
old <- 0
|
||||
for(r in 1:nrow(tweets_temp)) {
|
||||
status <- str_detect(tweets_temp$created_at[r], "2014$")
|
||||
if(is.na(status)) {
|
||||
#status <- FALSE
|
||||
cat("[INFO] NA-Status in Tweet", r)
|
||||
}
|
||||
if(!status) { # Starting when tweet not from 2014
|
||||
old <- old + 1
|
||||
}
|
||||
# Is even the first tweet older than 2014?
|
||||
year_first <- as.numeric(str_extract(tweets_temp$created_at[1], "\\d{4}$"))
|
||||
status <- year_first < 2014
|
||||
if(status) {
|
||||
cat("[INFO] Timeline enhält keinen einzigen aus 2014\n")
|
||||
}
|
||||
if(old > 0) {
|
||||
old <- old - 1
|
||||
|
||||
# If even the first entry isn't from 2014, we have to set "old" manually because of a bug
|
||||
status <- str_detect(tweets_temp$created_at[1], "2014$")
|
||||
if(!status) {
|
||||
old <- nrow(tweets_temp)
|
||||
cat("[INFO] Timeline enhält keinen einzigen aus 2014\n")
|
||||
}
|
||||
|
||||
# delete all lines which are older than 2014
|
||||
tweets_temp <- head(tweets_temp, -old)
|
||||
}
|
||||
rm(old)
|
||||
|
||||
# # Delete all tweets other than from 2014
|
||||
# old <- 0
|
||||
# for(r in 1:nrow(tweets_temp)) {
|
||||
# status <- str_detect(tweets_temp$created_at[r], "2014$")
|
||||
# if(is.na(status)) {
|
||||
# #status <- FALSE
|
||||
# cat("[INFO] NA-Status in Tweet", r)
|
||||
# }
|
||||
# if(!status) { # Starting when tweet not from 2014
|
||||
# old <- old + 1
|
||||
# }
|
||||
# }
|
||||
# if(old > 0) {
|
||||
# old <- old - 1
|
||||
#
|
||||
# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug
|
||||
# status <- str_detect(tweets_temp$created_at[1], "2014$")
|
||||
# if(!status) {
|
||||
# old <- nrow(tweets_temp)
|
||||
# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n")
|
||||
# }
|
||||
#
|
||||
# # delete all lines which are older than 2014
|
||||
# tweets_temp <- head(tweets_temp, -old)
|
||||
# }
|
||||
# rm(old)
|
||||
|
||||
tweets_full <- insertRow(tweets_full, tweets_temp)
|
||||
#rm(tweets_temp)
|
||||
break # End loop because 2013 is reached
|
||||
}
|
||||
|
||||
# The last tweet is still from 2014, so we need another loop
|
||||
# The last tweet is from 2014 or newer, so we need another loop iteration
|
||||
else {
|
||||
# Setting max_id to gather next 200 tweets
|
||||
max_id <- tweets_temp$id_str[nrow(tweets_temp)]
|
||||
@@ -157,7 +184,7 @@ for(a in 346:nrow(acc_df)) {
|
||||
write.csv(tweets_complete, "tweets_complete.csv")
|
||||
|
||||
|
||||
# Every tweet from 2014 from user[a] is downloaded. Now next user in for-loop
|
||||
# Every tweet from 2014 or newer by user[a] has been downloaded. Continue with the next user in the for-loop
|
||||
}
|
||||
rm(a, code, current, error, loop, max_id, name, query, r, status, user, wait, tweets_full, tweets_temp)
|
||||
|
||||
@@ -178,7 +205,7 @@ Sys.setlocale("LC_TIME", "C")
|
||||
tweets$created_at <- as.POSIXct(tweets$created_at, format = "%a %b %d %H:%M:%S %z %Y")
|
||||
tweets <- tweets[order(tweets$created_at), ]
|
||||
|
||||
# Finally delete every tweet not from 2014
|
||||
# Finally delete every tweet not from 2014 (so also tweets newer than 2014)
|
||||
delrow <- NULL
|
||||
for(r in 1:nrow(tweets)) {
|
||||
if(format(tweets$created_at[r], "%Y") != "2014") {
|
||||
|
||||
Reference in New Issue
Block a user