diff --git a/extract-twitter-accounts.R b/extract-twitter-accounts.R
index 23e40d1..746eb31 100644
--- a/extract-twitter-accounts.R
+++ b/extract-twitter-accounts.R
@@ -11,11 +11,6 @@
 setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
 source("functions.R")
 
-# # Set curl handle for friendly scraping
-# handle <- getCurlHandle(httpheader = list(from = "max.mehl@uni.kn",
-#                         'user-agent' = str_c(R.version$version.string)
-#                         )
-#                        )
 
 acc_url <- "http://www.bundestwitter.de/api/politiker"
 acc_df <- fromJSON(acc_url)
@@ -166,5 +161,37 @@ for(a in 346:nrow(acc_df)) {
 
   # Every tweet from 2014 from user[a] is downloaded. Now next user in for-loop
 }
 
+rm(a, code, current, error, loop, max_id, name, query, r, status, user, wait, tweets_full, tweets_temp)
+
+
+# CLEAR DATAFRAME ---------------------------------------------------------
+
+tweets <- tweets_bak  # NOTE(review): dead assignment -- overwritten below from tweets_complete; confirm tweets_bak is needed
+
+# Remove duplicates
+tweets <- tweets_complete[!duplicated(tweets_complete), ]
+save(tweets_complete, file="tweets_complete.RData")
+save(tweets, file="tweets.RData")
+rm(tweets_complete)
+
+# Format dates in data frame (C locale so %a/%b parse English day/month names)
+Sys.setlocale("LC_TIME", "C")
+
+tweets$created_at <- as.POSIXct(tweets$created_at, format = "%a %b %d %H:%M:%S %z %Y")
+tweets <- tweets[order(tweets$created_at), ]
+
+# Finally delete every tweet not from 2014 (tweets are sorted, so stop at first 2014 row)
+delrow <- NULL
+for(r in seq_len(nrow(tweets))) {
+  if(format(tweets$created_at[r], "%Y") != "2014") {
+    delrow <- c(delrow, r)
+  }
+  if(format(tweets$created_at[r], "%Y") == "2014") {
+    break
+  }
+}
+if (!is.null(delrow)) tweets <- tweets[-delrow, ]  # guard: tweets[-NULL, ] would drop EVERY row
+rm(delrow, r)
+
 
diff --git a/tweets.RData b/tweets.RData
new file mode 100644
index 0000000..9abd65d
Binary files /dev/null and b/tweets.RData differ