diff --git a/.RData b/.RData index 1b2ce36..e8f14ea 100644 Binary files a/.RData and b/.RData differ diff --git a/.Rhistory b/.Rhistory index ff5df04..9ac776b 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,165 +1,512 @@ -setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp") -read.csv("politiker2.csv") -read.csv("politiker2.csv") -list1 <- read.csv("politiker2.csv") -View(list1) -list1$name -list1$name[1] -for(i in 1:length(list1)) { -lastname <- as.character(list1$name[i]) +rm(tweets_temp) +tweets_temp <- fromJSON(correctJSON(current)) +## START ERROR HANDLING ## +# Empty API output +status <- errorEmptyAPI(tweets_temp) +if(status == 1) { Sys.sleep(3);error <- error + 1;next} +if(status == 2) {break} +# Contains "error" column +status <- errorErrorColumn(tweets_temp) +if(status == 1) { Sys.sleep(3);error <- error + 1;next} +if(status == 2) {break} +# Check if error code exists +code <- errorCheckCode(tweets_temp) # 0 if no error +if(code == 34) { # page does not exist +status <- errorCode34() +if(status == 1) { Sys.sleep(3);error <- error + 1;next} +if(status == 2) {break} } -str_extract(lastname, "\\w+?") -require(XML) -require(stringr) -str_extract(lastname, "\\w+?") -str_extract(lastname, "\\w+") -str_extract(lastname, "\\w+") -list2 <- read.csv("politiker.csv") -View(list2) -View(list2) -View(list2) -View(list2) -list1 <- read.csv("politiker2.csv") -list2 <- read.csv("politiker.csv") -for(i in 1:length(list1)) { -name1 <- as.character(list1$name[i]) -lastname1 <- str_extract(name1, "\\w+") -for(a in 1:length(list2)) { -name2 <- as.character(list2$name[a]) -if(str_detect(name2, lastname1)) { -acc <- as.character(list2$screenname[a]) -cat(name1,":",acc,sep = "") +if(code == 88) { # rate limit exceeded +wait <- errorCode88() +Sys.sleep(wait) +next +} +## END ERROR HANDLING ## +# Delete unnecessary columns and add username and real name to dataframe +tweets_temp <- tweets_temp[keep] +tweets_temp <- cbind(user=user, name=name, tweets_temp) +# Now sleep 3 second to dodge 300queries/15min limit +cat("[",a,"/",nrow(acc_df),"] ", sep = "") +cat("User: ",user," in loop: ",loop,". \n", sep = "") +Sys.sleep(2) +if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) { +cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") +break +} +## Last loop is reached. Now clear the data frame +# # Is the last tweet in tweets_temp from 2013? +# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$") +# Extract year of last tweet in tweets_temp +year <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$")) +status <- year < 2014 +if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...) +# # Delete all tweets other than from 2014 +# old <- 0 +# for(r in 1:nrow(tweets_temp)) { +# status <- str_detect(tweets_temp$created_at[r], "2014$") +# if(is.na(status)) { +# #status <- FALSE +# cat("[INFO] NA-Status in Tweet", r) +# } +# if(!status) { # Starting when tweet not from 2014 +# old <- old + 1 +# } +# } +# if(old > 0) { +# old <- old - 1 +# +# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug +# status <- str_detect(tweets_temp$created_at[1], "2014$") +# if(!status) { +# old <- nrow(tweets_temp) +# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") +# } +# +# # delete all lines which are older than 2014 +# tweets_temp <- head(tweets_temp, -old) +# } +# rm(old) +tweets_full <- insertRow(tweets_full, tweets_temp) +#rm(tweets_temp) +break # End loop because 2013 is reached +} +# The last tweet is newer or equal 2014, so we need another loop +else { +# Setting max_id to gather next 200 tweets +max_id <- tweets_temp$id_str[nrow(tweets_temp)] +loop <- loop + 1 # just for stats +tweets_full <- insertRow(tweets_full, tweets_temp) +#rm(tweets_temp) +} +} # /repeat +tweets_complete <- insertRow(tweets_complete, tweets_full) +tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full +cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n") +write.csv(tweets_complete, "tweets_complete.csv") +# Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop +} +api_url <- "https://api.twitter.com/1.1/statuses/user_timeline.json"; +max_count <- "200" +keep <- c("created_at", "id_str", "text", "retweet_count") +# tweets_complete: All tweets +# tweets_full: All tweets of current user +# tweets_temp: The current max 200 tweets of current user +tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character()) +tweets_complete <- tweets_full +for(a in 1:nrow(acc_df)) { +user <- as.character(acc_df$twitter_acc[a]) +name <- as.character(acc_df$name[a]) +max_id <- "999999999999999999" +loop <- 1 +error <- 0 +repeat { +# Define specific search query +query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false", +screen_name=user, +count=max_count, +max_id=max_id); +# At first, work with an temporary tweet-DB +current <- twitter_api_call(api_url, query, api_params) +rm(tweets_temp) +tweets_temp <- fromJSON(correctJSON(current)) +## START ERROR HANDLING ## +# Empty API output +status <- errorEmptyAPI(tweets_temp) +if(status == 1) { Sys.sleep(3);error <- error + 1;next} +if(status == 2) {break} +# Contains "error" column +status <- errorErrorColumn(tweets_temp) +if(status == 1) { Sys.sleep(3);error <- error + 1;next} +if(status == 2) {break} +# Check if error code exists +code <- errorCheckCode(tweets_temp) # 0 if no error +if(code == 34) { # page does not exist +status <- errorCode34() +if(status == 1) { Sys.sleep(3);error <- error + 1;next} +if(status == 2) {break} +} +if(code == 88) { # rate limit exceeded +wait <- errorCode88() +Sys.sleep(wait) +next +} +## END ERROR HANDLING ## +# Delete unnecessary columns and add username and real name to dataframe +tweets_temp <- tweets_temp[keep] +tweets_temp <- cbind(user=user, name=name, tweets_temp) +# Now sleep 3 second to dodge 300queries/15min limit +cat("[",a,"/",nrow(acc_df),"] ", sep = "") +cat("User: ",user," in loop: ",loop,". \n", sep = "") +Sys.sleep(2) +if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) { +cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") +break +} +## Last loop is reached. Now clear the data frame +# # Is the last tweet in tweets_temp from 2013? +# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$") +# Extract year of last tweet in tweets_temp +year <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$")) +status <- year < 2014 +if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...) +# # Delete all tweets other than from 2014 +# old <- 0 +# for(r in 1:nrow(tweets_temp)) { +# status <- str_detect(tweets_temp$created_at[r], "2014$") +# if(is.na(status)) { +# #status <- FALSE +# cat("[INFO] NA-Status in Tweet", r) +# } +# if(!status) { # Starting when tweet not from 2014 +# old <- old + 1 +# } +# } +# if(old > 0) { +# old <- old - 1 +# +# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug +# status <- str_detect(tweets_temp$created_at[1], "2014$") +# if(!status) { +# old <- nrow(tweets_temp) +# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") +# } +# +# # delete all lines which are older than 2014 +# tweets_temp <- head(tweets_temp, -old) +# } +# rm(old) +tweets_full <- insertRow(tweets_full, tweets_temp) +#rm(tweets_temp) +break # End loop because 2013 is reached +} +# The last tweet is newer or equal 2014, so we need another loop +else { +# Setting max_id to gather next 200 tweets +max_id <- tweets_temp$id_str[nrow(tweets_temp)] +loop <- loop + 1 # just for stats +tweets_full <- insertRow(tweets_full, tweets_temp) +#rm(tweets_temp) +} +} # /repeat +tweets_complete <- insertRow(tweets_complete, tweets_full) +tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full +cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n") +write.csv(tweets_complete, "tweets_complete.csv") +# Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop +} +tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character()) +tweets_complete <- tweets_full +for(a in 1:nrow(acc_df)) { +user <- as.character(acc_df$twitter_acc[a]) +name <- as.character(acc_df$name[a]) +max_id <- "999999999999999999" +loop <- 1 +error <- 0 +repeat { +# Define specific search query +query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false", +screen_name=user, +count=max_count, +max_id=max_id); +# At first, work with an temporary tweet-DB +current <- twitter_api_call(api_url, query, api_params) +rm(tweets_temp) +tweets_temp <- fromJSON(correctJSON(current)) +## START ERROR HANDLING ## +# Empty API output +status <- errorEmptyAPI(tweets_temp) +if(status == 1) { Sys.sleep(3);error <- error + 1;next} +if(status == 2) {break} +# Contains "error" column +status <- errorErrorColumn(tweets_temp) +if(status == 1) { Sys.sleep(3);error <- error + 1;next} +if(status == 2) {break} +# Check if error code exists +code <- errorCheckCode(tweets_temp) # 0 if no error +if(code == 34) { # page does not exist +status <- errorCode34() +if(status == 1) { Sys.sleep(3);error <- error + 1;next} +if(status == 2) {break} +} +if(code == 88) { # rate limit exceeded +wait <- errorCode88() +Sys.sleep(wait) +next +} +## END ERROR HANDLING ## +# Delete unnecessary columns and add username and real name to dataframe +tweets_temp <- tweets_temp[keep] +tweets_temp <- cbind(user=user, name=name, tweets_temp) +# Now sleep 3 second to dodge 300queries/15min limit +cat("[",a,"/",nrow(acc_df),"] ", sep = "") +cat("User: ",user," in loop: ",loop,". \n", sep = "") +Sys.sleep(2) +if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) { +cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") +break +} +## Last loop is reached. Now clear the data frame +# # Is the last tweet in tweets_temp from 2013? +# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$") +# Extract year of last tweet in tweets_temp +year_last <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$")) +status <- year_last < 2014 +if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...) +year_first <- as.numeric(str_extract(tweets_temp$created_at[1], "\\d{4}$")) +status <- year_first < 2014 +cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") +} +# # Delete all tweets other than from 2014 +# old <- 0 +# for(r in 1:nrow(tweets_temp)) { +# status <- str_detect(tweets_temp$created_at[r], "2014$") +# if(is.na(status)) { +# #status <- FALSE +# cat("[INFO] NA-Status in Tweet", r) +# } +# if(!status) { # Starting when tweet not from 2014 +# old <- old + 1 +# } +# } +# if(old > 0) { +# old <- old - 1 +# +# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug +# status <- str_detect(tweets_temp$created_at[1], "2014$") +# if(!status) { +# old <- nrow(tweets_temp) +# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") +# } +# +# # delete all lines which are older than 2014 +# tweets_temp <- head(tweets_temp, -old) +# } +# rm(old) +tweets_full <- insertRow(tweets_full, tweets_temp) +#rm(tweets_temp) +break # End loop because 2013 is reached +} +# The last tweet is newer or equal 2014, so we need another loop +else { +# Setting max_id to gather next 200 tweets +max_id <- tweets_temp$id_str[nrow(tweets_temp)] +loop <- loop + 1 # just for stats +tweets_full <- insertRow(tweets_full, tweets_temp) +#rm(tweets_temp) +} +} # /repeat +tweets_complete <- insertRow(tweets_complete, tweets_full) +tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full +cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n") +write.csv(tweets_complete, "tweets_complete.csv") +# Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop +} +tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character()) +tweets_complete <- tweets_full +for(a in 1:nrow(acc_df)) { +user <- as.character(acc_df$twitter_acc[a]) +name <- as.character(acc_df$name[a]) +max_id <- "999999999999999999" +loop <- 1 +error <- 0 +repeat { +# Define specific search query +query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false", +screen_name=user, +count=max_count, +max_id=max_id); +# At first, work with an temporary tweet-DB +current <- twitter_api_call(api_url, query, api_params) +rm(tweets_temp) +tweets_temp <- fromJSON(correctJSON(current)) +## START ERROR HANDLING ## +# Empty API output +status <- errorEmptyAPI(tweets_temp) +if(status == 1) { Sys.sleep(3);error <- error + 1;next} +if(status == 2) {break} +# Contains "error" column +status <- errorErrorColumn(tweets_temp) +if(status == 1) { Sys.sleep(3);error <- error + 1;next} +if(status == 2) {break} +# Check if error code exists +code <- errorCheckCode(tweets_temp) # 0 if no error +if(code == 34) { # page does not exist +status <- errorCode34() +if(status == 1) { Sys.sleep(3);error <- error + 1;next} +if(status == 2) {break} +} +if(code == 88) { # rate limit exceeded +wait <- errorCode88() +Sys.sleep(wait) +next +} +## END ERROR HANDLING ## +# Delete unnecessary columns and add username and real name to dataframe +tweets_temp <- tweets_temp[keep] +tweets_temp <- cbind(user=user, name=name, tweets_temp) +# Now sleep 3 second to dodge 300queries/15min limit +cat("[",a,"/",nrow(acc_df),"] ", sep = "") +cat("User: ",user," in loop: ",loop,". \n", sep = "") +Sys.sleep(2) +if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) { +cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") +break +} +## Last loop is reached. Now clear the data frame +# # Is the last tweet in tweets_temp from 2013? +# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$") +# Extract year of last tweet in tweets_temp +year_last <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$")) +status <- year_last < 2014 +if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...) +# Is even the first tweet older than 2014? +year_first <- as.numeric(str_extract(tweets_temp$created_at[1], "\\d{4}$")) +status <- year_first < 2014 +if(status) { +cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") +} +# # Delete all tweets other than from 2014 +# old <- 0 +# for(r in 1:nrow(tweets_temp)) { +# status <- str_detect(tweets_temp$created_at[r], "2014$") +# if(is.na(status)) { +# #status <- FALSE +# cat("[INFO] NA-Status in Tweet", r) +# } +# if(!status) { # Starting when tweet not from 2014 +# old <- old + 1 +# } +# } +# if(old > 0) { +# old <- old - 1 +# +# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug +# status <- str_detect(tweets_temp$created_at[1], "2014$") +# if(!status) { +# old <- nrow(tweets_temp) +# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") +# } +# +# # delete all lines which are older than 2014 +# tweets_temp <- head(tweets_temp, -old) +# } +# rm(old) +tweets_full <- insertRow(tweets_full, tweets_temp) +#rm(tweets_temp) +break # End loop because 2013 is reached +} +# The last tweet is newer or equal 2014, so we need another loop +else { +# Setting max_id to gather next 200 tweets +max_id <- tweets_temp$id_str[nrow(tweets_temp)] +loop <- loop + 1 # just for stats +tweets_full <- insertRow(tweets_full, tweets_temp) +#rm(tweets_temp) +} +} # /repeat +tweets_complete <- insertRow(tweets_complete, tweets_full) +tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full +cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n") +write.csv(tweets_complete, "tweets_complete.csv") +# Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop +} +rm(a, code, current, error, loop, max_id, name, query, r, status, user, wait, tweets_full, tweets_temp) +rm(a, code, current, error, loop, max_id, max_count, year_first, year_last, name, query, status, user, wait, tweets_full, tweets_temp) +rm(year) +save(tweets_complete, file="tweets_complete.RData") +tweets_complete2 <- tweets_complete +View(tweets_complete2) +tweets <- tweets_complete[!duplicated(tweets_complete), ] +tweets <- na.omit(tweets) +save(tweets, file="tweets.RData") +rm(tweets_complete2) +Sys.setlocale("LC_TIME", "C") +tweets$created_at <- as.POSIXct(tweets$created_at, format = "%a %b %d %H:%M:%S %z %Y") +tweets <- tweets[order(tweets$created_at), ] +head(tweets) +delrow <- NULL +for(r in 1:nrow(tweets)) { +if(format(tweets$created_at[r], "%Y") != "2014") { +delrow <- c(delrow, r) +} +if(format(tweets$created_at[r], "%Y") == "2014") { +break } } +delrow <- NULL +pb <- txtProgressBar(min = 0, max = total, style = 3) +for(r in 1:nrow(tweets)) { +setTxtProgressBar(pb, r) +if(format(tweets$created_at[r], "%Y") != "2014") { +delrow <- c(delrow, r) } -cat(name1,":",acc,sep = "") -for(i in 1:length(list1)) { -name1 <- as.character(list1$name[i]) -lastname1 <- str_extract(name1, "\\w+") -for(a in 1:length(list2)) { -name2 <- as.character(list2$name[a]) -if(str_detect(name2, lastname1)) { -acc <- as.character(list2$screenname[a]) -#cat(name1,":",acc,"\n",sep = "") -cat("Found\n") +if(format(tweets$created_at[r], "%Y") == "2014") { +break } } +pb <- txtProgressBar(min = 0, max = total, style = 3) +pb <- txtProgressBar(min = 0, max = nrow(tweets), style = 3) +for(r in 1:nrow(tweets)) { +setTxtProgressBar(pb, r) +if(format(tweets$created_at[r], "%Y") != "2014") { +delrow <- c(delrow, r) } -a -for(i in 1:nrow(list1)) { -name1 <- as.character(list1$name[i]) -lastname1 <- str_extract(name1, "\\w+") -for(a in 1:nrow(list2)) { -name2 <- as.character(list2$name[a]) -if(str_detect(name2, lastname1)) { -acc <- as.character(list2$screenname[a]) -cat(name1,":",acc,"\n",sep = "") -#cat("Found\n") +if(format(tweets$created_at[r], "%Y") == "2014") { +break } } +tweets <- tweets[-delrow, ] +rm(delrow, r) +summary(tweets) +tweets$created_at[140000] +tweets$created_at[130000] +tweets$created_at[1] +tweets$created_at[2] +tweets$created_at[100] +tweets$created_at[141086] +delrow +delrow <- NULL +for(r in 1:nrow(tweets)) { +if(format(tweets$created_at[r], "%Y") != "2014") { +delrow <- c(delrow, r) } -for(i in 1:nrow(list1)) { -detect <- FALSE -name1 <- as.character(list1$name[i]) -lastname1 <- str_extract(name1, "\\w+") -for(a in 1:nrow(list2)) { -name2 <- as.character(list2$name[a]) -if(str_detect(name2, lastname1)) { -detect <- TRUE -acc <- as.character(list2$screenname[a]) -cat(name1," --> ",acc,"\n",sep = "") +if(format(tweets$created_at[r], "%Y") == "2014") { +break } } -if(detect) { -cat("\n") -} -} -for(i in 1:nrow(list1)) { -detect <- FALSE -name1 <- as.character(list1$name[i]) -lastname1 <- str_extract(name1, "\\w+") -for(a in 1:nrow(list2)) { -name2 <- as.character(list2$name[a]) -if(str_detect(name2, lastname1)) { -detect <- TRUE -acc <- as.character(list2$screenname[a]) -cat(name1," --> ",acc,"(",name2,")","\n",sep = "") -} -} -if(detect) { -cat("\n") -} -} -for(i in 1:nrow(list1)) { -detect <- FALSE -name1 <- as.character(list1$name[i]) -lastname1 <- str_extract(name1, "\\w+") -for(a in 1:nrow(list2)) { -name2 <- as.character(list2$name[a]) -if(str_detect(name2, lastname1)) { -detect <- TRUE -acc <- as.character(list2$screenname[a]) -cat(name1," --> ",acc," (",name2,")","\n",sep = "") -} -} -if(detect) { -cat("\n") -} -} -c(name1," --> ",acc," (",name2,")","\n",sep = "") -str_c(name1," --> ",acc," (",name2,")","\n",sep = "") -str_c(name1," --> ",acc," (",name2,")") -result <- str_c(name1," --> ",acc," (",name2,")") -for(i in 1:nrow(list1)) { -detect <- FALSE -name1 <- as.character(list1$name[i]) -lastname1 <- str_extract(name1, "\\w+") -for(a in 1:nrow(list2)) { -name2 <- as.character(list2$name[a]) -if(str_detect(name2, lastname1)) { -detect <- TRUE -acc <- as.character(list2$screenname[a]) -result <- str_c(name1," --> ",acc," (",name2,")") -write(result, "merge.txt", append = TRUE) -} -} -if(detect) { -write("", "merge.txt", append = TRUE) -} -} -for(i in 1:nrow(list1)) { -detect <- FALSE -name1 <- as.character(list1$name[i]) -lastname1 <- str_extract(name1, "\\w+") -for(a in 1:nrow(list2)) { -name2 <- as.character(list2$name[a]) -if(str_detect(name2, lastname1)) { -detect <- TRUE -acc <- as.character(list2$screenname[a]) -verif <- as.character(list2$verified[a]) -result <- str_c(name1," --> ",acc," (",name2,") ",verif) -write(result, "merge.txt", append = TRUE) -} -} -if(detect) { -write("", "merge.txt", append = TRUE) -} -} -for(i in 1:nrow(list1)) { -detect <- FALSE -name1 <- as.character(list1$name[i]) -lastname1 <- str_extract(name1, "\\w+") -for(a in 1:nrow(list2)) { -name2 <- as.character(list2$name[a]) -if(str_detect(name2, lastname1)) { -detect <- TRUE -acc <- as.character(list2$screenname[a]) -verif <- as.character(list2$verified[a]) -result <- str_c(name1," --> ",acc," (",name2,") ",verif) -write(result, "merge.txt", append = TRUE) -} -} -if(detect) { -write("", "merge.txt", append = TRUE) +pb <- txtProgressBar(min = 0, max = nrow(tweets), style = 3) +for(r in 1:nrow(tweets)) { +if(format(tweets$created_at[r], "%Y") != "2014") { +delrow <- c(delrow, r) } +setTxtProgressBar(pb, r) } +tweets <- tweets[-delrow, ] +tweets$created_at[137800] +tweets$created_at[137876] +rm(delrow, r) +rm(pb) +tweets$created_at <- format(tweets$created_at, "%Y-%m-%d") +tweets$created_at[137876] +save(tweets, file="tweets.RData") +readLines("twitter-api-consumerkey.txt") +api_params2 <- api_params +readLines("twitter-api-consumerkey.txt") +readLines("twitter-api-consumerkey.txt")[2] +readLines("twitter-api-credentials.txt")[2] +readLines("twitter-api-credentials.txt")[1] +api_params <- c( +"oauth_consumer_key" = readLines("twitter-api-credentials.txt")[2], +"oauth_nonce" = NA, +"oauth_signature_method" = "HMAC-SHA1", +"oauth_timestamp" = NA, +"oauth_token" = readLines("twitter-api-credentials.txt")[4], +"oauth_version" = "1.0", +"consumer_secret" = readLines("twitter-api-credentials.txt")[3], +"oauth_token_secret" = readLines("twitter-api-credentials.txt")[5] +) +api_params +api_params2 +rm(tweets_complete) +rm(api_params2) +source("issuecomp-functions.R") diff --git a/issuecomp.R b/issuecomp-analysis.R similarity index 100% rename from issuecomp.R rename to issuecomp-analysis.R diff --git a/functions.R b/issuecomp-functions.R similarity index 100% rename from functions.R rename to issuecomp-functions.R diff --git a/extract-twitter-accounts.R b/issuecomp-scraping.R similarity index 83% rename from extract-twitter-accounts.R rename to issuecomp-scraping.R index 97f24a6..d3bb2c3 100644 --- a/extract-twitter-accounts.R +++ b/issuecomp-scraping.R @@ -7,7 +7,7 @@ require(RTwitterAPI) setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp") -source("functions.R") +source("issuecomp-functions.R") #acc_url <- "http://www.bundestwitter.de/api/politiker" @@ -118,15 +118,15 @@ for(a in 1:nrow(acc_df)) { cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n") break } - - ## Last loop is reached. Now clear the data frame -# # Is the last tweet in tweets_temp from 2013? -# status <- str_detect(tweets_temp$created_at[nrow(tweets_temp)], "2014$") + + ## CHECK if we need another loop # Extract year of last tweet in tweets_temp year_last <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$")) status <- year_last < 2014 - if (status) { # Starting when tweet is earlier than 2014 (i.e. 2013, 2012...) + + # Is last tweet earlier than 2014? So break the loop + if (status) { # Is even the first tweet older than 2014? year_first <- as.numeric(str_extract(tweets_temp$created_at[1], "\\d{4}$")) @@ -135,35 +135,7 @@ for(a in 1:nrow(acc_df)) { cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") } -# # Delete all tweets other than from 2014 -# old <- 0 -# for(r in 1:nrow(tweets_temp)) { -# status <- str_detect(tweets_temp$created_at[r], "2014$") -# if(is.na(status)) { -# #status <- FALSE -# cat("[INFO] NA-Status in Tweet", r) -# } -# if(!status) { # Starting when tweet not from 2014 -# old <- old + 1 -# } -# } -# if(old > 0) { -# old <- old - 1 -# -# # If even the first entry isn't from 2014, we have to set "old" manually because of a bug -# status <- str_detect(tweets_temp$created_at[1], "2014$") -# if(!status) { -# old <- nrow(tweets_temp) -# cat("[INFO] Timeline enhält keinen einzigen aus 2014\n") -# } -# -# # delete all lines which are older than 2014 -# tweets_temp <- head(tweets_temp, -old) -# } -# rm(old) - tweets_full <- insertRow(tweets_full, tweets_temp) - #rm(tweets_temp) break # End loop because 2013 is reached } @@ -174,7 +146,6 @@ for(a in 1:nrow(acc_df)) { loop <- loop + 1 # just for stats tweets_full <- insertRow(tweets_full, tweets_temp) - #rm(tweets_temp) } } # /repeat @@ -182,7 +153,6 @@ for(a in 1:nrow(acc_df)) { tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n") write.csv(tweets_complete, "tweets_complete.csv") - # Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop }