From c296884028922419c8842f668dfa5a5112f9e3a1 Mon Sep 17 00:00:00 2001 From: mxmehl Date: Sat, 6 Dec 2014 16:40:29 +0100 Subject: [PATCH] better error handling and logic error fixing --- extract-twitter-accounts.R | 82 ++++++++++++-------------------------- functions.R | 73 +++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 56 deletions(-) diff --git a/extract-twitter-accounts.R b/extract-twitter-accounts.R index 9331006..ae8047a 100644 --- a/extract-twitter-accounts.R +++ b/extract-twitter-accounts.R @@ -51,7 +51,8 @@ max_count <- "200" keep <- c("created_at", "id_str", "text", "retweet_count") tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character()) tweets_complete <- tweets_full -for(a in 1:nrow(acc_df)) { + +for(a in 94:nrow(acc_df)) { user <- as.character(acc_df$screenname[a]) name <- as.character(acc_df$name[a]) max_id <- "999999999999999999" @@ -66,63 +67,32 @@ for(a in 1:nrow(acc_df)) { # At first, work with an temporary tweet-DB current <- twitter_api_call(api_url, query, api_params) + rm(tweets_temp) tweets_temp <- fromJSON(correctJSON(current)) - ## START ERROR HANDLING ## + ## START ERROR HANDLING ## + # Empty API output + status <- errorEmptyAPI(tweets_temp) + if(status == 1) { Sys.sleep(3);error <- error + 1;next} + if(status == 2) {break} - # Check for empty API returns - status <- length(tweets_temp) - if(status == 0) { - if(error > 2) { - cat("[WARNING] 3x empty API result. Aborting now.\n") - break - } - cat("[WARNING] Empty API result. Trying again.\n") - rm(tweets_temp) - error <- error + 1 - Sys.sleep(3) + # Contains "error" column + status <- errorErrorColumn(tweets_temp) + if(status == 1) { Sys.sleep(3);error <- error + 1;next} + if(status == 2) {break} + + # Check if error code exists + code <- errorCheckCode(tweets_temp) # 0 if no error + if(code == 34) { # page does not exist + status <- errorCode34 + if(status == 1) { Sys.sleep(3);error <- error + 1;next} + if(status == 2) {break} + } + if(code == 88) { # rate limit exceeded + wait <- errorCode88() + Sys.sleep(wait) next - } - - # Check if API output contains error fields - status <- "error" %in% names(tweets_temp) - if(status) { - cat("[WARNING] Error in API request:", tweets_temp$error[1],"\n") - rm(tweets_temp) - break - } - - # Check for other errors, mostly rate limits - status <- "errors" %in% names(tweets_temp) - if(status) { - cat("[WARNING] Error in API request:", tweets_temp$errors[1,1],"\n") - status <- tweets_temp$errors[1,2] - - # "Rate limit exceeded" - if(status == 88) { - rate_api_url <- "https://api.twitter.com/1.1/application/rate_limit_status.json" - rate_query <-c (resources="statuses") - resettime <- fromJSON(twitter_api_call(rate_api_url, rate_query, api_params)) - resettime <- resettime$resources$statuses$`/statuses/user_timeline`$reset - curtime <- as.numeric(as.POSIXct(Sys.time())) - wait <- round(resettime - curtime + 10) - cat("[INFO] Rate limit is exceeded. Now waiting",wait,"seconds.\n") - Sys.sleep(wait) - } - - # "Sorry, that page does not exist" - if(status == 34) { - if(error > 2) { - cat("[WARNING] 3x Not existing page. Aborting now.\n") - break - } - error <- error + 1 - } - - rm(tweets_temp) - Sys.sleep(3) - next - } + } ## END ERROR HANDLING ## @@ -173,7 +143,7 @@ for(a in 1:nrow(acc_df)) { rm(old) tweets_full <- insertRow(tweets_full, tweets_temp) - rm(tweets_temp) + #rm(tweets_temp) break # End loop because 2013 is reached } @@ -184,7 +154,7 @@ for(a in 1:nrow(acc_df)) { loop <- loop + 1 # just for stats tweets_full <- insertRow(tweets_full, tweets_temp) - rm(tweets_temp) + #rm(tweets_temp) } } # /repeat diff --git a/functions.R b/functions.R index 5210957..1100fef 100644 --- a/functions.R +++ b/functions.R @@ -2,6 +2,8 @@ require(stringr) # Replace characters messing up JSON validation (\,\n,^) correctJSON <- function(string) { + string <- gsub('\\\\\\\\\\"(\\w)', '\\1' , string) + string <- gsub('\\\\\\\\\\" ', ' ', string) string <- gsub("\\\\{2,}", "", string) string <- str_replace_all(string, pattern = "[^[:print:]]", replacement = " ") string <- str_replace_all(string, pattern = "&..;", replacement = " ") @@ -16,3 +18,74 @@ insertRow <- function(existingDF, newrow, r) { row.names(existingDF) <- 1:nrow(existingDF) return(existingDF) } + +## ERROR HANDLING + +# Check for empty API returns (0 or 1 or 2) +errorEmptyAPI <- function(df) { + status <- length(df) + if(status == 0) { + if(error < 3) { + cat("[WARNING] Empty API result. Trying again.\n") + returncode <- 1 + } + else { + cat("[WARNING] 3x empty API result. Aborting now.\n") + returncode <- 2 + } + } + else { + returncode <- 0 + } + return(returncode) +} + +# Check if API output contains error fields (0 or 2) +errorErrorColumn <- function(df) { + status <- "error" %in% names(df) + if(status) { + cat("[WARNING] Error in API request:", df$error[1],"\n") + returncode <- 2 + } + else { + returncode <- 0 + } + return(returncode) +} + +# Check if error codes exist (i.e. 34 or 88) +errorCheckCode <- function(df) { + status <- "errors" %in% names(df) + if(status) { + cat("[WARNING] Error in API request:", df$errors[1,1],"\n") + code <- df$errors[1,2] + } + else { + code <- 0 + } + return(code) +} + +# Handle code 88: rate limit exceeded (wait time) +errorCode88 <- function() { + rate_api_url <- "https://api.twitter.com/1.1/application/rate_limit_status.json" + rate_query <- c(resources="statuses") + resettime <- fromJSON(twitter_api_call(rate_api_url, rate_query, api_params)) + resettime <- resettime$resources$statuses$`/statuses/user_timeline`$reset + curtime <- as.numeric(as.POSIXct(Sys.time())) + wait <- round(resettime - curtime + 10) + cat("[INFO] Rate limit is exceeded. Now waiting",wait,"seconds.\n") + return(wait) +} + +# Handle code 34: Page does not exist (1 or 2) +errorCode34 <- function() { + if(error > 2) { + cat("[WARNING] 3x Not existing page. Aborting now.\n") + returncode <- 2 + } + else { + returncode <- 1 + } + return(returncode) +}