Working user loop

This commit is contained in:
2014-12-01 18:38:58 +01:00
parent 1fc222f55c
commit 92179c6fb0
2 changed files with 67 additions and 67 deletions
+65 -56
View File
@@ -8,11 +8,11 @@ setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
# Load project helpers (correctJSON, insertRow, twitter_api_call, ...)
source("functions.R")
# Set curl handle for friendly scraping
# Identifies the scraper to the remote server via From / User-Agent headers.
handle <- getCurlHandle(httpheader = list(from = "max.mehl@uni.kn",
'user-agent' = str_c(R.version$version.string)
)
)
# NOTE(review): the block below is a commented-out duplicate of the handle
# setup above -- appears to be leftover from the diff; safe to delete.
# # Set curl handle for friendly scraping
# handle <- getCurlHandle(httpheader = list(from = "max.mehl@uni.kn",
# 'user-agent' = str_c(R.version$version.string)
# )
# )
# Endpoint listing the politicians' Twitter accounts to be scraped.
acc_url <- "http://www.bundestwitter.de/api/politiker"
# Alternative offline source, kept for reference:
#acc_json <- readLines("politiker.txt")
@@ -35,70 +35,79 @@ api_params <- c(
)
# Twitter REST API v1.1 endpoint returning a user's tweet timeline as JSON.
api_url <- "https://api.twitter.com/1.1/statuses/user_timeline.json";
# Initial/default account to query (overwritten inside the account loop below).
user <- "GregorGysi"
# Tweets per request -- 200 is the API maximum for user_timeline.
max_count <- "200"
# Start above any real tweet id so the first request returns the newest tweets;
# lowered after each page to walk backwards through the timeline.
max_id <- "999999999999999999"
# Page counter, used only for progress output.
loop <- 1
# Columns kept from the API response.
keep <- c("created_at", "id_str", "text", "retweet_count")
# Drop any result data frame from a previous run so exists("tweets_full")
# below correctly detects the first iteration.
rm(tweets_full)
last_id <- NULL
# NOTE(review): this repeat-loop is the PRE-refactor, single-user download
# path (the removed side of the diff); its closing brace is not present in
# this hunk. The post-refactor per-account for-loop follows below.
repeat {
# Define specific search query
query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false",
screen_name=user,
count=max_count,
max_id=max_id);
# If a tweets_full DB already exists (after the first loop this should be the case)
if(exists("tweets_full")) {
# Fetch the next page and append the kept columns to the accumulator.
current <- twitter_api_call(api_url, query, api_params)
tweets_temp <- fromJSON(correctJSON(current))
tweets_temp <- tweets_temp[keep]
tweets_full <- insertRow(tweets_full, tweets_temp)
rm(tweets_temp)
}
# First loop
else {
# First page: the accumulator does not exist yet, so create it directly.
current <- twitter_api_call(api_url, query, api_params)
tweets_full <- fromJSON(correctJSON(current))
tweets_full <- tweets_full[keep]
}
# Download every 2014 tweet for each account in acc_df.
# NOTE(review): this span is diff interleave -- several regions appear twice
# (old vs. new side of the hunk) and the braces do not balance; the duplicated
# regions are flagged inline below. Do not treat this as runnable as-is.
for(a in 1:nrow(acc_df)) {
user <- as.character(acc_df$screenname[a])
name <- as.character(acc_df$name[a])
# Reset pagination state per account: start above any real tweet id.
max_id <- "999999999999999999"
loop <- 1
repeat {
# Define specific search query
query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false",
screen_name=user,
count=max_count,
max_id=max_id);
# Now sleep 3 second to dodge 300queries/15min limit
cat("User:",user,"in loop:",loop,"- now waiting 3 secs...\n")
Sys.sleep(3)
# If a tweets_full DB already exists (after the first loop this should be the case)
if(exists("tweets_full")) {
# Fetch next page, tag rows with the account, append to the accumulator.
current <- twitter_api_call(api_url, query, api_params)
tweets_temp <- fromJSON(correctJSON(current))
tweets_temp <- tweets_temp[keep]
tweets_temp <- cbind(user=user, name=name, tweets_temp)
tweets_full <- insertRow(tweets_full, tweets_temp)
rm(tweets_temp)
}
# First loop
else {
# Very first page overall: create the accumulator directly.
current <- twitter_api_call(api_url, query, api_params)
tweets_full <- fromJSON(correctJSON(current))
tweets_full <- tweets_full[keep]
tweets_full <- cbind(user=user, name=name, tweets_full)
}
# Is the last tweet in tweets_full from 2013?
status <- str_detect(tweets_full$created_at[nrow(tweets_full)], "2013$")
# Last loop is reached. Now clear the data frame
if (status) {
# Now sleep 3 second to dodge 300queries/15min limit
cat("User:",user,"in loop:",loop,"- now waiting 2 secs...\n")
Sys.sleep(2)
# Delete all tweets from 2013
# Count how many trailing rows fall into 2013 so they can be trimmed.
old <- 0
for(r in 1:nrow(tweets_full)) {
status <- str_detect(tweets_full$created_at[r], "2013$")
# str_detect returns NA for NA timestamps; treat that as "not 2013".
if(is.na(status)) { status <- FALSE }
if(status) {
old <- old + 1
# NOTE(review): from here to the matching close is a DUPLICATE of the
# year-check/count logic above -- the other side of the diff hunk.
# Is the last tweet in tweets_full from 2013?
status <- str_detect(tweets_full$created_at[nrow(tweets_full)], "2013$")
# Last loop is reached. Now clear the data frame
if (status) {
# Delete all tweets from 2013
old <- 0
for(r in 1:nrow(tweets_full)) {
status <- str_detect(tweets_full$created_at[r], "2013$")
if(is.na(status)) { status <- FALSE }
if(status) {
old <- old + 1
}
}
}
# Keep the newest 2013 row as the boundary marker, drop the rest.
if(old > 0) {
old <- old - 1
tweets_full <- head(tweets_full, -old)
}
rm(old)
# NOTE(review): duplicate of the trim block above (diff interleave).
if(old > 0) {
old <- old - 1
tweets_full <- head(tweets_full, -old)
}
rm(old)
break # End loop because 2013 is reached
break # End loop because 2013 is reached
}
# The last tweet is still from 2014, so we need another loop
else {
# Setting max_id to gather next 200 tweets
max_id <- tweets_full$id_str[nrow(tweets_full)]
loop <- loop + 1 # just for stats
}
}
# NOTE(review): duplicate else-branch (diff interleave).
# The last tweet is still from 2014, so we need another loop
else {
# Setting max_id to gather next 200 tweets
max_id <- tweets_full$id_str[nrow(tweets_full)]
loop <- loop + 1 # just for stats
}
# Every tweet from 2014 from user[r] is downloaded. Now next user in for-loop
cat("User:",user,"finished after",loop,"loops\n")
}
# ---------------
-9
View File
@@ -10,15 +10,6 @@ correctJSON <- function(string) {
return(string)
}
# Sanitize a raw JSON string before parsing: newline, carriage-return and
# caret characters are each replaced by a single space.
correctJSON2 <- function(string) {
  # chartr() maps each character of the first set to the character at the
  # same position in the second set -- one pass instead of three gsub() calls.
  # (Earlier experiment, kept for reference:
  #   sub(x = string, pattern = perl('\\\\(?![tn"])'), replacement = " "))
  chartr("\n\r^", "   ", string)
}
insertRow <- function(existingDF, newrow, r) {
r <- as.numeric(nrow(existingDF)) + 1
existingDF <- rbind(existingDF,newrow)