Bachelor thesis: "The influence of sensational issues on the political agenda setting in social media"
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

issuecomp-1-scraping.R 6.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. # PREPARATIONS ------------------------------------------------------------
  2. require(jsonlite)
  3. require(stringr)
  4. require(devtools)
  5. require(RTwitterAPI)
  6. setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
  7. source("issuecomp-functions.R")
  8. #acc_url <- "http://www.bundestwitter.de/api/politiker"
  9. #acc_df <- fromJSON(acc_url)
  10. acc_df <- read.csv("MdB-twitter.csv")
  11. delrow <- NULL
  12. for(r in 1:nrow(acc_df)) {
  13. acc <- as.character(acc_df$twitter_acc[r])
  14. if(!nzchar(acc)) {
  15. delrow <- c(delrow, r)
  16. }
  17. }
  18. acc_df <- acc_df[-delrow, ]
  19. rm(delrow, r, acc)
  20. acc_df$row.names <- NULL
  21. row.names(acc_df) <- NULL
  22. # COLLECT ALL TWEETS ------------------------------------------------------
  23. # http://www.joyofdata.de/blog/twitters-rest-api-v1-1-with-r-for-linux-and-windows/
  24. # --> devtools::install_github("joyofdata/RTwitterAPI")
  25. # https://dev.twitter.com/rest/reference/get/statuses/user_timeline
  26. api_params <- c(
  27. "oauth_consumer_key" = readLines("twitter-api-credentials.txt")[2],
  28. "oauth_nonce" = NA,
  29. "oauth_signature_method" = "HMAC-SHA1",
  30. "oauth_timestamp" = NA,
  31. "oauth_token" = readLines("twitter-api-credentials.txt")[4],
  32. "oauth_version" = "1.0",
  33. "consumer_secret" = readLines("twitter-api-credentials.txt")[3],
  34. "oauth_token_secret" = readLines("twitter-api-credentials.txt")[5]
  35. )
  36. #api_url2 <- "https://api.twitter.com/1.1/statuses/show.json"
  37. #id2="498492933922754560" # 499533113676931073(\" ), 325320073906622464(\\>), 498492933922754560(\"W)
  38. #query2 <- c(id=id2, trim_user="true", include_entities="false")
  39. #current2 <- twitter_api_call(api_url2, query2, api_params)
  40. api_url <- "https://api.twitter.com/1.1/statuses/user_timeline.json";
  41. max_count <- "200"
  42. keep <- c("created_at", "id_str", "text", "retweet_count")
  43. # tweets_complete: All tweets
  44. # tweets_full: All tweets of current user
  45. # tweets_temp: The current max 200 tweets of current user
  46. tweets_full <- data.frame(user=character(), name=character(), created_at=character(), id_str=character(), text=character(), retweet_count=character())
  47. tweets_complete <- tweets_full
  48. for(a in 1:nrow(acc_df)) {
  49. user <- as.character(acc_df$twitter_acc[a])
  50. name <- as.character(acc_df$name[a])
  51. max_id <- "999999999999999999"
  52. loop <- 1
  53. error <- 0
  54. repeat {
  55. # Define specific search query
  56. query <- c(include_rts=1, exclude_replies="true", trim_user="true", include_entities="false",
  57. screen_name=user,
  58. count=max_count,
  59. max_id=max_id);
  60. # At first, work with an temporary tweet-DB
  61. current <- twitter_api_call(api_url, query, api_params)
  62. rm(tweets_temp)
  63. tweets_temp <- fromJSON(correctJSON(current))
  64. ## START ERROR HANDLING ##
  65. # Empty API output
  66. status <- errorEmptyAPI(tweets_temp)
  67. if(status == 1) { Sys.sleep(3);error <- error + 1;next}
  68. if(status == 2) {break}
  69. # Contains "error" column
  70. status <- errorErrorColumn(tweets_temp)
  71. if(status == 1) { Sys.sleep(3);error <- error + 1;next}
  72. if(status == 2) {break}
  73. # Check if error code exists
  74. code <- errorCheckCode(tweets_temp) # 0 if no error
  75. if(code == 34) { # page does not exist
  76. status <- errorCode34()
  77. if(status == 1) { Sys.sleep(3);error <- error + 1;next}
  78. if(status == 2) {break}
  79. }
  80. if(code == 88) { # rate limit exceeded
  81. wait <- errorCode88()
  82. Sys.sleep(wait)
  83. next
  84. }
  85. ## END ERROR HANDLING ##
  86. # Delete unnecessary columns and add username and real name to dataframe
  87. tweets_temp <- tweets_temp[keep]
  88. tweets_temp <- cbind(user=user, name=name, tweets_temp)
  89. # Now sleep 3 second to dodge 300queries/15min limit
  90. cat("[",a,"/",nrow(acc_df),"] ", sep = "")
  91. cat("User: ",user," in loop: ",loop,". \n", sep = "")
  92. Sys.sleep(2)
  93. if(tweets_full$id_str[nrow(tweets_full)] == tweets_temp$id_str[nrow(tweets_temp)] && nrow(tweets_full) > 0) {
  94. cat("[INFO] Last tweet of temp is last tweet of full. Abort loop and begin with next user.\n")
  95. break
  96. }
  97. ## CHECK if we need another loop
  98. # Extract year of last tweet in tweets_temp
  99. year_last <- as.numeric(str_extract(tweets_temp$created_at[nrow(tweets_temp)], "\\d{4}$"))
  100. status <- year_last < 2014
  101. # Is last tweet earlier than 2014? So break the loop
  102. if (status) {
  103. # Is even the first tweet older than 2014?
  104. year_first <- as.numeric(str_extract(tweets_temp$created_at[1], "\\d{4}$"))
  105. status <- year_first < 2014
  106. if(status) {
  107. cat("[INFO] Timeline enhält keinen einzigen aus 2014\n")
  108. }
  109. tweets_full <- insertRow(tweets_full, tweets_temp)
  110. break # End loop because 2013 is reached
  111. }
  112. # The last tweet is newer or equal 2014, so we need another loop
  113. else {
  114. # Setting max_id to gather next 200 tweets
  115. max_id <- tweets_temp$id_str[nrow(tweets_temp)]
  116. loop <- loop + 1 # just for stats
  117. tweets_full <- insertRow(tweets_full, tweets_temp)
  118. }
  119. } # /repeat
  120. tweets_complete <- insertRow(tweets_complete, tweets_full)
  121. tweets_full <- head(tweets_full, -nrow(tweets_full)) # Empty tweets_full
  122. cat("User:",user,"finished after",loop,"loops. Total Tweets now:",nrow(tweets_complete),"\n")
  123. write.csv(tweets_complete, "tweets_complete.csv")
  124. # Every tweet from 2014 or newer from user[a] is downloaded. Now next user in for-loop
  125. }
  126. rm(a, code, current, error, loop, max_id, max_count, year_first, year_last, name, query, status, user, wait, tweets_full, tweets_temp)
  127. # CLEAR DATAFRAME ---------------------------------------------------------
  128. save(tweets_complete, file="tweets_complete.RData")
  129. # Remove duplicates
  130. tweets <- tweets_complete[!duplicated(tweets_complete), ]
  131. tweets <- na.omit(tweets)
  132. rm(tweets_complete)
  133. # Format dates in data frame
  134. Sys.setlocale("LC_TIME", "C")
  135. tweets$created_at <- as.POSIXct(tweets$created_at, format = "%a %b %d %H:%M:%S %z %Y")
  136. tweets <- tweets[order(tweets$created_at), ]
  137. # Finally delete every tweet not from 2014 (2013 or 2015)
  138. delrow <- NULL
  139. for(r in 1:nrow(tweets)) {
  140. if(format(tweets$created_at[r], "%Y") != "2014") {
  141. delrow <- c(delrow, r)
  142. }
  143. curtext <- as.character(tweets$text[r])
  144. curtext <- str_replace_all(curtext, "$", " ")
  145. curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
  146. tweets$text[r] <- curtext
  147. }
  148. tweets <- tweets[-delrow, ]
  149. rm(delrow, r)
  150. # Convert dates to omit (unnecessary) time
  151. tweets$created_at <- format(tweets$created_at, "%Y-%m-%d")
  152. save(tweets, file="tweets_untagged.RData")