Bachelor thesis: "The influence of sensational issues on the political agenda setting in social media"
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

issuecomp-2-analysis-EXT.R 5.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. require(lubridate)
  2. require(XML)
  3. require(stringr)
  4. require(foreach)
  5. require(doParallel)
  6. source("issuecomp-functions.R")
  7. setwd("E:/max.mehl")
  8. load(file = "tweets_untagged.RData")
  9. # Create date range
  10. date_start <- as.Date("2014-01-01")
  11. date_end <- as.Date("2014-12-31")
  12. drange <- as.integer(date_end - date_start)
  13. drange <- date_start + days(0:drange)
  14. # Import issues and prepare everything
  15. # Will only be filled after the large categorisation loop
  16. issues <- data.frame(date = drange)
  17. issuelist <- readLines("issues-v2.xml")
  18. issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
  19. issuelist <- xmlToList(issuelist)
  20. issueheads <- names(issuelist)
  21. issues[issueheads] <- 0
  22. tweets$issue <- ""
  23. tweets$tags <- ""
# MATCH TWEETS ------------------------------------------------------------

# Folder where all match results will be saved (safer for backup and
# re-import). Any results from a previous run are wiped first.
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)

# Suffixes used to expand a tag into plural/genitive etc. variants;
# "" keeps the original form (German inflections: -s, -n, -en, -er, -e).
tagexpand <- c("", "s", "n", "en", "er", "e")

# Parameters for parallelisation: truncate the log file, then start the
# worker cluster used by the foreach loop below.
# NOTE(review): worker count is hard-coded to 7 — confirm the host has enough cores.
writeLines(c(""), "issuecomp-analysis.log")
cl<-makeCluster(7)
registerDoParallel(cl)
  36. foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
  37. # Go through every day
  38. curdate <- issues$date[d]
  39. cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
  40. # Put all tweets from specific day in a temporary DF
  41. tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
  42. for(t in 1:nrow(tweets_curday)){
  43. # cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
  44. # Select tweet's text, make it lowercase and remove hashtag indicators (#)
  45. curtext <- as.character(tweets_curday$text[t])
  46. curtext <- str_replace_all(curtext, "#", "")
  47. curid <- as.character(tweets_curday$id_str[t])
  48. # Now test each single issue (not tag!)
  49. for(i in 1:length(issueheads)) {
  50. curissue <- issueheads[i]
  51. curtags <- as.character(issuelist[[curissue]])
  52. curfile <- str_c(id_folder,"/",curissue,".csv")
  53. # Now test all tags of a single issue
  54. for(s in 1:length(curtags)) {
  55. curtag <- curtags[s]
  56. curchars <- nchar(curtag, type = "chars")
  57. # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
  58. curacro <- checkAcronym(string = curtag)
  59. # Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
  60. if(str_detect(curtag, "^#")) {
  61. curacro <- FALSE # hashtags like #WM2014 are also written as #wm2014, so we need case-insensitivity
  62. curhash <- TRUE # But we need to mark it as hashtag, so it doesn't get extended or Levenshtein distance > 0
  63. curtag <- str_replace(curtag, "#", "")
  64. curchars <- curchars - 1
  65. } else {
  66. curhash <- FALSE
  67. }
  68. # Now expand the current tag by possible suffixes that may be plural forms
  69. # Only do if it isn't an acronym or specific hastag
  70. if(!curacro && !curhash) {
  71. for(e in 1:length(tagexpand)) {
  72. curtag[e] <- str_c(curtag[1], tagexpand[e])
  73. }
  74. }
  75. # Set Levenshtein distance depending on char length, acronym and hashtag status
  76. if(curchars <= 6 || curacro || curhash) { # Distance = 1 if 7 chars or longer
  77. curdistance <- 0
  78. } else {
  79. curdistance <- 1
  80. }
  81. # Match current tweet with tag.
  82. # Allow 1 Levenshtein distance if tag is >= 5 letters and no hashtag or acronym
  83. # Make is case-sensitiv if tag is an acronym
  84. tags_found <- NULL
  85. # Match the tweet with each variation of tagexpand
  86. for(e in 1:length(curtag)) {
  87. tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
  88. }
  89. tags_found <- any(tags_found)
  90. curtag <- curtag[1]
  91. if(tags_found == TRUE) {
  92. # # Raise number of findings on this day for this issue by 1
  93. # issues[d,curissue] <- issues[d,curissue] + 1
  94. #
  95. # # Add issue and first matched tag of tweet to tweets-DF
  96. # oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
  97. # tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
  98. # oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
  99. # tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
  100. # Add information to file for function viewPatternMatching
  101. write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE)
  102. # cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE)
  103. # data.frame(date=curdate, issue=curissue)
  104. break # next issue, no more tags from same issue
  105. }
  106. else {
  107. #cat("Nothing found\n")
  108. }
  109. } # /for curtags
  110. } # /for issuelist
  111. } # /for tweets_curday
  112. } # /for drange
# Optional cleanup of loop-local variables (kept disabled by the author):
#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found)
# Shut down the parallel workers started for the categorisation loop.
stopCluster(cl)