second run; improving behaviour at different places

2015-03-06 13:40:22 +02:00
parent e3ff17df48
commit 42bfe4c773
5 changed files with 233 additions and 231 deletions
@@ -1,220 +1,3 @@
 curchars <- curchars - 1
 } else {
 curhash <- FALSE
 }
 # Expand the current tag by the suffixes in tagexpand (possible plural forms).
 # Only done if it isn't an acronym or a specific hashtag.
 # Every variant is built from the unmodified base tag, so a non-empty first
 # suffix cannot leak into the later variants, and an empty tagexpand leaves
 # the tag untouched.
 if (!curacro && !curhash && length(tagexpand) > 0) {
   curtag <- str_c(curtag[1], tagexpand)
 }
 ##############
 # Report the Levenshtein distance this tag would be matched with:
 # short tags, acronyms and hashtags must match exactly.
 if (curchars <= 4 || curacro || curhash) {
   cat("distance 0\n")
 } else {
   cat("distance 1\n")
 }
 # Scratch check of the tag pre-processing for a single tag.
 curtag <- "EURATOM"
 # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
 curacro <- checkAcronym(string = curtag)
 # A leading "#" marks a specific hashtag: it is not handled as an acronym and
 # not expanded either. The marker is stripped before the length is measured.
 curhash <- str_detect(curtag, "^#")
 if (curhash) {
   curacro <- FALSE
   curtag <- str_replace(curtag, "#", "")
 }
 curchars <- nchar(curtag, type = "chars")
 # Expand the tag by the suffixes in tagexpand (possible plural forms), unless
 # it is an acronym or a hashtag. Every variant is built from the unmodified
 # base tag; an empty tagexpand leaves the tag untouched.
 if (!curacro && !curhash && length(tagexpand) > 0) {
   curtag <- str_c(curtag[1], tagexpand)
 }
 ##############
 # Report the Levenshtein distance this tag would be matched with.
 if (curchars <= 4 || curacro || curhash) {
   cat("distance 0\n")
 } else {
   cat("distance 1\n")
 }
 # Scratch check of the tag pre-processing for a single tag.
 curtag <- "Energiewende"
 # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
 curacro <- checkAcronym(string = curtag)
 # A leading "#" marks a specific hashtag: it is not handled as an acronym and
 # not expanded either. The marker is stripped before the length is measured.
 curhash <- str_detect(curtag, "^#")
 if (curhash) {
   curacro <- FALSE
   curtag <- str_replace(curtag, "#", "")
 }
 curchars <- nchar(curtag, type = "chars")
 # Expand the tag by the suffixes in tagexpand (possible plural forms), unless
 # it is an acronym or a hashtag. Every variant is built from the unmodified
 # base tag; an empty tagexpand leaves the tag untouched.
 if (!curacro && !curhash && length(tagexpand) > 0) {
   curtag <- str_c(curtag[1], tagexpand)
 }
 ##############
 # Report the Levenshtein distance this tag would be matched with.
 if (curchars <= 4 || curacro || curhash) {
   cat("distance 0\n")
 } else {
   cat("distance 1\n")
 }
 # Scratch check of the tag pre-processing for a single tag.
 curtag <- "bnd"
 # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
 curacro <- checkAcronym(string = curtag)
 # A leading "#" marks a specific hashtag: it is not handled as an acronym and
 # not expanded either. The marker is stripped before the length is measured.
 curhash <- str_detect(curtag, "^#")
 if (curhash) {
   curacro <- FALSE
   curtag <- str_replace(curtag, "#", "")
 }
 curchars <- nchar(curtag, type = "chars")
 # Expand the tag by the suffixes in tagexpand (possible plural forms), unless
 # it is an acronym or a hashtag. Every variant is built from the unmodified
 # base tag; an empty tagexpand leaves the tag untouched.
 if (!curacro && !curhash && length(tagexpand) > 0) {
   curtag <- str_c(curtag[1], tagexpand)
 }
 ##############
 # Report the Levenshtein distance this tag would be matched with.
 if (curchars <= 4 || curacro || curhash) {
   cat("distance 0\n")
 } else {
   cat("distance 1\n")
 }
 # Scratch check of the tag pre-processing for a hashtag-style tag.
 curtag <- "#WM"
 # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
 curacro <- checkAcronym(string = curtag)
 # A leading "#" marks a specific hashtag: it is not handled as an acronym and
 # not expanded either. The marker is stripped before the length is measured.
 curhash <- str_detect(curtag, "^#")
 if (curhash) {
   curacro <- FALSE
   curtag <- str_replace(curtag, "#", "")
 }
 curchars <- nchar(curtag, type = "chars")
 # Expand the tag by the suffixes in tagexpand (possible plural forms), unless
 # it is an acronym or a hashtag. Every variant is built from the unmodified
 # base tag; an empty tagexpand leaves the tag untouched.
 if (!curacro && !curhash && length(tagexpand) > 0) {
   curtag <- str_c(curtag[1], tagexpand)
 }
 ##############
 # Report the Levenshtein distance this tag would be matched with.
 if (curchars <= 4 || curacro || curhash) {
   cat("distance 0\n")
 } else {
   cat("distance 1\n")
 }
 # Show the resulting tag (with the "#" removed)
 curtag
 # Scratch check of the tag pre-processing for a single tag.
 curtag <- "Energiewende"
 # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
 curacro <- checkAcronym(string = curtag)
 # A leading "#" marks a specific hashtag: it is not handled as an acronym and
 # not expanded either. The marker is stripped before the length is measured.
 curhash <- str_detect(curtag, "^#")
 if (curhash) {
   curacro <- FALSE
   curtag <- str_replace(curtag, "#", "")
 }
 curchars <- nchar(curtag, type = "chars")
 # Expand the tag by the suffixes in tagexpand (possible plural forms), unless
 # it is an acronym or a hashtag. Every variant is built from the unmodified
 # base tag; an empty tagexpand leaves the tag untouched.
 if (!curacro && !curhash && length(tagexpand) > 0) {
   curtag <- str_c(curtag[1], tagexpand)
 }
 ##############
 # Report the Levenshtein distance this tag would be matched with.
 if (curchars <= 4 || curacro || curhash) {
   cat("distance 0\n")
 } else {
   cat("distance 1\n")
 }
 # Prepare a single tag for matching: acronym flag, hashtag flag, suffix
 # variants and the allowed Levenshtein distance.
 curtag <- "Energiewende"
 # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
 curacro <- checkAcronym(string = curtag)
 # A leading "#" marks a specific hashtag: it is not handled as an acronym and
 # not expanded either. The marker is stripped before the length is measured.
 curhash <- str_detect(curtag, "^#")
 if (curhash) {
   curacro <- FALSE
   curtag <- str_replace(curtag, "#", "")
 }
 curchars <- nchar(curtag, type = "chars")
 # Expand the tag by the suffixes in tagexpand (possible plural forms), unless
 # it is an acronym or a hashtag. Every variant is built from the unmodified
 # base tag; an empty tagexpand leaves the tag untouched.
 if (!curacro && !curhash && length(tagexpand) > 0) {
   curtag <- str_c(curtag[1], tagexpand)
 }
 # Set Levenshtein distance depending on char length, acronym and hashtag status
 if (curchars <= 4 || curacro || curhash) {
   curdistance <- 0
 } else {
   curdistance <- 1
 }
 # Manual checks of smartPatternMatch against one fixed sentence.
 # Print the tag variants that will be tried.
 curtag
 # Same sentence, four ways of passing the tag: the whole vector, its first
 # element, its second element, and the vector passed through sprintf("%s", ...)
 smartPatternMatch("Die Energiewende ist toll!", curtag, curdistance, curacro)
 smartPatternMatch("Die Energiewende ist toll!", curtag[1], curdistance, curacro)
 smartPatternMatch("Die Energiewende ist toll!", curtag[2], curdistance, curacro)
 smartPatternMatch("Die Energiewende ist toll!", sprintf("%s", curtag), curdistance, curacro)
 # First attempt at the per-variant loop.
 # NOTE(review): curtext is only assigned after this loop; this attempt
 # presumably used a stale or missing curtext - confirm.
 tags_found <- NULL
 # Match the tweet with each variant in curtag, one result per variant
 for(e in 1:length(curtag)) {
 tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
 }
 # Second attempt, now with curtext defined
 curtext <- "Die Energiewende ist toll!"
 tags_found <- NULL
 # Match the tweet with each variant in curtag, one result per variant
 for(e in 1:length(curtag)) {
 tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
 }
 # Inspect the per-variant results next to the variants themselves
 tags_found
 curtag
 # Prepare a hashtag-style tag and a test sentence for matching.
 curtag <- "#WM2014"
 curtext <- "Ich freu mich auf wm2014 sehr"
 # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
 curacro <- checkAcronym(string = curtag)
 # A leading "#" marks a specific hashtag: it is not handled as an acronym and
 # not expanded either. The marker is stripped before the length is measured.
 curhash <- str_detect(curtag, "^#")
 if (curhash) {
   curacro <- FALSE
   curtag <- str_replace(curtag, "#", "")
 }
 curchars <- nchar(curtag, type = "chars")
 # Expand the tag by the suffixes in tagexpand (possible plural forms), unless
 # it is an acronym or a hashtag. Every variant is built from the unmodified
 # base tag; an empty tagexpand leaves the tag untouched.
 if (!curacro && !curhash && length(tagexpand) > 0) {
   curtag <- str_c(curtag[1], tagexpand)
 }
 # Set Levenshtein distance depending on char length, acronym and hashtag status
 if (curchars <= 4 || curacro || curhash) {
   curdistance <- 0
 } else {
   curdistance <- 1
 }
 # Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance)
@@ -510,3 +293,220 @@ for(i in 1:20) { cat(i,"\n")
 Sys.sleep(10)}
 # Import the match results written by the categorisation run.
 # Look around the working directory first
 list.dirs()
 list.files()
 # Drop any results object left over from an earlier import
 rm(results)
 # From here on, file names are relative to matched-ids/
 setwd("matched-ids/")
 results_files <- list.files()
 results_files
 # Override the directory listing: only the single file all.csv is imported
 results_files <- "all.csv"
 # Read every file as a header-less, semicolon-separated table of four character
 # columns. The first file initialises results; later files are appended via
 # insertRow (helper defined elsewhere).
 for(r in 1:length(results_files)) {
 if(r == 1) {
 results <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
 } else {
 results_temp <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
 results <- insertRow(results, results_temp)
 }
 }
 # Clean up loop helpers. results_temp only exists when more than one file was
 # read, so with a single file rm() warns about the missing object.
 rm(r, results_temp, results_files)
 # Remove exact duplicate rows, name the columns, sort by id_str (a character
 # column, so the ordering is string-based) and renumber the rows
 results <- results[!duplicated(results), ]
 names(results) <- c("date", "id_str", "issue", "tags")
 results <- results[order(results$id_str), ]
 row.names(results) <- NULL
 # Spot-check individual rows
 results[23381,]
 results[53381,]
 results[43253,]
 # Resume folding the match rows of results into the per-day issue counters
 # and the per-tweet issue/tag lists, starting at row resume_at.
 # NOTE(review): the counters are not reset here, so earlier rows are assumed
 # to be counted already - confirm.
 resume_at <- 53371
 n_matches <- nrow(results)
 # Guard: resume_at:n_matches would count backwards if fewer rows exist
 todo <- if (n_matches >= resume_at) resume_at:n_matches else integer(0)
 for (r in todo) {
   curdate <- as.character(results$date[r])
   curid <- as.character(results$id_str[r])
   curissue <- as.character(results$issue[r])
   curtag <- as.character(results$tags[r])
   # Progress total comes from the data rather than a hard-coded row count
   cat("Sorting match", r, "of", n_matches, "\n")
   # Update issue counter (date and issue)
   day_rows <- issues[, "date"] == curdate
   issues[day_rows, curissue] <- issues[day_rows, curissue] + 1
   # Update tweet dataframe (id, issue and tags): the new entry is appended to
   # the existing string and followed by ","
   tweet_rows <- tweets[, "id_str"] == curid
   oldissue <- tweets[tweet_rows, "issue"]
   tweets[tweet_rows, "issue"] <- str_c(oldissue, curissue, ",")
   oldtag <- tweets[tweet_rows, "tags"]
   tweets[tweet_rows, "tags"] <- str_c(oldtag, curtag, ",")
 }
 # Reset all issue counters, then fold every match row of results into the
 # per-day issue counters and the per-tweet issue/tag lists.
 issues[issueheads] <- 0
 View(issues)
 n_matches <- nrow(results)
 for (r in seq_len(n_matches)) {
   curdate <- as.character(results$date[r])
   curid <- as.character(results$id_str[r])
   curissue <- as.character(results$issue[r])
   curtag <- as.character(results$tags[r])
   # Progress total comes from the data rather than a hard-coded row count
   cat("Sorting match", r, "of", n_matches, "\n")
   # Update issue counter (date and issue)
   day_rows <- issues[, "date"] == curdate
   issues[day_rows, curissue] <- issues[day_rows, curissue] + 1
   # Update tweet dataframe (id, issue and tags): the new entry is appended to
   # the existing string and followed by ","
   tweet_rows <- tweets[, "id_str"] == curid
   oldissue <- tweets[tweet_rows, "issue"]
   tweets[tweet_rows, "issue"] <- str_c(oldissue, curissue, ",")
   oldtag <- tweets[tweet_rows, "tags"]
   tweets[tweet_rows, "tags"] <- str_c(oldtag, curtag, ",")
 }
 require(lubridate)
 require(XML)
 require(ggplot2)
 require(reshape2)
 require(stringr)
 require(foreach)
 require(doParallel)
 # Fold every match row of results into the per-day issue counters and the
 # per-tweet issue/tag lists.
 n_matches <- nrow(results)
 for (r in seq_len(n_matches)) {
   curdate <- as.character(results$date[r])
   curid <- as.character(results$id_str[r])
   curissue <- as.character(results$issue[r])
   curtag <- as.character(results$tags[r])
   # Progress total comes from the data rather than a hard-coded row count
   cat("Sorting match", r, "of", n_matches, "\n")
   # Update issue counter (date and issue)
   day_rows <- issues[, "date"] == curdate
   issues[day_rows, curissue] <- issues[day_rows, curissue] + 1
   # Update tweet dataframe (id, issue and tags): the new entry is appended to
   # the existing string and followed by ","
   tweet_rows <- tweets[, "id_str"] == curid
   oldissue <- tweets[tweet_rows, "issue"]
   tweets[tweet_rows, "issue"] <- str_c(oldissue, curissue, ",")
   oldtag <- tweets[tweet_rows, "tags"]
   tweets[tweet_rows, "tags"] <- str_c(oldtag, curtag, ",")
 }
 # Start over: reload the untagged tweets and rebuild the empty issue table.
 # Spot-check two result rows first
 results[119,]
 results[120,]
 # First load attempt, made before switching to the project directory
 load(file = "tweets_untagged.RData")
 setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
 results_files <- "matched-ids/all.csv"
 # Load again, now relative to the project directory
 load(file = "tweets_untagged.RData")
 View(issues)
 # One row per day in drange; issue columns are added below
 issues <- data.frame(date = drange)
 # Read the issue definitions. Per line, everything up to and including an XML
 # comment is removed before the text is parsed into a list
 issuelist <- readLines("issues.xml")
 issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
 issuelist <- xmlToList(issuelist)
 # The top-level names of the parsed list become the issue columns, starting at 0
 issueheads <- names(issuelist)
 issues[issueheads] <- 0
 # Empty per-tweet issue and tag strings, filled by the sorting loop
 tweets$issue <- ""
 tweets$tags <- ""
 View(results)
 # Clean up import helpers; rm() warns for any of these that do not exist
 rm(r, results_temp, results_files)
 # Remove exact duplicate rows, name the columns, sort by id_str and renumber
 results <- results[!duplicated(results), ]
 names(results) <- c("date", "id_str", "issue", "tags")
 results <- results[order(results$id_str), ]
 row.names(results) <- NULL
 # Fold every match row of results into the per-day issue counters and the
 # per-tweet issue/tag lists.
 n_matches <- nrow(results)
 for (r in seq_len(n_matches)) {
   curdate <- as.character(results$date[r])
   curid <- as.character(results$id_str[r])
   curissue <- as.character(results$issue[r])
   curtag <- as.character(results$tags[r])
   # Progress total comes from the data rather than a hard-coded row count
   cat("Sorting match", r, "of", n_matches, "\n")
   # Update issue counter (date and issue)
   day_rows <- issues[, "date"] == curdate
   issues[day_rows, curissue] <- issues[day_rows, curissue] + 1
   # Update tweet dataframe (id, issue and tags): the new entry is appended to
   # the existing string and followed by ","
   tweet_rows <- tweets[, "id_str"] == curid
   oldissue <- tweets[tweet_rows, "issue"]
   tweets[tweet_rows, "issue"] <- str_c(oldissue, curissue, ",")
   oldtag <- tweets[tweet_rows, "tags"]
   tweets[tweet_rows, "tags"] <- str_c(oldtag, curtag, ",")
 }
 # Inspect the state the previous loop stopped in: the current date and issue,
 # the counter cell they address, and the known issue names
 curdate
 curissue
 issues[issues[, "date"] == curdate, curissue]
 issueheads
 # Switch to the second version of the issue definitions and rebuild the
 # empty issue table from it
 issuelist <- readLines("issues-v2.xml")
 # One row per day in drange; issue columns are added below
 issues <- data.frame(date = drange)
 issuelist <- readLines("issues-v2.xml")
 # Per line, everything up to and including an XML comment is removed before
 # the text is parsed into a list
 issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
 issuelist <- xmlToList(issuelist)
 # The top-level names of the parsed list become the issue columns, starting at 0
 issueheads <- names(issuelist)
 issues[issueheads] <- 0
 # Empty per-tweet issue and tag strings, filled by the sorting loop
 tweets$issue <- ""
 tweets$tags <- ""
 # Fold every match row of results into the per-day issue counters and the
 # per-tweet issue/tag lists.
 n_matches <- nrow(results)
 for (r in seq_len(n_matches)) {
   curdate <- as.character(results$date[r])
   curid <- as.character(results$id_str[r])
   curissue <- as.character(results$issue[r])
   curtag <- as.character(results$tags[r])
   # Progress total comes from the data rather than a hard-coded row count
   cat("Sorting match", r, "of", n_matches, "\n")
   # Update issue counter (date and issue)
   day_rows <- issues[, "date"] == curdate
   issues[day_rows, curissue] <- issues[day_rows, curissue] + 1
   # Update tweet dataframe (id, issue and tags): the new entry is appended to
   # the existing string and followed by ","
   tweet_rows <- tweets[, "id_str"] == curid
   oldissue <- tweets[tweet_rows, "issue"]
   tweets[tweet_rows, "issue"] <- str_c(oldissue, curissue, ",")
   oldtag <- tweets[tweet_rows, "tags"]
   tweets[tweet_rows, "tags"] <- str_c(oldtag, curtag, ",")
 }
 # Inspect the rows around the point where the previous run stopped
 results[33170,]
 results[33171,]
 results$date[33170]
 # Manually overwrite the date of row 33170 before resuming.
 # NOTE(review): presumably the original date had no matching row in issues,
 # which stopped the previous loop - confirm.
 results$date[33170] <- "2014-08-21"
 # Resume folding match rows into the per-day issue counters and the per-tweet
 # issue/tag lists. The counters are not reset here, so rows before resume_at
 # are assumed to be counted already.
 resume_at <- 33170
 n_matches <- nrow(results)
 # Guard: resume_at:n_matches would count backwards if fewer rows exist
 todo <- if (n_matches >= resume_at) resume_at:n_matches else integer(0)
 for (r in todo) {
   curdate <- as.character(results$date[r])
   curid <- as.character(results$id_str[r])
   curissue <- as.character(results$issue[r])
   curtag <- as.character(results$tags[r])
   # Progress total comes from the data rather than a hard-coded row count
   cat("Sorting match", r, "of", n_matches, "\n")
   # Update issue counter (date and issue)
   day_rows <- issues[, "date"] == curdate
   issues[day_rows, curissue] <- issues[day_rows, curissue] + 1
   # Update tweet dataframe (id, issue and tags): the new entry is appended to
   # the existing string and followed by ","
   tweet_rows <- tweets[, "id_str"] == curid
   oldissue <- tweets[tweet_rows, "issue"]
   tweets[tweet_rows, "issue"] <- str_c(oldissue, curissue, ",")
   oldtag <- tweets[tweet_rows, "tags"]
   tweets[tweet_rows, "tags"] <- str_c(oldtag, curtag, ",")
 }
 # Persist the outcome of the sorting loop: the tweets data frame (as RData and
 # as CSV) and the issues data frame (as RData), all in the working directory
 save(tweets, file="tweets_tagged.RData")
 write.csv(tweets, file="tweets.csv")
 save(issues, file="issues.RData")
 require(stringr)
 require(reshape2)
 require(ggplot2)
 require(vars)
 # Split the issue counters by column-name prefix: issues_i keeps every column
 # whose name does not start with "s", issues_s every column whose name does
 # not start with "i" (the date column stays in both).
 drop_s <- which(str_detect(names(issues), "^s"))
 drop_i <- which(str_detect(names(issues), "^i"))
 # Select with logical masks instead of negative indices: issues[, -integer(0)]
 # would select no columns at all when a prefix has no match.
 issues_i <- issues[, !str_detect(names(issues), "^s"), drop = FALSE]
 issues_s <- issues[, !str_detect(names(issues), "^i"), drop = FALSE]
 # Daily total and Shannon entropy over the issue columns of issues_i.
 # The issue columns are everything after the leading date column; the index is
 # fixed here, before the helper columns total and entropy are appended.
 issue_cols <- seq_len(ncol(issues_i))[-1]
 issues_i$total <- rowSums(issues_i[issue_cols])
 # Share of each issue in the day's total (NaN for days with total == 0)
 share <- as.matrix(issues_i[issue_cols]) / issues_i$total
 # Entropy per day is sum(-p * log(p)); an issue with p == 0 contributes
 # nothing, which is set explicitly because 0 * log(0) evaluates to NaN.
 terms <- -1 * share * log(share)
 terms[!is.na(share) & share == 0] <- 0
 issues_i$entropy <- rowSums(terms)
 # Daily total and Shannon entropy over the issue columns of issues_s.
 # The issue columns are everything after the leading date column; the index is
 # fixed here, before the helper columns total and entropy are appended.
 issue_cols <- seq_len(ncol(issues_s))[-1]
 issues_s$total <- rowSums(issues_s[issue_cols])
 # Share of each issue in the day's total (NaN for days with total == 0)
 share <- as.matrix(issues_s[issue_cols]) / issues_s$total
 # Entropy per day is sum(-p * log(p)); an issue with p == 0 contributes
 # nothing, which is set explicitly because 0 * log(0) evaluates to NaN.
 terms <- -1 * share * log(share)
 terms[!is.na(share) & share == 0] <- 0
 issues_s$entropy <- rowSums(terms)
 # Per-day overview table: tpd is filled with the tweet count below, ipd and
 # spd are the row totals of issues_i and issues_s.
 stats_total <- data.frame(date = drange, tpd = 0)
 stats_total$ipd <- issues_i$total
 stats_total$spd <- issues_s$total
 # Total number of tweets per day over time: for each day of drange, count the
 # id_str entries of the tweets whose created_at equals that day
 for (r in seq_along(drange)) {
   same_day <- tweets[, "created_at"] == drange[r]
   ids_that_day <- tweets[same_day, "id_str"]
   stats_total$tpd[r] <- length(ids_that_day)
 }
 # Both plots share the same mapping and layers: one line per variable over
 # date, plus a loess trend without confidence band drawn in colour 1.
 series_aes <- aes(x=date,y=value,colour=variable, group=variable)
 trend_layer <- geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color=1)
 # Plot of the melted per-day statistics (stats_melt is prepared elsewhere)
 g1 <- ggplot(data = stats_melt, series_aes)
 g1 <- g1 + geom_line() + trend_layer
 g1
 # Long-format table holding the daily entropy of issues_i
 stats_entropy <- data.frame(date=drange)
 stats_entropy$entropy <- issues_i$entropy
 stats_entropy <- melt(stats_entropy, id="date")
 # Same plot for the entropy series
 g1 <- ggplot(data = stats_entropy, series_aes)
 g1 <- g1 + geom_line() + trend_layer
 g1
 # Vector autoregression over the issue counters.
 # First attempt: columns 2 to 32 of issues, called with p=1 and type="none"
 test <- VAR(issues[,2:32], p=1, type="none")
 # Look at the tables to check how many columns they actually have
 View(issues_i)
 View(issues_s)
 View(issues)
 # Refit on columns 2 to 44 of issues.
 # NOTE(review): the wider range presumably follows the switch to
 # issues-v2.xml - confirm against ncol(issues).
 test <- VAR(issues[,2:44], p=1, type="none")
 # Model on issues_s columns 2 to 23 with issues_i columns 2 to 22 as exogen.
 # The result is not assigned, so it is only printed.
 # NOTE(review): type is given all four candidate values at once; presumably
 # VAR() then falls back to its first option - confirm in the vars docs.
 VAR(issues_s[,2:23], p=1, type=c("const", "trend", "both", "none"), season=NULL, exogen = issues_i[2:22])
 # Plot impulse responses of test: impulses are the names of issues_s columns
 # 2 to 23, responses the names of issues_i columns 2 to 22
 plot(irf(test, impulse = names(issues_s[2:23]), response = names(issues_i[2:22])))
@@ -21,7 +21,7 @@ drange <- date_start + days(0:drange)
 # Import issues and prepare everything
 # Will only be filled after the large categorisation loop
 issues <- data.frame(date = drange)
-issuelist <- readLines("issues.xml")
+issuelist <- readLines("issues-v2.xml")
 issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
 issuelist <- xmlToList(issuelist)
 issueheads <- names(issuelist)
@@ -66,7 +66,8 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
    for(i in 1:length(issueheads)) {
      curissue <- issueheads[i]
      curtags <- as.character(issuelist[[curissue]])  
-      curfile <- str_c(id_folder,"/",curissue,".csv")
+#       curfile <- str_c(id_folder,"/",curissue,".csv")
      curfile <- str_c(id_folder,"/",curdate,".csv")  # Possible solution to avoid buggy files when using many processes
      # Now test all tags of a single issue
      for(s in 1:length(curtags)) {
@@ -144,8 +145,9 @@ stopCluster(cl)
 # IMPORT RESULTS ----------------------------------------------------------
 # Import all files which have been generated at the categorisation run above.
-setwd("matched-ids/")
+#setwd("matched-ids/")
-results_files <- list.files()
+#results_files <- list.files()
 results_files <- "matched-ids/all.csv"
 for(r in 1:length(results_files)) {
  if(r == 1) {
    results <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
@@ -166,15 +168,15 @@ row.names(results) <- NULL
 # (which wasn't possible in the categorisation process because of parallelisation)
 # Reset issues counter
-# issues[issueheads] <- 0
+#issues[issueheads] <- 0
-for(r in 1:nrow(results)) {
+for(r in 33170:nrow(results)) {
  curdate <- as.character(results$date[r])
  curid <- as.character(results$id_str[r])
  curissue <- as.character(results$issue[r])
  curtag <- as.character(results$tags[r])
-  cat("Sorting match", r, "of 62827 \n")
+  cat("Sorting match", r, "of 53383 \n")
  # Update issue counter (date and issue)
  issues[issues[, "date"] == curdate, curissue] <- issues[issues[, "date"] == curdate, curissue] + 1
@@ -72,19 +72,19 @@ stats_entropy <- melt(stats_entropy, id="date")
 g1 <- ggplot(data = stats_entropy, aes(x=date,y=value,colour=variable, group=variable)) + 
  geom_line() + 
  geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color=1)
-# g1
+g1
 # VAR ---------------------------------------------------------------------
-test <- VAR(issues[,2:32], p=1, type=c("const", "trend", "both", "none"), season=NULL, exogen = NULL, lag.max = NULL, ic = c("AIC", "HQ", "SC", "FPE"))
+# test <- VAR(issues[,2:32], p=1, type=c("const", "trend", "both", "none"), season=NULL, exogen = NULL, lag.max = NULL, ic = c("AIC", "HQ", "SC", "FPE"))
-test <- VAR(issues_i[,2:22], p=1, type="none", exogen = issues_s[,2:3])
+# test <- VAR(issues_i[,2:22], p=1, type="none", exogen = issues_s[,2:3])
-test <- VAR(issues_s[,2:11], p=1, type="none")
+# test <- VAR(issues_s[,2:11], p=1, type="none")
-test <- VAR(issues[,2:32], p=1, type="none")
+test <- VAR(issues[,2:44], p=1, type="none")
-VAR(issues_s[,2:11], p=1, type=c("const", "trend", "both", "none"), season=NULL, exogen = issues_i[2:22])
+# VAR(issues_s[,2:23], p=1, type=c("const", "trend", "both", "none"), season=NULL, exogen = issues_i[2:22])
-plot(irf(test, impulse = names(issues_s[2:11]), response = names(issues_i[2:22])))
+plot(irf(test, impulse = names(issues_s[2:23]), response = names(issues_i[2:22])))
 capture.output(print(summary(test), prmsd=TRUE, digits=1), file="out.txt")