Current status
This commit is contained in:
@@ -1,242 +1,512 @@
|
|||||||
require(lubridate)
|
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
|
||||||
require(XML)
|
curacro <- checkAcronym(string = curtag)
|
||||||
require(ggplot2)
|
# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
|
||||||
require(reshape2)
|
if(str_detect(curtag, "^#")) {
|
||||||
|
curacro <- FALSE
|
||||||
|
curhash <- TRUE
|
||||||
|
curtag <- str_replace(curtag, "#", "")
|
||||||
|
curchars <- curchars - 1
|
||||||
|
} else {
|
||||||
|
curhash <- FALSE
|
||||||
|
}
|
||||||
|
# Now expand the current tag by possible suffixes that may be plural forms
|
||||||
|
# Only do if it isn't an acronym or specific hashtag
|
||||||
|
if(!curacro && !curhash) {
|
||||||
|
for(e in 1:length(tagexpand)) {
|
||||||
|
curtag[e] <- str_c(curtag[1], tagexpand[e])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
##############
|
||||||
|
if(curchars <= 4 || curacro || curhash) {
|
||||||
|
cat("distance 0\n")
|
||||||
|
} else {
|
||||||
|
cat("distance 1\n")
|
||||||
|
}
|
||||||
|
curtag <- "EURATOM"
|
||||||
|
curchars <- nchar(curtag, type = "chars")
|
||||||
|
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
|
||||||
|
curacro <- checkAcronym(string = curtag)
|
||||||
|
# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
|
||||||
|
if(str_detect(curtag, "^#")) {
|
||||||
|
curacro <- FALSE
|
||||||
|
curhash <- TRUE
|
||||||
|
curtag <- str_replace(curtag, "#", "")
|
||||||
|
curchars <- curchars - 1
|
||||||
|
} else {
|
||||||
|
curhash <- FALSE
|
||||||
|
}
|
||||||
|
# Now expand the current tag by possible suffixes that may be plural forms
|
||||||
|
# Only do if it isn't an acronym or specific hashtag
|
||||||
|
if(!curacro && !curhash) {
|
||||||
|
for(e in 1:length(tagexpand)) {
|
||||||
|
curtag[e] <- str_c(curtag[1], tagexpand[e])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
##############
|
||||||
|
if(curchars <= 4 || curacro || curhash) {
|
||||||
|
cat("distance 0\n")
|
||||||
|
} else {
|
||||||
|
cat("distance 1\n")
|
||||||
|
}
|
||||||
|
curtag <- "Energiewende"
|
||||||
|
curchars <- nchar(curtag, type = "chars")
|
||||||
|
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
|
||||||
|
curacro <- checkAcronym(string = curtag)
|
||||||
|
# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
|
||||||
|
if(str_detect(curtag, "^#")) {
|
||||||
|
curacro <- FALSE
|
||||||
|
curhash <- TRUE
|
||||||
|
curtag <- str_replace(curtag, "#", "")
|
||||||
|
curchars <- curchars - 1
|
||||||
|
} else {
|
||||||
|
curhash <- FALSE
|
||||||
|
}
|
||||||
|
# Now expand the current tag by possible suffixes that may be plural forms
|
||||||
|
# Only do if it isn't an acronym or specific hashtag
|
||||||
|
if(!curacro && !curhash) {
|
||||||
|
for(e in 1:length(tagexpand)) {
|
||||||
|
curtag[e] <- str_c(curtag[1], tagexpand[e])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
##############
|
||||||
|
if(curchars <= 4 || curacro || curhash) {
|
||||||
|
cat("distance 0\n")
|
||||||
|
} else {
|
||||||
|
cat("distance 1\n")
|
||||||
|
}
|
||||||
|
curtag <- "bnd"
|
||||||
|
curchars <- nchar(curtag, type = "chars")
|
||||||
|
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
|
||||||
|
curacro <- checkAcronym(string = curtag)
|
||||||
|
# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
|
||||||
|
if(str_detect(curtag, "^#")) {
|
||||||
|
curacro <- FALSE
|
||||||
|
curhash <- TRUE
|
||||||
|
curtag <- str_replace(curtag, "#", "")
|
||||||
|
curchars <- curchars - 1
|
||||||
|
} else {
|
||||||
|
curhash <- FALSE
|
||||||
|
}
|
||||||
|
# Now expand the current tag by possible suffixes that may be plural forms
|
||||||
|
# Only do if it isn't an acronym or specific hashtag
|
||||||
|
if(!curacro && !curhash) {
|
||||||
|
for(e in 1:length(tagexpand)) {
|
||||||
|
curtag[e] <- str_c(curtag[1], tagexpand[e])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
##############
|
||||||
|
if(curchars <= 4 || curacro || curhash) {
|
||||||
|
cat("distance 0\n")
|
||||||
|
} else {
|
||||||
|
cat("distance 1\n")
|
||||||
|
}
|
||||||
|
curtag <- "#WM"
|
||||||
|
curchars <- nchar(curtag, type = "chars")
|
||||||
|
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
|
||||||
|
curacro <- checkAcronym(string = curtag)
|
||||||
|
# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
|
||||||
|
if(str_detect(curtag, "^#")) {
|
||||||
|
curacro <- FALSE
|
||||||
|
curhash <- TRUE
|
||||||
|
curtag <- str_replace(curtag, "#", "")
|
||||||
|
curchars <- curchars - 1
|
||||||
|
} else {
|
||||||
|
curhash <- FALSE
|
||||||
|
}
|
||||||
|
# Now expand the current tag by possible suffixes that may be plural forms
|
||||||
|
# Only do if it isn't an acronym or specific hashtag
|
||||||
|
if(!curacro && !curhash) {
|
||||||
|
for(e in 1:length(tagexpand)) {
|
||||||
|
curtag[e] <- str_c(curtag[1], tagexpand[e])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
##############
|
||||||
|
if(curchars <= 4 || curacro || curhash) {
|
||||||
|
cat("distance 0\n")
|
||||||
|
} else {
|
||||||
|
cat("distance 1\n")
|
||||||
|
}
|
||||||
|
curtag
|
||||||
|
curtag <- "Energiewende"
|
||||||
|
curchars <- nchar(curtag, type = "chars")
|
||||||
|
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
|
||||||
|
curacro <- checkAcronym(string = curtag)
|
||||||
|
# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
|
||||||
|
if(str_detect(curtag, "^#")) {
|
||||||
|
curacro <- FALSE
|
||||||
|
curhash <- TRUE
|
||||||
|
curtag <- str_replace(curtag, "#", "")
|
||||||
|
curchars <- curchars - 1
|
||||||
|
} else {
|
||||||
|
curhash <- FALSE
|
||||||
|
}
|
||||||
|
# Now expand the current tag by possible suffixes that may be plural forms
|
||||||
|
# Only do if it isn't an acronym or specific hashtag
|
||||||
|
if(!curacro && !curhash) {
|
||||||
|
for(e in 1:length(tagexpand)) {
|
||||||
|
curtag[e] <- str_c(curtag[1], tagexpand[e])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
##############
|
||||||
|
if(curchars <= 4 || curacro || curhash) {
|
||||||
|
cat("distance 0\n")
|
||||||
|
} else {
|
||||||
|
cat("distance 1\n")
|
||||||
|
}
|
||||||
|
curtag <- "Energiewende"
|
||||||
|
curchars <- nchar(curtag, type = "chars")
|
||||||
|
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
|
||||||
|
curacro <- checkAcronym(string = curtag)
|
||||||
|
# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
|
||||||
|
if(str_detect(curtag, "^#")) {
|
||||||
|
curacro <- FALSE
|
||||||
|
curhash <- TRUE
|
||||||
|
curtag <- str_replace(curtag, "#", "")
|
||||||
|
curchars <- curchars - 1
|
||||||
|
} else {
|
||||||
|
curhash <- FALSE
|
||||||
|
}
|
||||||
|
# Now expand the current tag by possible suffixes that may be plural forms
|
||||||
|
# Only do if it isn't an acronym or specific hashtag
|
||||||
|
if(!curacro && !curhash) {
|
||||||
|
for(e in 1:length(tagexpand)) {
|
||||||
|
curtag[e] <- str_c(curtag[1], tagexpand[e])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
# Set Levenshtein distance depending on char length, acronym and hashtag status
|
||||||
|
if(curchars <= 4 || curacro || curhash) {
|
||||||
|
curdistance <- 0
|
||||||
|
} else {
|
||||||
|
curdistance <- 1
|
||||||
|
}
|
||||||
|
curtag
|
||||||
|
smartPatternMatch("Die Energiewende ist toll!", curtag, curdistance, curacro)
|
||||||
|
smartPatternMatch("Die Energiewende ist toll!", curtag[1], curdistance, curacro)
|
||||||
|
smartPatternMatch("Die Energiewende ist toll!", curtag[2], curdistance, curacro)
|
||||||
|
smartPatternMatch("Die Energiewende ist toll!", sprintf("%s", curtag), curdistance, curacro)
|
||||||
|
tags_found <- NULL
|
||||||
|
# Match the tweet with each variation of tagexpand
|
||||||
|
for(e in 1:length(curtag)) {
|
||||||
|
tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
|
||||||
|
}
|
||||||
|
curtext <- "Die Energiewende ist toll!"
|
||||||
|
tags_found <- NULL
|
||||||
|
# Match the tweet with each variation of tagexpand
|
||||||
|
for(e in 1:length(curtag)) {
|
||||||
|
tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
|
||||||
|
}
|
||||||
|
tags_found
|
||||||
|
curtag
|
||||||
|
curtag <- "#WM2014"
|
||||||
|
curtext <- "Ich freu mich auf wm2014 sehr"
|
||||||
|
curchars <- nchar(curtag, type = "chars")
|
||||||
|
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
|
||||||
|
curacro <- checkAcronym(string = curtag)
|
||||||
|
# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
|
||||||
|
if(str_detect(curtag, "^#")) {
|
||||||
|
curacro <- FALSE
|
||||||
|
curhash <- TRUE
|
||||||
|
curtag <- str_replace(curtag, "#", "")
|
||||||
|
curchars <- curchars - 1
|
||||||
|
} else {
|
||||||
|
curhash <- FALSE
|
||||||
|
}
|
||||||
|
# Now expand the current tag by possible suffixes that may be plural forms
|
||||||
|
# Only do if it isn't an acronym or specific hashtag
|
||||||
|
if(!curacro && !curhash) {
|
||||||
|
for(e in 1:length(tagexpand)) {
|
||||||
|
curtag[e] <- str_c(curtag[1], tagexpand[e])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
# Set Levenshtein distance depending on char length, acronym and hashtag status
|
||||||
|
if(curchars <= 4 || curacro || curhash) {
|
||||||
|
curdistance <- 0
|
||||||
|
} else {
|
||||||
|
curdistance <- 1
|
||||||
|
}
|
||||||
|
# Match current tweet with tag. Tags with >= 5 letters allow a Levenshtein distance of 1 (one changed letter); shorter tags, acronyms and hashtags must match exactly
|
||||||
|
tags_found <- NULL
|
||||||
|
# Match the tweet with each variation of tagexpand
|
||||||
|
for(e in 1:length(curtag)) {
|
||||||
|
tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
|
||||||
|
}
|
||||||
|
tags_found <- any(tags_found)
|
||||||
|
tags_found
|
||||||
|
curtag
|
||||||
|
curtext
|
||||||
|
curdistance
|
||||||
|
test <- VAR(issues[,2:32], p=3, type=c("const", "trend", "both", "none"), season=NULL, exogen = NULL, lag.max = NULL, ic = c("AIC", "HQ", "SC", "FPE"))
|
||||||
|
test
|
||||||
|
test <- VAR(issues[,2:32], p=1, type="none")
|
||||||
|
capture.output(print(summary(test), prmsd=TRUE, digits=1), file="out.txt")
|
||||||
|
View(issues)
|
||||||
|
test <- VAR(issues_i[,2:22], p=1, type="none", exogen = issues_s[,2])
|
||||||
|
test <- VAR(issues_i[,2:22], p=1, type="none", exogen = issues_s[,2:3])
|
||||||
|
capture.output(print(summary(test), prmsd=TRUE, digits=1), file="out.txt")
|
||||||
|
irf(test)
|
||||||
|
test <- VAR(issues_s[,2:11], p=1, type="none")
|
||||||
|
irf(test)
|
||||||
|
plot(irf(test))
|
||||||
|
test <- VAR(issues[,2:32], p=1, type="none")
|
||||||
|
plot(irf(test, impulse = names(issues_s[2:11]), response = names(issues_i[2:22])))
|
||||||
|
plot(irf(test, impulse = names(issues_s[2:11]), response = names(issues_i[2:22]), n.ahead = 5))
|
||||||
require(stringr)
|
require(stringr)
|
||||||
library(foreach)
|
|
||||||
library(doParallel)
|
|
||||||
source("issuecomp-functions.R")
|
|
||||||
getwd()
|
|
||||||
setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
|
|
||||||
getwd()
|
|
||||||
list.files()
|
|
||||||
list.files("matched-ids/")
|
|
||||||
load(file = "tweets_untagged.RData")
|
|
||||||
issues <- data.frame(date = drange)
|
|
||||||
# Create date range
|
|
||||||
date_start <- as.Date("2014-01-01")
|
|
||||||
date_end <- as.Date("2014-12-31")
|
|
||||||
drange <- as.integer(date_end - date_start)
|
|
||||||
drange <- date_start + days(0:drange)
|
|
||||||
issues <- data.frame(date = drange)
|
|
||||||
issuelist <- readLines("issues.xml")
|
|
||||||
issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
|
|
||||||
issuelist <- xmlToList(issuelist)
|
|
||||||
issueheads <- names(issuelist)
|
|
||||||
issues[issueheads] <- 0
|
|
||||||
tweets$issue <- ""
|
|
||||||
tweets$tags <- ""
|
|
||||||
View(issues)
|
|
||||||
list.files("matched-ids/")
|
|
||||||
results <- list.files("matched-ids/")
|
|
||||||
results
|
|
||||||
read.csv("matched-ids/i10.trans.csv")
|
|
||||||
read.csv("matched-ids/i10.trans.csv", sep=";")
|
|
||||||
read.csv("matched-ids/i10.trans.csv", sep=";", stringsAsFactors=F)
|
|
||||||
read.csv("matched-ids/i10.trans.csv", sep=";", stringsAsFactors=T)
|
|
||||||
reesult_files <- read.csv("matched-ids/i10.trans.csv", sep=";", stringsAsFactors=F)
|
|
||||||
View(reesult_files)
|
|
||||||
result_files <- read.csv("matched-ids/i10.trans.csv", sep=";", colClasses=c("date", "character", "character", "character"))
|
|
||||||
result_files <- read.csv("matched-ids/i10.trans.csv", sep=";", colClasses=c("character", "character", "character", "character"))
|
|
||||||
rm(reesult_files)
|
|
||||||
View(result_files)
|
|
||||||
nrow(result_files)
|
|
||||||
result_files <- result_files(!duplicated(result_files))
|
|
||||||
result_files <- result_files[!duplicated(result_files)]
|
|
||||||
result_files <- result_files[!duplicated(result_files), ]
|
|
||||||
nrow(result_files)
|
|
||||||
result_files <- read.csv("matched-ids/i10.trans.csv", sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
View(result_files)
|
|
||||||
read.results
|
|
||||||
results
|
|
||||||
setwd("matched-ids/")
|
|
||||||
list.files("")
|
|
||||||
getwd()
|
|
||||||
list.files()
|
|
||||||
results <- list.files()
|
|
||||||
results
|
|
||||||
results_cat <- read.csv(results, sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
results_cat <- read.csv(results[1], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
results_cat
|
|
||||||
View(results_cat)
|
|
||||||
source("issuecomp-functions.R")
|
|
||||||
setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
|
|
||||||
source("issuecomp-functions.R")
|
|
||||||
insertRow
|
|
||||||
results_temp <- read.csv(results[2], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
setwd("matched-ids/")
|
|
||||||
results_temp <- read.csv(results[2], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
rm(result_files)
|
|
||||||
insertRow(existingDF = results_cat, results_temp)
|
|
||||||
rm(results_cat)
|
|
||||||
for(r in 1:length(results)) {
|
|
||||||
if(r == 1) {
|
|
||||||
results_cat <- read.csv(results[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
} else {
|
|
||||||
results_temp <- read.csv(results[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
insertRow(results_cat, results_temp)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(r in 1:length(results)) {
|
|
||||||
if(r == 1) {
|
|
||||||
results_cat <- read.csv(results[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
} else {
|
|
||||||
results_temp <- read.csv(results[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
results_cat insertRow(results_cat, results_temp)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(r in 1:length(results)) {
|
|
||||||
if(r == 1) {
|
|
||||||
results_cat <- read.csv(results[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
} else {
|
|
||||||
results_temp <- read.csv(results[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
results_cat <- insertRow(results_cat, results_temp)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
View(results_cat)
|
|
||||||
results_cat[20000]
|
|
||||||
results_cat[20000, ]
|
|
||||||
rm(r, results_temp)
|
|
||||||
results_cat <- results_cat[!duplicated(results_cat), ]
|
|
||||||
View(results_cat)
|
|
||||||
rm(results, results_cat)
|
|
||||||
results_files <- list.files()
|
|
||||||
for(r in 1:length(results)) {
|
|
||||||
if(r == 1) {
|
|
||||||
results <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
} else {
|
|
||||||
results_temp <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
results <- insertRow(results_cat, results_temp)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
rm(r, results_temp)
|
|
||||||
results <- results[!duplicated(results), ]
|
|
||||||
results_files <- list.files()
|
|
||||||
for(r in 1:length(results_files)) {
|
|
||||||
if(r == 1) {
|
|
||||||
results <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
} else {
|
|
||||||
results_temp <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
results <- insertRow(results, results_temp)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
rm(r, results_temp)
|
|
||||||
results <- results[!duplicated(results), ]
|
|
||||||
View(results)
|
|
||||||
View(issues)
|
|
||||||
row.names(results) <- NULL
|
|
||||||
View(results)
|
|
||||||
rownames(results)
|
|
||||||
row.names(results)
|
|
||||||
names(results)
|
|
||||||
View(tweets)
|
|
||||||
View(tweets)
|
|
||||||
names(results) <- c("date", "id_str", "issue", "tags")
|
|
||||||
View(results)
|
|
||||||
results_test <- results[order(results$id_str)]
|
|
||||||
results_test <- results[order(results$id_str), ]
|
|
||||||
View(results_test)
|
|
||||||
results_files <- list.files()
|
|
||||||
for(r in 1:length(results_files)) {
|
|
||||||
if(r == 1) {
|
|
||||||
results <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
} else {
|
|
||||||
results_temp <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
|
||||||
results <- insertRow(results, results_temp)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
rm(r, results_temp)
|
|
||||||
rm(r, results_temp, results_files)
|
|
||||||
results <- results[!duplicated(results), ]
|
|
||||||
names(results)
|
|
||||||
names(results) <- c("date", "id_str", "issue", "tags")
|
|
||||||
View(results)
|
|
||||||
results_test <- results[order(results$id_str), ]
|
|
||||||
row.names(results) <- NULL
|
|
||||||
results <- results[order(results$id_str), ]
|
|
||||||
row.names(results) <- NULL
|
|
||||||
View(results)
|
|
||||||
rm(results_test)
|
|
||||||
View(issues)
|
|
||||||
as.character(results$date[2])
|
|
||||||
class(results$date)
|
|
||||||
class(issues$date)
|
|
||||||
View(issues)
|
|
||||||
as.character(issues$date[2])
|
|
||||||
issues$date[2]
|
|
||||||
issuelist <- readLines("issues.xml")
|
|
||||||
issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
|
|
||||||
issuelist <- xmlToList(issuelist)
|
|
||||||
issueheads <- names(issuelist)
|
|
||||||
require(lubridate)
|
|
||||||
require(XML)
|
require(XML)
|
||||||
require(ggplot2)
|
readYN <- function(question) {
|
||||||
require(reshape2)
|
n <- readline(prompt=question)
|
||||||
require(stringr)
|
n <- as.character(n)
|
||||||
library(foreach)
|
return(n)
|
||||||
library(doParallel)
|
|
||||||
issuelist <- readLines("issues.xml")
|
|
||||||
issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
|
|
||||||
issuelist <- xmlToList(issuelist)
|
|
||||||
issueheads <- names(issuelist)
|
|
||||||
setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
|
|
||||||
issuelist <- readLines("issues.xml")
|
|
||||||
issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
|
|
||||||
issuelist <- xmlToList(issuelist)
|
|
||||||
issueheads <- names(issuelist)
|
|
||||||
issues[issueheads] <- 0
|
|
||||||
curdate <- as.character(results$date[3])
|
|
||||||
curissue <- as.character(results$issue[3])
|
|
||||||
curdate
|
|
||||||
curissue
|
|
||||||
issues[curdate, curissue] <- issues[curdate, curissue] + 1
|
|
||||||
View(issues)
|
|
||||||
issues <- data.frame(date = drange)
|
|
||||||
issues[issueheads] <- 0
|
|
||||||
View(issues)
|
|
||||||
issues[issues[, "date"] == curdate, curissue] <- issues[issues[, "date"] == curdate, curissue] + 1
|
|
||||||
View(issues)
|
|
||||||
for(r in 1:nrow(results)) {
|
|
||||||
curdate <- as.character(results$date[r])
|
|
||||||
curissue <- as.character(results$issue[r])
|
|
||||||
issues[issues[, "date"] == curdate, curissue] <- issues[issues[, "date"] == curdate, curissue] + 1
|
|
||||||
}
|
}
|
||||||
View(issues)
|
checkIssue <- function(string, issuelist) {
|
||||||
issues[issueheads] <- 0
|
status <- any(str_detect(string, issuelist))
|
||||||
View(issues)
|
return(status)
|
||||||
for(r in 1:nrow(results)) {
|
}
|
||||||
curdate <- as.character(results$date[r])
|
checkAllIssues <- function(string, issuelist) {
|
||||||
curid <- as.character(results$id_str[r])
|
status <- NULL
|
||||||
curissue <- as.character(results$issue[r])
|
for(i in 1:length(string)) {
|
||||||
curtag <- as.character(results$tags[r])
|
if(checkIssue(string[i], issuelist)) {
|
||||||
# Update issue counter (date and issue)
|
status[i] <- TRUE
|
||||||
issues[issues[, "date"] == curdate, curissue] <- issues[issues[, "date"] == curdate, curissue] + 1
|
}
|
||||||
# Update tweet dataframe (id, issue and tags)
|
else {
|
||||||
oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
|
cat("Issue",string[i],"does not exist. Please try again.\n")
|
||||||
tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ",")
|
status[i] <- FALSE
|
||||||
oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
|
}
|
||||||
tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ",")
|
}
|
||||||
|
return(status)
|
||||||
}
|
}
|
||||||
View(tweets)
|
View(tweets)
|
||||||
tweets$issue <- ""
|
write.csv(tweets, file="tweets.csv")
|
||||||
tweets$tags <- ""
|
c_tweets <- read.csv("tweets.csv", colClasses="character")
|
||||||
View(tweets)
|
View(c_tweets)
|
||||||
issues[issueheads] <- 0
|
c_tweets$X <- NULL
|
||||||
for(r in 1:nrow(results)) {
|
c_issues <- data.frame(date = drange)
|
||||||
curdate <- as.character(results$date[r])
|
c_issuelist <- xmlToList("issues.xml")
|
||||||
curid <- as.character(results$id_str[r])
|
c_issueheads <- names(issuelist)
|
||||||
curissue <- as.character(results$issue[r])
|
c_issues[issueheads] <- 0
|
||||||
curtag <- as.character(results$tags[r])
|
source("issuecomp-codingsample-function.R")
|
||||||
cat("Sorting match", r, "from", nrow(results), "\n")
|
rm(c_err, c_result, c_samid, c_samno,c_samtags,c_samissue,c_samtext,c_yn)
|
||||||
# Update issue counter (date and issue)
|
c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
|
||||||
issues[issues[, "date"] == curdate, curissue] <- issues[issues[, "date"] == curdate, curissue] + 1
|
View(c_errors)
|
||||||
# Update tweet dataframe (id, issue and tags)
|
names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
|
||||||
oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
|
View(c_errors)
|
||||||
tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ",")
|
for(r in 1:nrow(c_errors)) {
|
||||||
oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
|
c_errcode <- as.character(c_errors$code[r])
|
||||||
tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ",")
|
c_errissue <- as.character(c_errors$issue[r])
|
||||||
|
c_errtags <- as.character(c_errors$tags[r])
|
||||||
|
c_errtext <- as.character(c_errors$text[r])
|
||||||
|
c_errid <- as.character(c_errors$str_id[r])
|
||||||
|
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
|
||||||
|
source("issuecomp-codingsample-function2.R")
|
||||||
}
|
}
|
||||||
View(issues)
|
c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
|
||||||
View(tweets)
|
names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
|
||||||
View(tweets)
|
for(r in 1:nrow(c_errors)) {
|
||||||
save(tweets, file="tweets_tagged.RData")
|
c_errcode <- as.character(c_errors$code[r])
|
||||||
|
c_errissue <- as.character(c_errors$issue[r])
|
||||||
|
c_errtags <- as.character(c_errors$tags[r])
|
||||||
|
c_errtext <- as.character(c_errors$text[r])
|
||||||
|
c_errid <- as.character(c_errors$str_id[r])
|
||||||
|
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
|
||||||
|
source("issuecomp-codingsample-function2.R")
|
||||||
|
}
|
||||||
|
for(r in 1:nrow(c_errors)) {
|
||||||
|
c_errcode <- as.character(c_errors$code[r])
|
||||||
|
c_errissue <- as.character(c_errors$issue[r])
|
||||||
|
c_errtags <- as.character(c_errors$tags[r])
|
||||||
|
c_errtext <- as.character(c_errors$text[r])
|
||||||
|
c_errid <- as.character(c_errors$str_id[r])
|
||||||
|
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
|
||||||
|
source("issuecomp-codingsample-function2.R")
|
||||||
|
}
|
||||||
|
for(r in 1:nrow(c_errors)) {
|
||||||
|
c_errcode <- as.character(c_errors$code[r])
|
||||||
|
c_errissue <- as.character(c_errors$issue[r])
|
||||||
|
c_errtags <- as.character(c_errors$tags[r])
|
||||||
|
c_errtext <- as.character(c_errors$text[r])
|
||||||
|
c_errid <- as.character(c_errors$str_id[r])
|
||||||
|
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
|
||||||
|
source("issuecomp-codingsample-function2.R")
|
||||||
|
}
|
||||||
|
tagexpand
|
||||||
|
source("issuecomp-codingsample-function.R")
|
||||||
|
source("issuecomp-codingsample-function.R")
|
||||||
|
source("issuecomp-codingsample-function.R")
|
||||||
|
c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
|
||||||
|
names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
|
||||||
|
for(r in 1:nrow(c_errors)) {
|
||||||
|
c_errcode <- as.character(c_errors$code[r])
|
||||||
|
c_errissue <- as.character(c_errors$issue[r])
|
||||||
|
c_errtags <- as.character(c_errors$tags[r])
|
||||||
|
c_errtext <- as.character(c_errors$text[r])
|
||||||
|
c_errid <- as.character(c_errors$str_id[r])
|
||||||
|
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
|
||||||
|
source("issuecomp-codingsample-function2.R")
|
||||||
|
}
|
||||||
|
for(r in 1:nrow(c_errors)) {
|
||||||
|
c_errcode <- as.character(c_errors$code[r])
|
||||||
|
c_errissue <- as.character(c_errors$issue[r])
|
||||||
|
c_errtags <- as.character(c_errors$tags[r])
|
||||||
|
c_errtext <- as.character(c_errors$text[r])
|
||||||
|
c_errid <- as.character(c_errors$str_id[r])
|
||||||
|
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
|
||||||
|
source("issuecomp-codingsample-function2.R")
|
||||||
|
}
|
||||||
|
for(r in 1:nrow(c_errors)) {
|
||||||
|
c_errcode <- as.character(c_errors$code[r])
|
||||||
|
c_errissue <- as.character(c_errors$issue[r])
|
||||||
|
c_errtags <- as.character(c_errors$tags[r])
|
||||||
|
c_errtext <- as.character(c_errors$text[r])
|
||||||
|
c_errid <- as.character(c_errors$str_id[r])
|
||||||
|
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
|
||||||
|
source("issuecomp-codingsample-function2.R")
|
||||||
|
}
|
||||||
|
for(r in 1:nrow(c_errors)) {
|
||||||
|
c_errcode <- as.character(c_errors$code[r])
|
||||||
|
c_errissue <- as.character(c_errors$issue[r])
|
||||||
|
c_errtags <- as.character(c_errors$tags[r])
|
||||||
|
c_errtext <- as.character(c_errors$text[r])
|
||||||
|
c_errid <- as.character(c_errors$str_id[r])
|
||||||
|
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
|
||||||
|
source("issuecomp-codingsample-function2.R")
|
||||||
|
}
|
||||||
|
for(r in 1:nrow(c_errors)) {
|
||||||
|
c_errcode <- as.character(c_errors$code[r])
|
||||||
|
c_errissue <- as.character(c_errors$issue[r])
|
||||||
|
c_errtags <- as.character(c_errors$tags[r])
|
||||||
|
c_errtext <- as.character(c_errors$text[r])
|
||||||
|
c_errid <- as.character(c_errors$str_id[r])
|
||||||
|
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
|
||||||
|
source("issuecomp-codingsample-function2.R")
|
||||||
|
}
|
||||||
|
c_tmp <- read.csv("issuecomp-codingsample-error1.csv", header = F, colClasses="character")
|
||||||
|
View(c_tmp)
|
||||||
|
View(c_errors)
|
||||||
|
View(c_tmp)
|
||||||
|
names(c_tmp) <- c("str_id", "all", "wrong", "tags", "text")
|
||||||
|
View(c_tmp)
|
||||||
|
c_tmp[, c("wrong", "tagged", "all", "text")]
|
||||||
|
View(c_tmp)
|
||||||
|
names(c_tmp) <- c("str_id", "all", "wrong", "tagged", "text")
|
||||||
|
c_tmp[, c("wrong", "tagged", "all", "text")]
|
||||||
|
c_error1 <- c_tmp[, c("wrong", "tagged", "all", "text")]
|
||||||
|
View(c_error1)
|
||||||
|
c_tmp <- read.csv("issuecomp-codingsample-error2.csv", header = F, colClasses="character")
|
||||||
|
View(c_tmp)
|
||||||
|
c_tmp <- read.csv("issuecomp-codingsample-error2.csv", header = F, colClasses="character")
|
||||||
|
names(c_tmp) <- c("str_id", "all", "missing", "tagged", "text")
|
||||||
|
c_error1 <- c_tmp[, c("missing", "tagged", "all", "text")]
|
||||||
|
c_error2 <- c_tmp[, c("missing", "tagged", "all", "text")]
|
||||||
|
View(c_error2)
|
||||||
|
c_error2 <- c_tmp[, c("missing", "text", "tagged", "all")]
|
||||||
|
View(c_error2)
|
||||||
|
View(c_error1)
|
||||||
|
View(c_error2)
|
||||||
|
c_tmp <- read.csv("issuecomp-codingsample-correct.csv", header = F, colClasses="character")
|
||||||
|
View(c_tmp)
|
||||||
|
names(c_tmp) <- c("str_id", "status", "issue", "tags", "text")
|
||||||
|
View(c_tmp)
|
||||||
|
c_currect <- c_tmp
|
||||||
|
c_correct <- c_tmp
|
||||||
|
rm(c_currect)
|
||||||
|
View(c_correct)
|
||||||
|
source("issuecomp-codingsample-function.R")
|
||||||
|
rm(c_err, c_result, c_samid, c_samno,c_samtags,c_samissue,c_samtext,c_yn)
|
||||||
|
c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
|
||||||
|
c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
|
||||||
|
c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
|
||||||
|
c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
|
||||||
|
c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character", quote = "")
|
||||||
|
View(c_errors)
|
||||||
|
c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
|
||||||
|
test <- "Zitat "total dämlich!""
|
||||||
|
tweets$id_str == "523512815425175552"
|
||||||
|
tweets[tweets$id_str == "523512815425175552"]
|
||||||
|
tweets[tweets$id_str == "523512815425175552", ]
|
||||||
|
tweets[tweets$id_str == "523512815425175552", "text"]
|
||||||
|
test <- tweets[tweets$id_str == "523512815425175552", "text"]
|
||||||
|
test
|
||||||
|
test <- c_tweets[ctweets$id_str == "523512815425175552", "text"]
|
||||||
|
test <- c_tweets[c_tweets$id_str == "523512815425175552", "text"]
|
||||||
|
test
|
||||||
|
str_replace(test, "\\"", ")
|
||||||
|
str_replace(test, "\\"", "")
|
||||||
|
str_replace(test, "\"", "")
|
||||||
|
str_detect(test, "\"")
|
||||||
|
test <- as.character(c_tweets[c_tweets$id_str == "523512815425175552", "text"])
|
||||||
|
test
|
||||||
|
c_tweets <- read.csv("tweets.csv", colClasses="character")
|
||||||
|
for(r in 1:nrow(c_tweets)) {
|
||||||
|
curtext <- as.character(c_tweets$text[r])
|
||||||
|
if(str_detect(curtext, "\"") {
|
||||||
|
c_tweets$text[r] <- str_replace(curtext, "\"", "")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(r in 1:nrow(c_tweets)) {
|
||||||
|
curtext <- as.character(c_tweets$text[r])
|
||||||
|
if(str_detect(curtext, "\"") {
|
||||||
|
c_tweets$text[r] <- str_replace(curtext, "\"", "")
|
||||||
|
} else {}
|
||||||
|
}
|
||||||
|
for(r in 1:nrow(c_tweets)) {
|
||||||
|
curtext <- as.character(c_tweets$text[r])
|
||||||
|
if(str_detect(curtext, "\"") {
|
||||||
|
c_tweets$text[r] <- str_replace(curtext, "\"", "")
|
||||||
|
} else {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(r in 1:nrow(c_tweets)) {
|
||||||
|
curtext <- as.character(c_tweets$text[r])
|
||||||
|
if(str_detect(curtext, "\"")) {
|
||||||
|
c_tweets$text[r] <- str_replace(curtext, "\"", "")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
test <- as.character(c_tweets[c_tweets$id_str == "523512815425175552", "text"])
|
||||||
|
test
|
||||||
|
View(c_tweets)
|
||||||
|
c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
|
||||||
|
View(c_errors)
|
||||||
|
names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
|
||||||
|
View(c_errors)
|
||||||
|
for(r in 1:nrow(c_errors)) {
|
||||||
|
c_errcode <- as.character(c_errors$code[r])
|
||||||
|
c_errissue <- as.character(c_errors$issue[r])
|
||||||
|
c_errtags <- as.character(c_errors$tags[r])
|
||||||
|
c_errtext <- as.character(c_errors$text[r])
|
||||||
|
c_errid <- as.character(c_errors$str_id[r])
|
||||||
|
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
|
||||||
|
source("issuecomp-codingsample-function2.R")
|
||||||
|
}
|
||||||
|
issueheads
|
||||||
|
for(r in 1:nrow(c_errors)) {
|
||||||
|
c_errcode <- as.character(c_errors$code[r])
|
||||||
|
c_errissue <- as.character(c_errors$issue[r])
|
||||||
|
c_errtags <- as.character(c_errors$tags[r])
|
||||||
|
c_errtext <- as.character(c_errors$text[r])
|
||||||
|
c_errid <- as.character(c_errors$str_id[r])
|
||||||
|
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
|
||||||
|
source("issuecomp-codingsample-function2.R")
|
||||||
|
}
|
||||||
|
# All tweets with WRONG ISSUES
|
||||||
|
c_tmp <- read.csv("issuecomp-codingsample-error1.csv", header = F, colClasses="character")
|
||||||
|
names(c_tmp) <- c("str_id", "all", "wrong", "tagged", "text")
|
||||||
|
c_error1 <- c_tmp[, c("wrong", "tagged", "all", "text")]
|
||||||
|
# All tweets with MISSING ISSUES
|
||||||
|
c_tmp <- read.csv("issuecomp-codingsample-error2.csv", header = F, colClasses="character")
|
||||||
|
names(c_tmp) <- c("str_id", "all", "missing", "tagged", "text")
|
||||||
|
c_error2 <- c_tmp[, c("missing", "text", "tagged", "all")]
|
||||||
|
# All CORRECT tweets
|
||||||
|
c_tmp <- read.csv("issuecomp-codingsample-correct.csv", header = F, colClasses="character")
|
||||||
|
names(c_tmp) <- c("str_id", "status", "issue", "tags", "text")
|
||||||
|
c_correct <- c_tmp
|
||||||
|
View(c_error1)
|
||||||
|
View(c_error2)
|
||||||
|
View(c_error1)
|
||||||
|
View(c_correct)
|
||||||
|
test <- VAR(issues_i[,2:22], p=1, type="none", exogen = issues_s[,2:3])
|
||||||
|
plot(irf(test, impulse = names(issues_s[2:11]), response = names(issues_i[2:22])))
|
||||||
|
test <- VAR(issues[,2:32], p=1, type="none")
|
||||||
|
plot(irf(test, impulse = names(issues_s[2:11]), response = names(issues_i[2:22])))
|
||||||
|
VARselect(issues[,2:32], lag.max=8, type="none")
|
||||||
|
VARselect(issues[,2:32], lag.max=8, type="both")
|
||||||
|
VARselect(issues[,2:32], lag.max=30, type="both")
|
||||||
|
VARselect(issues[,2:32], lag.max=15, type="both")
|
||||||
|
|||||||
+3
-2
@@ -1,7 +1,8 @@
|
|||||||
tweets_complete.csv
|
tweets_complete.csv
|
||||||
tweets.csv
|
tweets.csv
|
||||||
tweets_untagged.csv
|
|
||||||
tweets_untagged.RData
|
|
||||||
.RData
|
.RData
|
||||||
matched-ids
|
matched-ids
|
||||||
issuecomp-analysis.log
|
issuecomp-analysis.log
|
||||||
|
issuecomp-codingsample-correct.csv
|
||||||
|
issuecomp-codingsample-error.csv
|
||||||
|
issuecomp-codingsample-error2.csv
|
||||||
|
|||||||
@@ -0,0 +1,138 @@
|
|||||||
|
require(lubridate)
|
||||||
|
require(XML)
|
||||||
|
require(stringr)
|
||||||
|
require(foreach)
|
||||||
|
require(doParallel)
|
||||||
|
|
||||||
|
source("issuecomp-functions.R")
|
||||||
|
setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
|
||||||
|
|
||||||
|
|
||||||
|
load(file = "tweets_untagged.RData")
|
||||||
|
|
||||||
|
# Create date range
|
||||||
|
date_start <- as.Date("2014-01-01")
|
||||||
|
date_end <- as.Date("2014-12-31")
|
||||||
|
drange <- as.integer(date_end - date_start)
|
||||||
|
drange <- date_start + days(0:drange)
|
||||||
|
|
||||||
|
# Import issues and prepare everything
|
||||||
|
# Will only be filled after the large categorisation loop
|
||||||
|
issues <- data.frame(date = drange)
|
||||||
|
issuelist <- readLines("issues-v2.xml")
|
||||||
|
issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
|
||||||
|
issuelist <- xmlToList(issuelist)
|
||||||
|
issueheads <- names(issuelist)
|
||||||
|
issues[issueheads] <- 0
|
||||||
|
tweets$issue <- ""
|
||||||
|
tweets$tags <- ""
|
||||||
|
|
||||||
|
|
||||||
|
# MATCH TWEETS ------------------------------------------------------------
|
||||||
|
|
||||||
|
# Create folder where all results will be saved (safer for backup and import)
|
||||||
|
id_folder <- "matched-ids"
|
||||||
|
unlink(id_folder, recursive = TRUE)
|
||||||
|
dir.create(id_folder)
|
||||||
|
|
||||||
|
# Tag expansion for plural, genitive etc.
|
||||||
|
tagexpand <- c("", "s", "n", "en", "er", "e")
|
||||||
|
|
||||||
|
# Parameters for parallelisation
|
||||||
|
writeLines(c(""), "issuecomp-analysis.log")
|
||||||
|
cl<-makeCluster(7)
|
||||||
|
registerDoParallel(cl)
|
||||||
|
|
||||||
|
# START CAT LOOP
|
||||||
|
foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
|
||||||
|
# Go through every day
|
||||||
|
curdate <- issues$date[d]
|
||||||
|
cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
|
||||||
|
|
||||||
|
# Put all tweets from specific day in a temporary DF
|
||||||
|
tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
|
||||||
|
|
||||||
|
for(t in 1:nrow(tweets_curday)){
|
||||||
|
# cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
|
||||||
|
# Select tweet's text and remove hashtag indicators (#); it is not lowercased here
|
||||||
|
curtext <- as.character(tweets_curday$text[t])
|
||||||
|
curtext <- str_replace_all(curtext, "#", "")
|
||||||
|
|
||||||
|
curid <- as.character(tweets_curday$id_str[t])
|
||||||
|
|
||||||
|
# Now test each single issue (not tag!)
|
||||||
|
for(i in 1:length(issueheads)) {
|
||||||
|
curissue <- issueheads[i]
|
||||||
|
curtags <- as.character(issuelist[[curissue]])
|
||||||
|
curfile <- str_c(id_folder,"/",curissue,".csv")
|
||||||
|
|
||||||
|
# Now test all tags of a single issue
|
||||||
|
for(s in 1:length(curtags)) {
|
||||||
|
curtag <- curtags[s]
|
||||||
|
curchars <- nchar(curtag, type = "chars")
|
||||||
|
|
||||||
|
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
|
||||||
|
curacro <- checkAcronym(string = curtag)
|
||||||
|
# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
|
||||||
|
if(str_detect(curtag, "^#")) {
|
||||||
|
curacro <- FALSE # hashtags like #WM2014 are also written as #wm2014, so we need case-insensitivity
|
||||||
|
curhash <- TRUE # But we need to mark it as hashtag, so it doesn't get extended or Levenshtein distance > 0
|
||||||
|
curtag <- str_replace(curtag, "#", "")
|
||||||
|
curchars <- curchars - 1
|
||||||
|
} else {
|
||||||
|
curhash <- FALSE
|
||||||
|
}
|
||||||
|
|
||||||
|
# Now expand the current tag by possible suffixes that may be plural forms
|
||||||
|
# Only do if it isn't an acronym or specific hashtag
|
||||||
|
if(!curacro && !curhash) {
|
||||||
|
for(e in 1:length(tagexpand)) {
|
||||||
|
curtag[e] <- str_c(curtag[1], tagexpand[e])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Set Levenshtein distance depending on char length, acronym and hashtag status
|
||||||
|
if(curchars <= 6 || curacro || curhash) { # Distance = 1 if 7 chars or longer
|
||||||
|
curdistance <- 0
|
||||||
|
} else {
|
||||||
|
curdistance <- 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# Match current tweet with tag.
|
||||||
|
# Allow 1 Levenshtein distance if tag is >= 7 letters and no hashtag or acronym
|
||||||
|
# Make it case-sensitive if tag is an acronym
|
||||||
|
|
||||||
|
tags_found <- NULL
|
||||||
|
# Match the tweet with each variation of tagexpand
|
||||||
|
for(e in 1:length(curtag)) {
|
||||||
|
tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
|
||||||
|
}
|
||||||
|
tags_found <- any(tags_found)
|
||||||
|
curtag <- curtag[1]
|
||||||
|
|
||||||
|
if(tags_found == TRUE) {
|
||||||
|
# # Raise number of findings on this day for this issue by 1
|
||||||
|
# issues[d,curissue] <- issues[d,curissue] + 1
|
||||||
|
#
|
||||||
|
# # Add issue and first matched tag of tweet to tweets-DF
|
||||||
|
# oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
|
||||||
|
# tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
|
||||||
|
# oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
|
||||||
|
# tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
|
||||||
|
|
||||||
|
# Add information to file for function viewPatternMatching
|
||||||
|
write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE)
|
||||||
|
# cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE)
|
||||||
|
# data.frame(date=curdate, issue=curissue)
|
||||||
|
break # next issue, no more tags from same issue
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
#cat("Nothing found\n")
|
||||||
|
}
|
||||||
|
} # /for curtags
|
||||||
|
} # /for issuelist
|
||||||
|
} # /for tweets_curday
|
||||||
|
} # /for drange
|
||||||
|
|
||||||
|
#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found)
|
||||||
|
stopCluster(cl)
|
||||||
@@ -3,8 +3,8 @@ require(XML)
|
|||||||
require(ggplot2)
|
require(ggplot2)
|
||||||
require(reshape2)
|
require(reshape2)
|
||||||
require(stringr)
|
require(stringr)
|
||||||
library(foreach)
|
require(foreach)
|
||||||
library(doParallel)
|
require(doParallel)
|
||||||
|
|
||||||
source("issuecomp-functions.R")
|
source("issuecomp-functions.R")
|
||||||
setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
|
setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
|
||||||
@@ -94,7 +94,7 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Set Levenshtein distance depending on char length, acronym and hashtag status
|
# Set Levenshtein distance depending on char length, acronym and hashtag status
|
||||||
if(curchars <= 4 || curacro || curhash) {
|
if(curchars <= 6 || curacro || curhash) { # Distance = 1 if 7 chars or longer
|
||||||
curdistance <- 0
|
curdistance <- 0
|
||||||
} else {
|
} else {
|
||||||
curdistance <- 1
|
curdistance <- 1
|
||||||
@@ -191,6 +191,7 @@ for(r in 1:nrow(results)) {
|
|||||||
# SAVING ------------------------------------------------------------------
|
# SAVING ------------------------------------------------------------------
|
||||||
|
|
||||||
save(tweets, file="tweets_tagged.RData")
|
save(tweets, file="tweets_tagged.RData")
|
||||||
|
write.csv(tweets, file="tweets.csv")
|
||||||
save(issues, file="issues.RData")
|
save(issues, file="issues.RData")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
+7
-2
@@ -72,15 +72,20 @@ stats_entropy <- melt(stats_entropy, id="date")
|
|||||||
g1 <- ggplot(data = stats_entropy, aes(x=date,y=value,colour=variable, group=variable)) +
|
g1 <- ggplot(data = stats_entropy, aes(x=date,y=value,colour=variable, group=variable)) +
|
||||||
geom_line() +
|
geom_line() +
|
||||||
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color=1)
|
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color=1)
|
||||||
g1
|
# g1
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# VAR ---------------------------------------------------------------------
|
# VAR ---------------------------------------------------------------------
|
||||||
|
|
||||||
test <- VAR(issues[,2:32], p=3, type=c("const", "trend", "both", "none"), season=NULL, exogen = NULL, lag.max = NULL, ic = c("AIC", "HQ", "SC", "FPE"))
|
test <- VAR(issues[,2:32], p=1, type=c("const", "trend", "both", "none"), season=NULL, exogen = NULL, lag.max = NULL, ic = c("AIC", "HQ", "SC", "FPE"))
|
||||||
|
test <- VAR(issues_i[,2:22], p=1, type="none", exogen = issues_s[,2:3])
|
||||||
|
test <- VAR(issues_s[,2:11], p=1, type="none")
|
||||||
|
test <- VAR(issues[,2:32], p=1, type="none")
|
||||||
VAR(issues_s[,2:11], p=1, type=c("const", "trend", "both", "none"), season=NULL, exogen = issues_i[2:22])
|
VAR(issues_s[,2:11], p=1, type=c("const", "trend", "both", "none"), season=NULL, exogen = issues_i[2:22])
|
||||||
|
|
||||||
|
plot(irf(test, impulse = names(issues_s[2:11]), response = names(issues_i[2:22])))
|
||||||
|
|
||||||
capture.output(print(summary(test), prmsd=TRUE, digits=1), file="out.txt")
|
capture.output(print(summary(test), prmsd=TRUE, digits=1), file="out.txt")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,31 +0,0 @@
|
|||||||
"443032724505624576",0,"","","RT @Linksfraktion: Sevim Dagdelen: Optionspflicht abschaffen – ohne „Wenn“ und „Aber“ http://t.co/PXVz60RyDa"
|
|
||||||
"464039981766684672",0,"","","Hange spricht von einer Kutsche ohne Dach in der wir fahren und hoffen, dass es nicht regne #btADA"
|
|
||||||
"528513118440525824",0,"","","Dora zu Erfolgen der #Hamburger Linken: kostenfreies Mittag + Initiative, dass Maklergebühr nicht von Vermieter*innen.bezahlt werden müssen "
|
|
||||||
"511144502590181376",0,"","","RT @2kdei: TY! RİP @SajadJiyad: post about David Haines? Use this photo. He would want us to remember him. URL #ISISmedi… "
|
|
||||||
"515791668969504768",0,"","","Anschauen: Beitrag Heute Show zu #TTIP URL "
|
|
||||||
"533717720689549312",0,"","","RT @JUKoMo: "Wie groß muss die Angst der SPD vor Julia Klöckner sein?" Starker @LSaktuellRP-Kommentar zum SPD-Parteitag: URL "
|
|
||||||
"472393996879527936",0,"","","RT @johannisbear: Bitte RT! Bitte helft @ulf_der_freak #Aurela darf nicht sterben! URL "
|
|
||||||
"499494814778662912",0,"","","Das wievielte Mal verspricht #Merkel die Angleichung der #Ostrenten an Westniveau? Es gibt gute Gründe zu misstrauen URL "
|
|
||||||
"530750832443400192",0,"","","ist - gezwungenermaßen - mit dem #Fernbus unterwegs #gdlstreik "
|
|
||||||
"532310093904089088",0,"","","SPD will Waldschluchtpfad als Dauerstandort - Nachrichten Gatow | SPANDAUER VOLKSBLATT URL "
|
|
||||||
"465407100408320000",0,"","","Die neue Modernität in den Kleingarten Kneipen URL "
|
|
||||||
"428481075732811776",0,"civil.208;","Datenschutz;",""Datenschutz soll nicht unverhältnismäßig geschwächt werden." Was heißt da unverhältnismäßig? #Regierungserklärung #Merkel #Bundestag "
|
|
||||||
"421711548189786114",0,"","","...und inzwischen gute Freunde. URL "
|
|
||||||
"532608281009979392",0,"","","RT @initiatived21: “@anipenny: ”Graswurzelbewegg medienpägogisch interessierter Lehrer vernetzt sich, braucht aber auch Unterstützg“ @Esken… "
|
|
||||||
"537931613464965121",0,"","","RT @StefanKaufmann: Es läuft Plenardebatte zum Haushalt des Bundesmin. für Bild. und Forschung. Trotz Schwarzer Null steigt Etat deutlich -… "
|
|
||||||
"499843193069129728",0,"","","#CETA - der komplette Text wurde heute geleakt: https://t.co/YeWUsSAHoB "
|
|
||||||
"477758769703944194",0,"","","Beim Tag der Offenen Tür des THW beeindruckt von der Vielfalt der Einsätze der Organisation. 10 Mill. im Haushalt sind hier gut eingesetzt "
|
|
||||||
"539467838520832000",0,"","","RT @spdbt: KoA-Vertrag muss gelten! @ThomasOppermann: „Bei #Maut darf es keine Mehrbelastung für deutsche Autofahrer geben." URL "
|
|
||||||
"449534756259381248",0,"","","Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL "
|
|
||||||
"514314498510168065",0,"","","RT @zeitonline_wir: Ban Ki Moon lädt zum #Klimagipfel. 120 Staats- und Regierungschefs kommen. Nur nicht Angela Merkel. (ae) #climate2014 h… "
|
|
||||||
"434413898054524928",0,"","","RT @KonstantinNotz: Zum Appell der SchriftstellerInnen URL und Videobericht zum Empfang der #grünen Bundestagsfraktion h… "
|
|
||||||
"509707990929534976",0,"","","RT @weltnetzTV: URL in Kooperation mit theREALnews! URL "
|
|
||||||
"542066419274637312",0,"","","RT @tagesthemen: "Ein bescheuerter Satz" - Sigmund Gottlieb kommentiert #YallaCSU. Jetzt in den Tagesthemen. URL "
|
|
||||||
"508921157131984896",0,"","","RT @drthomasfeist: #gain2014 @MartinRabanus @KaiGehring @PLengsfeld @KarambaDiaby @the_dfg @DAADnewyork @AvHStiftung @UniLeipzig URL "
|
|
||||||
"540745511024992257",0,"","","RT @maybritillner: .@MikeMohring Der Freistaat #Thueringen hat es nicht verdient von einer Regierung geführt zu werden, die sich als Experi… "
|
|
||||||
"507492334347763712",0,"","","Bitte RT: Wer kann helfen? Brauchen ganz dringend mobile Duschcontainer für die Flüchtlingszelte in #Nürnberg #followerpower "
|
|
||||||
"485509000268902400",0,"","","#WM2014 #Deutschland #GER - #Thalheim #Erzgebirge URL "
|
|
||||||
"542710860528234497",0,"","","Illegaler #Kunsthandel blüht! BReg muss nachbessern. Kein Umschlagplatz BRD f. Raubkunst. Dazu @GreenClaudia + ich: https://t.co/tr16CjQl42 "
|
|
||||||
"535525766919106560",0,"","","RT @EU_Salon: .@DJanecek: #TTIP Zivilgesellsxhaft hat Funktion zu treiben... ohne kritische Masse hätte es Diskussion so nicht gegeben. #ES3 "
|
|
||||||
"472481188641124352",0,"","","Vor dem Grand Serail in Beirut,- im Schatten @nouripour :-) URL "
|
|
||||||
"532818411374788608",0,"","","Laut einer Studie treibt unsere Debatte um Mietpreisbremse Mieten in die Höhe - tolle Wurst! "
|
|
||||||
|
Can't render this file because it contains an unexpected character in line 6 and column 43.
|
@@ -1,6 +0,0 @@
|
|||||||
"532463690013233152",1,"","","RT @cordhos: ! “@cducsubt: Paul Breitner vom @FCBayern - #Bundestag jetzt mit eigenem Fanclub @hahnflo @DoroBaer @dieAlbsteigerin http://t.…"
|
|
||||||
"516584367448403968",1,"","","Debate und critics in the parlamentarian assembly of the European Council about the elections in #Turkey @PACE_News @GeziParkii"
|
|
||||||
"516624274522918912",1,"","","Nach Bürgergespräch bin nun noch im Ratshof zur Ausstellungseröffnung - Wanderausstellung zum Bundestag."
|
|
||||||
"530357749188923392",2,"","","Streiks müssen Auswirkungen haben - und die #bahn verletzt täglich Verbraucherinteressen: URL #gdlstreik #gdl #db "
|
|
||||||
"465846218858708992",2,"","","RT @bioland_de: Wieso sollen Biobauern dafür bestraft werden, dass sie KEINE Pestizide einsetzen? Genau das hat die EU vor: URL "
|
|
||||||
"543111899794407426",2,"","","RT @DanielLuecking: #NSAUA @Peter_Schaar Verhältnismäßigkeit muss hinterfragt werden - Grundrechtsverletzungen durch Überwachung gehören au… "
|
|
||||||
|
@@ -1 +0,0 @@
|
|||||||
"530357749188923392","","labor.504","","Streiks müssen Auswirkungen haben - und die #bahn verletzt täglich Verbraucherinteressen: URL #gdlstreik #gdl #db "
|
|
||||||
|
@@ -7,7 +7,7 @@ repeat {
|
|||||||
|
|
||||||
|
|
||||||
repeat {
|
repeat {
|
||||||
cat("===================\n\n[TWEET]: ",c_samtext,"\n", "[ISSUES]: ", c_samissue, sep="")
|
cat("===================\n\n[TWEET]: ",c_samtext,"\n", "[ISSUES]: ", c_samissue, " (", c_samtags, ")", sep="")
|
||||||
c_yn <- readYN("Is the categorization correct AND complete?\nEnter y or n: ")
|
c_yn <- readYN("Is the categorization correct AND complete?\nEnter y or n: ")
|
||||||
|
|
||||||
# Check if input is correct
|
# Check if input is correct
|
||||||
|
|||||||
@@ -3,14 +3,14 @@ if(c_errcode == "1") {
|
|||||||
repeat {
|
repeat {
|
||||||
c_curissue <- readYN("Which issue is incorrect?: ")
|
c_curissue <- readYN("Which issue is incorrect?: ")
|
||||||
if(c_curissue == "QUIT") {cat("Quitting this item without changes or entries.\n"); break}
|
if(c_curissue == "QUIT") {cat("Quitting this item without changes or entries.\n"); break}
|
||||||
c_curissue <- unlist(str_split(c_curissue, ";"))
|
c_curissue <- unlist(str_split(c_curissue, ","))
|
||||||
|
|
||||||
status <- checkAllIssues(c_curissue, c_issueheads)
|
status <- checkAllIssues(c_curissue, c_issueheads)
|
||||||
|
|
||||||
# Only continue if every given issue really exists (all "status" have to be TRUE)
|
# Only continue if every given issue really exists (all "status" have to be TRUE)
|
||||||
if(all(status)) {
|
if(all(status)) {
|
||||||
# Revert str_split
|
# Revert str_split
|
||||||
c_curissue <- str_join(c_curissue,collapse = ";")
|
c_curissue <- str_join(c_curissue,collapse = ",")
|
||||||
|
|
||||||
# <ID>,<all issues>,<faulty issue(s),<all tags>,<tweet text>
|
# <ID>,<all issues>,<faulty issue(s),<all tags>,<tweet text>
|
||||||
c_result <- str_c("\"",c_errid,"\",\"",c_errissue,"\",\"",c_curissue,"\",\"",c_errtags,"\",\"",c_errtext,"\"")
|
c_result <- str_c("\"",c_errid,"\",\"",c_errissue,"\",\"",c_curissue,"\",\"",c_errtags,"\",\"",c_errtext,"\"")
|
||||||
@@ -24,14 +24,14 @@ if(c_errcode == "1") {
|
|||||||
repeat {
|
repeat {
|
||||||
c_curissue <- readYN("Which issue is missing?: ")
|
c_curissue <- readYN("Which issue is missing?: ")
|
||||||
if(c_curissue == "QUIT") {cat("Quitting this item without changes or entries.\n"); break}
|
if(c_curissue == "QUIT") {cat("Quitting this item without changes or entries.\n"); break}
|
||||||
c_curissue <- unlist(str_split(c_curissue, ";"))
|
c_curissue <- unlist(str_split(c_curissue, ","))
|
||||||
|
|
||||||
status <- checkAllIssues(c_curissue, c_issueheads)
|
status <- checkAllIssues(c_curissue, c_issueheads)
|
||||||
|
|
||||||
# Only continue if every given issue really exists (all "status" have to be TRUE)
|
# Only continue if every given issue really exists (all "status" have to be TRUE)
|
||||||
if(all(status)) {
|
if(all(status)) {
|
||||||
# Revert str_split
|
# Revert str_split
|
||||||
c_curissue <- str_join(c_curissue,collapse = ";")
|
c_curissue <- str_join(c_curissue,collapse = ",")
|
||||||
|
|
||||||
# <ID>,<all issues>,<faulty issue(s),<all tags>,<tweet text>
|
# <ID>,<all issues>,<faulty issue(s),<all tags>,<tweet text>
|
||||||
c_result <- str_c("\"",c_errid,"\",\"",c_errissue,"\",\"",c_curissue,"\",\"",c_errtags,"\",\"",c_errtext,"\"")
|
c_result <- str_c("\"",c_errid,"\",\"",c_errissue,"\",\"",c_curissue,"\",\"",c_errtags,"\",\"",c_errtext,"\"")
|
||||||
@@ -42,14 +42,51 @@ if(c_errcode == "1") {
|
|||||||
|
|
||||||
# There is an issue missing AND a issue was wrong
|
# There is an issue missing AND a issue was wrong
|
||||||
} else if(c_errcode == "3") {
|
} else if(c_errcode == "3") {
|
||||||
#cat("Which issue is incorrect and which one is missing?\n")
|
# #cat("Which issue is incorrect and which one is missing?\n")
|
||||||
|
# repeat {
|
||||||
|
# c_tag <- readYN("Which issue is incorrect?: ")
|
||||||
|
# c_tag <- unlist(str_split(c_tag, ","))
|
||||||
|
# for(i in 1:length(c_tag)) {
|
||||||
|
# if(checkIssue(c_tag[i], c_issueheads)) {} else {cat("Issue",c_tag[i],"does not exist. Please try again.\n")}
|
||||||
|
# }
|
||||||
|
|
||||||
repeat {
|
repeat {
|
||||||
c_tag <- readYN("Which issue is incorrect?: ")
|
c_curissue <- readYN("Which issue is incorrect?: ")
|
||||||
c_tag <- unlist(str_split(c_tag, ";"))
|
if(c_curissue == "QUIT") {cat("Quitting this item without changes or entries.\n"); break}
|
||||||
for(i in 1:length(c_tag)) {
|
c_curissue <- unlist(str_split(c_curissue, ","))
|
||||||
if(checkIssue(c_tag[i], c_issueheads)) {} else {cat("Issue",c_tag[i],"does not exist. Please try again.\n")}
|
|
||||||
|
status <- checkAllIssues(c_curissue, c_issueheads)
|
||||||
|
|
||||||
|
# Only continue if every given issue really exists (all "status" have to be TRUE)
|
||||||
|
if(all(status)) {
|
||||||
|
# Revert str_split
|
||||||
|
c_curissue <- str_join(c_curissue,collapse = ",")
|
||||||
|
|
||||||
|
# <ID>,<all issues>,<faulty issue(s)>,<all tags>,<tweet text>
|
||||||
|
c_result <- str_c("\"",c_errid,"\",\"",c_errissue,"\",\"",c_curissue,"\",\"",c_errtags,"\",\"",c_errtext,"\"")
|
||||||
|
write(c_result, file = "issuecomp-codingsample-error1.csv", append = T)
|
||||||
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
repeat {
|
||||||
|
c_curissue <- readYN("Which issue is missing?: ")
|
||||||
|
if(c_curissue == "QUIT") {cat("Quitting this item without changes or entries.\n"); break}
|
||||||
|
c_curissue <- unlist(str_split(c_curissue, ","))
|
||||||
|
|
||||||
|
status <- checkAllIssues(c_curissue, c_issueheads)
|
||||||
|
|
||||||
|
# Only continue if every given issue really exists (all "status" have to be TRUE)
|
||||||
|
if(all(status)) {
|
||||||
|
# Revert str_split
|
||||||
|
c_curissue <- str_join(c_curissue,collapse = ",")
|
||||||
|
|
||||||
|
# <ID>,<all issues>,<missing issue(s)>,<all tags>,<tweet text>
|
||||||
|
c_result <- str_c("\"",c_errid,"\",\"",c_errissue,"\",\"",c_curissue,"\",\"",c_errtags,"\",\"",c_errtext,"\"")
|
||||||
|
write(c_result, file = "issuecomp-codingsample-error2.csv", append = T)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# If this triggers the hell freezes...
|
# If this triggers the hell freezes...
|
||||||
|
|||||||
@@ -34,6 +34,13 @@ checkAllIssues <- function(string, issuelist) {
|
|||||||
|
|
||||||
# Read CSV of all tweets (with tags, if available)
|
# Read CSV of all tweets (with tags, if available)
|
||||||
c_tweets <- read.csv("tweets.csv", colClasses="character")
|
c_tweets <- read.csv("tweets.csv", colClasses="character")
|
||||||
|
# Replace quotes because it may cause problems when saving and reading as CSV files
|
||||||
|
for(r in 1:nrow(c_tweets)) {
|
||||||
|
curtext <- as.character(c_tweets$text[r])
|
||||||
|
if(str_detect(curtext, "\"")) {
|
||||||
|
c_tweets$text[r] <- str_replace(curtext, "\"", "")
|
||||||
|
}
|
||||||
|
}
|
||||||
c_tweets$X <- NULL
|
c_tweets$X <- NULL
|
||||||
|
|
||||||
# Read all issues from XML file
|
# Read all issues from XML file
|
||||||
@@ -66,10 +73,24 @@ for(r in 1:nrow(c_errors)) {
|
|||||||
c_errtext <- as.character(c_errors$text[r])
|
c_errtext <- as.character(c_errors$text[r])
|
||||||
c_errid <- as.character(c_errors$str_id[r])
|
c_errid <- as.character(c_errors$str_id[r])
|
||||||
|
|
||||||
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
|
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
|
||||||
source("issuecomp-codingsample-function2.R")
|
source("issuecomp-codingsample-function2.R")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Now import the error files in a human readable data frame to improve the issue database
|
||||||
|
|
||||||
|
# All tweets with WRONG ISSUES
|
||||||
|
c_tmp <- read.csv("issuecomp-codingsample-error1.csv", header = F, colClasses="character")
|
||||||
|
names(c_tmp) <- c("str_id", "all", "wrong", "tagged", "text")
|
||||||
|
c_error1 <- c_tmp[, c("wrong", "tagged", "all", "text")]
|
||||||
|
|
||||||
|
# All tweets with MISSING ISSUES
|
||||||
|
c_tmp <- read.csv("issuecomp-codingsample-error2.csv", header = F, colClasses="character")
|
||||||
|
names(c_tmp) <- c("str_id", "all", "missing", "tagged", "text")
|
||||||
|
c_error2 <- c_tmp[, c("missing", "text", "tagged", "all")]
|
||||||
|
|
||||||
|
# All CORRECT tweets
|
||||||
|
c_tmp <- read.csv("issuecomp-codingsample-correct.csv", header = F, colClasses="character")
|
||||||
|
names(c_tmp) <- c("str_id", "status", "issue", "tags", "text")
|
||||||
|
c_correct <- c_tmp
|
||||||
|
|||||||
@@ -1,203 +1 @@
|
|||||||
<i7.env>
|
|
||||||
<tag>Energiewende</tag>
|
|
||||||
<tag>Klimaschutz</tag>
|
|
||||||
<tag>COP20</tag>
|
|
||||||
<tag>cop20</tag>
|
|
||||||
</i7.env>
|
|
||||||
|
|
||||||
<i3.health>
|
|
||||||
<tag>Gesundheitsbranche</tag>
|
|
||||||
</i3.health>
|
|
||||||
|
|
||||||
<i10.trans>
|
|
||||||
<tag>LKWs</tag>
|
|
||||||
<tag>PKWs</tag>
|
|
||||||
</i10.trans>
|
|
||||||
|
|
||||||
<i1.macro>
|
|
||||||
<tag>Arbeitslose</tag>
|
|
||||||
</i1.macro>
|
|
||||||
|
|
||||||
<i12.law>
|
|
||||||
<tag>Vorratsdatenspeicherung</tag>
|
|
||||||
<tag>VDS</tag>
|
|
||||||
<tag>Cybercrime</tag>
|
|
||||||
<tag>Vorratsdatenspeicherung</tag>
|
|
||||||
<tag>VDS</tag>
|
|
||||||
</i12.law>
|
|
||||||
|
|
||||||
|
|
||||||
<!-- Sensational issues -->
|
|
||||||
|
|
||||||
<!-- Political/conflicts -->
|
|
||||||
<s.nsa>
|
|
||||||
<tag>NSA</tag>
|
|
||||||
<tag>Snowden</tag>
|
|
||||||
<tag>GCHQ</tag>
|
|
||||||
</s.nsa>
|
|
||||||
|
|
||||||
<s.is>
|
|
||||||
<tag>ISIS</tag>
|
|
||||||
<tag>IS</tag>
|
|
||||||
<tag>al-Baghdadi</tag>
|
|
||||||
<tag>Kurde</tag>
|
|
||||||
<tag>Jeside</tag>
|
|
||||||
<tag>#Mosul</tag>
|
|
||||||
<tag>#Mossul</tag>
|
|
||||||
<tag>#Fallujah</tag>
|
|
||||||
<tag>#Falludscha</tag>
|
|
||||||
<tag>#Kobanê</tag>
|
|
||||||
<tag>#Kobane</tag>
|
|
||||||
<tag>Syrien</tag>
|
|
||||||
<tag>Irak</tag>
|
|
||||||
<tag>#Aleppo</tag>
|
|
||||||
</s.is>
|
|
||||||
|
|
||||||
<s.ebola>
|
|
||||||
<tag>Ebola</tag>
|
|
||||||
</s.ebola>
|
|
||||||
|
|
||||||
<s.edathy>
|
|
||||||
<tag>Edathy</tag>
|
|
||||||
<tag>Edathy-Affäre</tag>
|
|
||||||
</s.edathy>
|
|
||||||
|
|
||||||
<s.ukraine>
|
|
||||||
<tag>Ukraine</tag>
|
|
||||||
<tag>Krim</tag>
|
|
||||||
<tag>Prorussisch</tag>
|
|
||||||
<tag>Donetsk</tag>
|
|
||||||
<tag>Donezk</tag>
|
|
||||||
<tag>Euromaidan</tag>
|
|
||||||
</s.ukraine>
|
|
||||||
|
|
||||||
<s.hk>
|
|
||||||
<tag>Hong Kong</tag>
|
|
||||||
<tag>Hong-Kong</tag>
|
|
||||||
<tag>Studentenprotest</tag>
|
|
||||||
<tag>Protest der Studenten</tag>
|
|
||||||
<tag>Hongkong</tag>
|
|
||||||
</s.hk>
|
|
||||||
|
|
||||||
<s.mh17>
|
|
||||||
<tag>#MH17</tag>
|
|
||||||
<tag>#KL4103</tag>
|
|
||||||
</s.mh17>
|
|
||||||
|
|
||||||
<s.mh370>
|
|
||||||
<tag>#MH370</tag>
|
|
||||||
<tag>#CZ748</tag>
|
|
||||||
</s.mh370>
|
|
||||||
|
|
||||||
<s.gaza>
|
|
||||||
<tag>Gaza</tag>
|
|
||||||
<tag>Hamas</tag>
|
|
||||||
</s.gaza>
|
|
||||||
|
|
||||||
<s.ferguson>
|
|
||||||
<tag>Ferguson</tag>
|
|
||||||
<tag>Michael Brown</tag>
|
|
||||||
</s.ferguson>
|
|
||||||
|
|
||||||
<s.boko>
|
|
||||||
<tag>Boko Haram</tag>
|
|
||||||
</s.boko>
|
|
||||||
|
|
||||||
<s.pegida>
|
|
||||||
<tag>Pegida</tag>
|
|
||||||
<tag>#nopegida</tag>
|
|
||||||
</s.pegida>
|
|
||||||
|
|
||||||
<!-- Yellow pages -->
|
|
||||||
<s.schumi>
|
|
||||||
<tag>Schumacher</tag>
|
|
||||||
<tag>Schumi</tag>
|
|
||||||
</s.schumi>
|
|
||||||
|
|
||||||
<s.esc>
|
|
||||||
<tag>ESC</tag>
|
|
||||||
<tag>Conchita Wurst</tag>
|
|
||||||
<tag>#ConchitaWurst</tag>
|
|
||||||
<tag>Eurovision Song Contest</tag>
|
|
||||||
</s.esc>
|
|
||||||
|
|
||||||
<s.wulff>
|
|
||||||
<tag>Wulff</tag>
|
|
||||||
</s.wulff>
|
|
||||||
|
|
||||||
<s.tebartz>
|
|
||||||
<tag>Tebartz-van Elst</tag>
|
|
||||||
<tag>Tebartz</tag>
|
|
||||||
<tag>Limburg</tag>
|
|
||||||
</s.tebartz>
|
|
||||||
|
|
||||||
<s.gurlitt>
|
|
||||||
<tag>Gurlitt</tag>
|
|
||||||
</s.gurlitt>
|
|
||||||
|
|
||||||
<s.hoen>
|
|
||||||
<tag>Hoeneß</tag>
|
|
||||||
<tag>Hoeness</tag>
|
|
||||||
</s.hoen>
|
|
||||||
|
|
||||||
<s.pistorius>
|
|
||||||
<tag>Pistorius</tag>
|
|
||||||
<tag>#OscarPistorius</tag>
|
|
||||||
</s.pistorius>
|
|
||||||
|
|
||||||
<!-- Science -->
|
|
||||||
<s.philae>
|
|
||||||
<tag>Philae</tag>
|
|
||||||
<tag>#Tschuri</tag>
|
|
||||||
<tag>#Rosetta</tag>
|
|
||||||
<tag>#CometLanding</tag>
|
|
||||||
</s.philae>
|
|
||||||
|
|
||||||
<!-- Sports -->
|
|
||||||
<s.wm>
|
|
||||||
<tag>Fußball</tag>
|
|
||||||
<tag>Fussball</tag>
|
|
||||||
<tag>Stadion</tag>
|
|
||||||
<tag>Weltmeisterschaft</tag>
|
|
||||||
<tag>#WM</tag>
|
|
||||||
<tag>#BRAGER</tag>
|
|
||||||
<tag>#GERBRA</tag>
|
|
||||||
<tag>Fußballmeisterschaft</tag>
|
|
||||||
<tag>Fussballmeisterschaft</tag>
|
|
||||||
<tag>Fußballweltmeisterschaft</tag>
|
|
||||||
<tag>Fußssallweltmeisterschaft</tag>
|
|
||||||
<tag>Nationalmannschaft</tag>
|
|
||||||
<tag>Weltmeister</tag>
|
|
||||||
<tag>Brasilien</tag>
|
|
||||||
<tag>#WorldCup</tag>
|
|
||||||
<tag>#WM2014</tag>
|
|
||||||
</s.wm>
|
|
||||||
|
|
||||||
<s.sotschi>
|
|
||||||
<tag>Sotschi</tag>
|
|
||||||
<tag>#sochi2014</tag>
|
|
||||||
<tag>#sotschi2014</tag>
|
|
||||||
<tag>#Sochi</tag>
|
|
||||||
<tag>#WirfuerD</tag>
|
|
||||||
</s.sotschi>
|
|
||||||
|
|
||||||
<!-- Tests -->
|
|
||||||
<s.de>
|
|
||||||
<tag>Deutschland</tag>
|
|
||||||
<tag>Deutsche</tag>
|
|
||||||
</s.de>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
+1986
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Reference in New Issue
Block a user