diff --git a/issuecomp-analysis.R b/issuecomp-analysis.R
index c99dcc5..2e41b01 100644
--- a/issuecomp-analysis.R
+++ b/issuecomp-analysis.R
@@ -9,8 +9,11 @@ date_start <- as.Date("2014-01-01")
date_end <- as.Date("2014-12-31")
drange <- as.integer(date_end - date_start)
drange <- date_start + days(0:drange)
-issues <- data.frame(date = drange)
+
+# MATCH TWEETS ------------------------------------------------------------
+
+issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
@@ -28,44 +31,38 @@ for(d in 1:nrow(issues)) {
curtext <- as.character(tweets_curday$text[t])
curtext <- str_replace_all(curtext, "#", "")
+ # Now test each issue in turn (an issue is a whole group of tags, not a single tag)
for(i in 1:length(issuelist)) {
curtags <- as.character(issuelist[[i]])
curissue <- names(issuelist)[i]
- curtags <- str_c("\\W", curtags, "\\W")
- tags_found <- str_detect(curtext, sprintf("%s", curtags))
- tags_found <- any(tags_found)
-
- ######
-
- # Test all tags in ONE issue
+
+ # Now test all tags of a single issue
for(t in 1:length(curtags)) {
- curtag <- curtags[t]
- curchars <- nchar(curtag, type = "chars")
+ curtag <- str_c("\\W", curtags[t], "\\W")
+ curchars <- nchar(curtag, type = "chars") - 4
+ # Match current tweet against tag: tags shorter than 5 letters must match exactly, 5-7 letters allow 1 changed letter, 8 or more allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch(curtext, curtag, curchars)
-
if(tags_found == 1) {
- cat("Text contains at least the tag:", curtag, "\n")
+ #cat("Matched", curtag, "with", curtext,"\n")
+ issues[d,curissue] <- issues[d,curissue] + 1
break
}
- }
-
-
- ######
-
- if(tags_found) {
- #cat("Positive in", curissue,"from",as.character(drange[d]),"\n")
- issues[d,curissue] <- issues[d,curissue] + 1
- }
- else {
- #cat("Nothing found\n")
- }
+ else {
+ #cat("Nothing found\n")
+ }
+ } # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
+
+# WEEKLY INTERVALS --------------------------------------------------------
+
+
+
## Do not use days but week intervals
wrange <- (as.integer(date_end - date_start) / 7)
diff --git a/issuecomp-functions.R b/issuecomp-functions.R
index d97496f..3c915bd 100644
--- a/issuecomp-functions.R
+++ b/issuecomp-functions.R
@@ -28,13 +28,13 @@ convertLogical0 <- function(var) {
smartPatternMatch <- function(string, pattern, chars) {
if(chars < 5) {
- found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = TRUE)
+ found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = TRUE, fixed = FALSE)
}
- if(chars > 7) {
- found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = TRUE)
+ else if(chars > 7) {
+ found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
}
else {
- found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = TRUE)
+ found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = TRUE, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
diff --git a/issues.xml b/issues.xml
index 3102a12..c791322 100644
--- a/issues.xml
+++ b/issues.xml
@@ -11,15 +11,80 @@
kraftwerk
strom
+
+
+ ukraine
+ euromaidan
+ krim
+ putin
+ kiew
+
+
+
+ arbeitsmarkt
+ mindestlohn
+ arbeitslosigkeit
+ hartz4
+ arbeitslos
+
+
+
+ nsa
+ snowden
+ bnd
+ gchq
+ überwachung
+
+
+
+ wm2014
+ weltmeister
+ meister
+ finale
+ halbfinale
+ viertelfinale
+ achtelfinale
+ brager
+ gerbra
+ argger
+ gerarg
+ wm
+ stadion
+
+
+
+ israel
+ gaza
+ naher osten
+ nahen osten
+ nahost
+
+
+
+ irak
+ isis
+ is
+ kalifat
+
+
+
+ ebola
+
+
+
+ edathy
+ kinderpornographie
+ kipo
+ pädophil
+ pädophilie
+
-
- ein langer ausdruck
- binde-strich
- fünfe
- achtacht
- fehlar
- korrektur
-
+
+ christ
+ christlich
+ christen
+ inflation
+
pillepalle
diff --git a/issues.xml.test b/issues.xml.test
new file mode 100644
index 0000000..3102a12
--- /dev/null
+++ b/issues.xml.test
@@ -0,0 +1,29 @@
+
+
+
+ umwelt
+ energie
+ energiewende
+ atomkraft
+ windkraft
+ wasserkraft
+ solarstrom
+ kraftwerk
+ strom
+
+
+
+ ein langer ausdruck
+ binde-strich
+ fünfe
+ achtacht
+ fehlar
+ korrektur
+
+
+
+ pillepalle
+ schundluder
+ whatthefuck
+
+