From efcdfae80e98f3c643ef5593a83b161ac03f700f Mon Sep 17 00:00:00 2001 From: mxmehl Date: Mon, 12 Jan 2015 15:36:14 +0100 Subject: [PATCH] fixed some pattern detection bugs --- issuecomp-analysis.R | 45 +++++++++++------------- issuecomp-functions.R | 8 ++--- issues.xml | 81 ++++++++++++++++++++++++++++++++++++++----- issues.xml.test | 29 ++++++++++++++++ 4 files changed, 127 insertions(+), 36 deletions(-) create mode 100644 issues.xml.test diff --git a/issuecomp-analysis.R b/issuecomp-analysis.R index c99dcc5..2e41b01 100644 --- a/issuecomp-analysis.R +++ b/issuecomp-analysis.R @@ -9,8 +9,11 @@ date_start <- as.Date("2014-01-01") date_end <- as.Date("2014-12-31") drange <- as.integer(date_end - date_start) drange <- date_start + days(0:drange) -issues <- data.frame(date = drange) + +# MATCH TWEETS ------------------------------------------------------------ + +issues <- data.frame(date = drange) issuelist <- xmlToList("issues.xml") issueheads <- names(issuelist) issues[issueheads] <- 0 @@ -28,44 +31,38 @@ for(d in 1:nrow(issues)) { curtext <- as.character(tweets_curday$text[t]) curtext <- str_replace_all(curtext, "#", "") + # Now test each single issue (not tag!) for(i in 1:length(issuelist)) { curtags <- as.character(issuelist[[i]]) curissue <- names(issuelist)[i] - curtags <- str_c("\\W", curtags, "\\W") - tags_found <- str_detect(curtext, sprintf("%s", curtags)) - tags_found <- any(tags_found) - - ###### - - # Test all tags in ONE issue + + # Now test all tags of a single issue for(t in 1:length(curtags)) { - curtag <- curtags[t] - curchars <- nchar(curtag, type = "chars") + curtag <- str_c("\\W", curtags[t], "\\W") + curchars <- nchar(curtag, type = "chars") - 4 + # Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance) tags_found <- smartPatternMatch(curtext, curtag, curchars) - if(tags_found == 1) { - cat("Text contains at least the tag:", curtag, "\n") + #cat("Matched", curtag, "with", curtext,"\n") + issues[d,curissue] <- issues[d,curissue] + 1 break } - } - - - ###### - - if(tags_found) { - #cat("Positive in", curissue,"from",as.character(drange[d]),"\n") - issues[d,curissue] <- issues[d,curissue] + 1 - } - else { - #cat("Nothing found\n") - } + else { + #cat("Nothing found\n") + } + } # /for curtags } # /for issuelist } # /for tweets_curday } # /for drange + +# WEEKLY INTERVALS -------------------------------------------------------- + + + ## Do not use days but week intervals wrange <- (as.integer(date_end - date_start) / 7) diff --git a/issuecomp-functions.R b/issuecomp-functions.R index d97496f..3c915bd 100644 --- a/issuecomp-functions.R +++ b/issuecomp-functions.R @@ -28,13 +28,13 @@ convertLogical0 <- function(var) { smartPatternMatch <- function(string, pattern, chars) { if(chars < 5) { - found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = TRUE) + found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = TRUE, fixed = FALSE) } - if(chars > 7) { - found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = TRUE) + else if(chars > 7) { + found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE) } else { - found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = TRUE) + found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = TRUE, fixed = FALSE) } found <- convertLogical0(found) return(found) diff --git a/issues.xml b/issues.xml index 3102a12..c791322 100644 --- a/issues.xml +++ b/issues.xml @@ -11,15 +11,80 @@ kraftwerk strom + + + ukraine + euromaidan + krim + putin + kiew + + + + arbeitsmarkt + mindestlohn + arbeitslosigkeit + hartz4 + arbeitslos + + + + nsa + snowden + bnd + gchq + überwachung + + + + wm2014 + weltmeister + meister + finale + halbfinale + viertelfinale + achtelfinale + brager + gerbra + argger + gerarg + wm + stadion + + + + israel + gaza + naher osten + nahen osten + nahost + + + + irak + isis + is + kalifat + + + + ebola + + + + edathy + kinderpornographie + kipo + pädophil + pädophilie + - - ein langer ausdruck - binde-strich - fünfe - achtacht - fehlar - korrektur - + + christ + christlich + christen + inflation + pillepalle diff --git a/issues.xml.test b/issues.xml.test new file mode 100644 index 0000000..3102a12 --- /dev/null +++ b/issues.xml.test @@ -0,0 +1,29 @@ + + + + umwelt + energie + energiewende + atomkraft + windkraft + wasserkraft + solarstrom + kraftwerk + strom + + + + ein langer ausdruck + binde-strich + fünfe + achtacht + fehlar + korrektur + + + + pillepalle + schundluder + whatthefuck + +