fixed some pattern detection bugs

This commit is contained in:
2015-01-12 15:36:14 +01:00
parent fd4ea4a47e
commit efcdfae80e
4 changed files with 127 additions and 36 deletions
+21 -24
View File
@@ -9,8 +9,11 @@ date_start <- as.Date("2014-01-01")
date_end <- as.Date("2014-12-31")
drange <- as.integer(date_end - date_start)
drange <- date_start + days(0:drange)
issues <- data.frame(date = drange)
# MATCH TWEETS ------------------------------------------------------------
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
@@ -28,44 +31,38 @@ for(d in 1:nrow(issues)) {
curtext <- as.character(tweets_curday$text[t])
curtext <- str_replace_all(curtext, "#", "")
# Now test each single issue (not tag!)
for(i in 1:length(issuelist)) {
curtags <- as.character(issuelist[[i]])
curissue <- names(issuelist)[i]
curtags <- str_c("\\W", curtags, "\\W")
tags_found <- str_detect(curtext, sprintf("%s", curtags))
tags_found <- any(tags_found)
######
# Test all tags in ONE issue
# Now test all tags of a single issue
for(t in 1:length(curtags)) {
curtag <- curtags[t]
curchars <- nchar(curtag, type = "chars")
curtag <- str_c("\\W", curtags[t], "\\W")
curchars <- nchar(curtag, type = "chars") - 4
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch(curtext, curtag, curchars)
if(tags_found == 1) {
cat("Text contains at least the tag:", curtag, "\n")
#cat("Matched", curtag, "with", curtext,"\n")
issues[d,curissue] <- issues[d,curissue] + 1
break
}
}
######
if(tags_found) {
#cat("Positive in", curissue,"from",as.character(drange[d]),"\n")
issues[d,curissue] <- issues[d,curissue] + 1
}
else {
#cat("Nothing found\n")
}
else {
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
# WEEKLY INTERVALS --------------------------------------------------------
## Do not use days but week intervals
wrange <- (as.integer(date_end - date_start) / 7)
+4 -4
View File
@@ -28,13 +28,13 @@ convertLogical0 <- function(var) {
smartPatternMatch <- function(string, pattern, chars) {
if(chars < 5) {
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = TRUE)
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = TRUE, fixed = FALSE)
}
if(chars > 7) {
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = TRUE)
else if(chars > 7) {
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
}
else {
found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = TRUE)
found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = TRUE, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
+73 -8
View File
@@ -11,15 +11,80 @@
<tag>kraftwerk</tag>
<tag>strom</tag>
</issue.green>
<issue.ukraine>
<tag>ukraine</tag>
<tag>euromaidan</tag>
<tag>krim</tag>
<tag>putin</tag>
<tag>kiew</tag>
</issue.ukraine>
<issue.labour>
<tag>arbeitsmarkt</tag>
<tag>mindestlohn</tag>
<tag>arbeitslosigkeit</tag>
<tag>hartz4</tag>
<tag>arbeitslos</tag>
</issue.labour>
<issue.nsa>
<tag>nsa</tag>
<tag>snowden</tag>
<tag>bnd</tag>
<tag>gchq</tag>
<tag>überwachung</tag>
</issue.nsa>
<issue.wm2014>
<tag>wm2014</tag>
<tag>weltmeister</tag>
<tag>meister</tag>
<tag>finale</tag>
<tag>halbfinale</tag>
<tag>viertelfinale</tag>
<tag>achtelfinale</tag>
<tag>brager</tag>
<tag>gerbra</tag>
<tag>argger</tag>
<tag>gerarg</tag>
<tag>wm</tag>
<tag>stadion</tag>
</issue.wm2014>
<issue.middleeast>
<tag>israel</tag>
<tag>gaza</tag>
<tag>naher osten</tag>
<tag>nahen osten</tag>
<tag>nahost</tag>
</issue.middleeast>
<issue.iraq>
<tag>irak</tag>
<tag>isis</tag>
<tag>is</tag>
<tag>kalifat</tag>
</issue.iraq>
<issue.ebola>
<tag>ebola</tag>
</issue.ebola>
<issue.edathy>
<tag>edathy</tag>
<tag>kinderpornographie</tag>
<tag>kipo</tag>
<tag>pädophil</tag>
<tag>pädophilie</tag>
</issue.edathy>
<issue.test>
<tag>ein langer ausdruck</tag>
<tag>binde-strich</tag>
<tag>fünfe</tag>
<tag>achtacht</tag>
<tag>fehlar</tag>
<tag>korrektur</tag>
</issue.test>
<issue.conservative>
<tag>christ</tag>
<tag>christlich</tag>
<tag>christen</tag>
<tag>inflation</tag>
</issue.conservative>
<issue.control>
<tag>pillepalle</tag>
+29
View File
@@ -0,0 +1,29 @@
<issuelist>
<issue.green>
<tag>umwelt</tag>
<tag>energie</tag>
<tag>energiewende</tag>
<tag>atomkraft</tag>
<tag>windkraft</tag>
<tag>wasserkraft</tag>
<tag>solarstrom</tag>
<tag>kraftwerk</tag>
<tag>strom</tag>
</issue.green>
<issue.test>
<tag>ein langer ausdruck</tag>
<tag>binde-strich</tag>
<tag>fünfe</tag>
<tag>achtacht</tag>
<tag>fehlar</tag>
<tag>korrektur</tag>
</issue.test>
<issue.control>
<tag>pillepalle</tag>
<tag>schundluder</tag>
<tag>whatthefuck</tag>
</issue.control>
</issuelist>