fixed some pattern detection bugs
This commit is contained in:
+21
-24
@@ -9,8 +9,11 @@ date_start <- as.Date("2014-01-01")
|
||||
date_end <- as.Date("2014-12-31")
|
||||
drange <- as.integer(date_end - date_start)
|
||||
drange <- date_start + days(0:drange)
|
||||
issues <- data.frame(date = drange)
|
||||
|
||||
|
||||
# MATCH TWEETS ------------------------------------------------------------
|
||||
|
||||
issues <- data.frame(date = drange)
|
||||
issuelist <- xmlToList("issues.xml")
|
||||
issueheads <- names(issuelist)
|
||||
issues[issueheads] <- 0
|
||||
@@ -28,44 +31,38 @@ for(d in 1:nrow(issues)) {
|
||||
curtext <- as.character(tweets_curday$text[t])
|
||||
curtext <- str_replace_all(curtext, "#", "")
|
||||
|
||||
# Now test each single issue (not tag!)
|
||||
for(i in 1:length(issuelist)) {
|
||||
curtags <- as.character(issuelist[[i]])
|
||||
curissue <- names(issuelist)[i]
|
||||
curtags <- str_c("\\W", curtags, "\\W")
|
||||
tags_found <- str_detect(curtext, sprintf("%s", curtags))
|
||||
tags_found <- any(tags_found)
|
||||
|
||||
######
|
||||
|
||||
# Test all tags in ONE issue
|
||||
|
||||
# Now test all tags of a single issue
|
||||
for(t in 1:length(curtags)) {
|
||||
curtag <- curtags[t]
|
||||
curchars <- nchar(curtag, type = "chars")
|
||||
curtag <- str_c("\\W", curtags[t], "\\W")
|
||||
curchars <- nchar(curtag, type = "chars") - 4
|
||||
|
||||
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
|
||||
tags_found <- smartPatternMatch(curtext, curtag, curchars)
|
||||
|
||||
if(tags_found == 1) {
|
||||
cat("Text contains at least the tag:", curtag, "\n")
|
||||
#cat("Matched", curtag, "with", curtext,"\n")
|
||||
issues[d,curissue] <- issues[d,curissue] + 1
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
######
|
||||
|
||||
if(tags_found) {
|
||||
#cat("Positive in", curissue,"from",as.character(drange[d]),"\n")
|
||||
issues[d,curissue] <- issues[d,curissue] + 1
|
||||
}
|
||||
else {
|
||||
#cat("Nothing found\n")
|
||||
}
|
||||
else {
|
||||
#cat("Nothing found\n")
|
||||
}
|
||||
} # /for curtags
|
||||
|
||||
} # /for issuelist
|
||||
} # /for tweets_curday
|
||||
} # /for drange
|
||||
|
||||
|
||||
|
||||
# WEEKLY INTERVALS --------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
## Do not use days but week intervals
|
||||
|
||||
wrange <- (as.integer(date_end - date_start) / 7)
|
||||
|
||||
@@ -28,13 +28,13 @@ convertLogical0 <- function(var) {
|
||||
|
||||
smartPatternMatch <- function(string, pattern, chars) {
|
||||
if(chars < 5) {
|
||||
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = TRUE)
|
||||
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = TRUE, fixed = FALSE)
|
||||
}
|
||||
if(chars > 7) {
|
||||
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = TRUE)
|
||||
else if(chars > 7) {
|
||||
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
|
||||
}
|
||||
else {
|
||||
found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = TRUE)
|
||||
found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = TRUE, fixed = FALSE)
|
||||
}
|
||||
found <- convertLogical0(found)
|
||||
return(found)
|
||||
|
||||
+73
-8
@@ -11,15 +11,80 @@
|
||||
<tag>kraftwerk</tag>
|
||||
<tag>strom</tag>
|
||||
</issue.green>
|
||||
|
||||
<issue.ukraine>
|
||||
<tag>ukraine</tag>
|
||||
<tag>euromaidan</tag>
|
||||
<tag>krim</tag>
|
||||
<tag>putin</tag>
|
||||
<tag>kiew</tag>
|
||||
</issue.ukraine>
|
||||
|
||||
<issue.labour>
|
||||
<tag>arbeitsmarkt</tag>
|
||||
<tag>mindestlohn</tag>
|
||||
<tag>arbeitslosigkeit</tag>
|
||||
<tag>hartz4</tag>
|
||||
<tag>arbeitslos</tag>
|
||||
</issue.labour>
|
||||
|
||||
<issue.nsa>
|
||||
<tag>nsa</tag>
|
||||
<tag>snowden</tag>
|
||||
<tag>bnd</tag>
|
||||
<tag>gchq</tag>
|
||||
<tag>überwachung</tag>
|
||||
</issue.nsa>
|
||||
|
||||
<issue.wm2014>
|
||||
<tag>wm2014</tag>
|
||||
<tag>weltmeister</tag>
|
||||
<tag>meister</tag>
|
||||
<tag>finale</tag>
|
||||
<tag>halbfinale</tag>
|
||||
<tag>viertelfinale</tag>
|
||||
<tag>achtelfinale</tag>
|
||||
<tag>brager</tag>
|
||||
<tag>gerbra</tag>
|
||||
<tag>argger</tag>
|
||||
<tag>gerarg</tag>
|
||||
<tag>wm</tag>
|
||||
<tag>stadion</tag>
|
||||
</issue.wm2014>
|
||||
|
||||
<issue.middleeast>
|
||||
<tag>israel</tag>
|
||||
<tag>gaza</tag>
|
||||
<tag>naher osten</tag>
|
||||
<tag>nahen osten</tag>
|
||||
<tag>nahost</tag>
|
||||
</issue.middleeast>
|
||||
|
||||
<issue.iraq>
|
||||
<tag>irak</tag>
|
||||
<tag>isis</tag>
|
||||
<tag>is</tag>
|
||||
<tag>kalifat</tag>
|
||||
</issue.iraq>
|
||||
|
||||
<issue.ebola>
|
||||
<tag>ebola</tag>
|
||||
</issue.ebola>
|
||||
|
||||
<issue.edathy>
|
||||
<tag>edathy</tag>
|
||||
<tag>kinderpornographie</tag>
|
||||
<tag>kipo</tag>
|
||||
<tag>pädophil</tag>
|
||||
<tag>pädophilie</tag>
|
||||
</issue.edathy>
|
||||
|
||||
<issue.test>
|
||||
<tag>ein langer ausdruck</tag>
|
||||
<tag>binde-strich</tag>
|
||||
<tag>fünfe</tag>
|
||||
<tag>achtacht</tag>
|
||||
<tag>fehlar</tag>
|
||||
<tag>korrektur</tag>
|
||||
</issue.test>
|
||||
<issue.conservative>
|
||||
<tag>christ</tag>
|
||||
<tag>christlich</tag>
|
||||
<tag>christen</tag>
|
||||
<tag>inflation</tag>
|
||||
</issue.conservative>
|
||||
|
||||
<issue.control>
|
||||
<tag>pillepalle</tag>
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
<issuelist>
|
||||
|
||||
<issue.green>
|
||||
<tag>umwelt</tag>
|
||||
<tag>energie</tag>
|
||||
<tag>energiewende</tag>
|
||||
<tag>atomkraft</tag>
|
||||
<tag>windkraft</tag>
|
||||
<tag>wasserkraft</tag>
|
||||
<tag>solarstrom</tag>
|
||||
<tag>kraftwerk</tag>
|
||||
<tag>strom</tag>
|
||||
</issue.green>
|
||||
|
||||
<issue.test>
|
||||
<tag>ein langer ausdruck</tag>
|
||||
<tag>binde-strich</tag>
|
||||
<tag>fünfe</tag>
|
||||
<tag>achtacht</tag>
|
||||
<tag>fehlar</tag>
|
||||
<tag>korrektur</tag>
|
||||
</issue.test>
|
||||
|
||||
<issue.control>
|
||||
<tag>pillepalle</tag>
|
||||
<tag>schundluder</tag>
|
||||
<tag>whatthefuck</tag>
|
||||
</issue.control>
|
||||
</issuelist>
|
||||
Reference in New Issue
Block a user