fixed some pattern detection bugs
This commit is contained in:
+20
-23
@@ -9,8 +9,11 @@ date_start <- as.Date("2014-01-01")
|
|||||||
date_end <- as.Date("2014-12-31")
|
date_end <- as.Date("2014-12-31")
|
||||||
drange <- as.integer(date_end - date_start)
|
drange <- as.integer(date_end - date_start)
|
||||||
drange <- date_start + days(0:drange)
|
drange <- date_start + days(0:drange)
|
||||||
issues <- data.frame(date = drange)
|
|
||||||
|
|
||||||
|
|
||||||
|
# MATCH TWEETS ------------------------------------------------------------
|
||||||
|
|
||||||
|
issues <- data.frame(date = drange)
|
||||||
issuelist <- xmlToList("issues.xml")
|
issuelist <- xmlToList("issues.xml")
|
||||||
issueheads <- names(issuelist)
|
issueheads <- names(issuelist)
|
||||||
issues[issueheads] <- 0
|
issues[issueheads] <- 0
|
||||||
@@ -28,44 +31,38 @@ for(d in 1:nrow(issues)) {
|
|||||||
curtext <- as.character(tweets_curday$text[t])
|
curtext <- as.character(tweets_curday$text[t])
|
||||||
curtext <- str_replace_all(curtext, "#", "")
|
curtext <- str_replace_all(curtext, "#", "")
|
||||||
|
|
||||||
|
# Now test each single issue (not tag!)
|
||||||
for(i in 1:length(issuelist)) {
|
for(i in 1:length(issuelist)) {
|
||||||
curtags <- as.character(issuelist[[i]])
|
curtags <- as.character(issuelist[[i]])
|
||||||
curissue <- names(issuelist)[i]
|
curissue <- names(issuelist)[i]
|
||||||
curtags <- str_c("\\W", curtags, "\\W")
|
|
||||||
tags_found <- str_detect(curtext, sprintf("%s", curtags))
|
|
||||||
tags_found <- any(tags_found)
|
|
||||||
|
|
||||||
######
|
# Now test all tags of a single issue
|
||||||
|
|
||||||
# Test all tags in ONE issue
|
|
||||||
for(t in 1:length(curtags)) {
|
for(t in 1:length(curtags)) {
|
||||||
curtag <- curtags[t]
|
curtag <- str_c("\\W", curtags[t], "\\W")
|
||||||
curchars <- nchar(curtag, type = "chars")
|
curchars <- nchar(curtag, type = "chars") - 4
|
||||||
|
|
||||||
|
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
|
||||||
tags_found <- smartPatternMatch(curtext, curtag, curchars)
|
tags_found <- smartPatternMatch(curtext, curtag, curchars)
|
||||||
|
|
||||||
if(tags_found == 1) {
|
if(tags_found == 1) {
|
||||||
cat("Text contains at least the tag:", curtag, "\n")
|
#cat("Matched", curtag, "with", curtext,"\n")
|
||||||
|
issues[d,curissue] <- issues[d,curissue] + 1
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
else {
|
||||||
|
#cat("Nothing found\n")
|
||||||
|
}
|
||||||
######
|
} # /for curtags
|
||||||
|
|
||||||
if(tags_found) {
|
|
||||||
#cat("Positive in", curissue,"from",as.character(drange[d]),"\n")
|
|
||||||
issues[d,curissue] <- issues[d,curissue] + 1
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
#cat("Nothing found\n")
|
|
||||||
}
|
|
||||||
|
|
||||||
} # /for issuelist
|
} # /for issuelist
|
||||||
} # /for tweets_curday
|
} # /for tweets_curday
|
||||||
} # /for drange
|
} # /for drange
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# WEEKLY INTERVALS --------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Do not use days but week intervals
|
## Do not use days but week intervals
|
||||||
|
|
||||||
wrange <- (as.integer(date_end - date_start) / 7)
|
wrange <- (as.integer(date_end - date_start) / 7)
|
||||||
|
|||||||
@@ -28,13 +28,13 @@ convertLogical0 <- function(var) {
|
|||||||
|
|
||||||
smartPatternMatch <- function(string, pattern, chars) {
|
smartPatternMatch <- function(string, pattern, chars) {
|
||||||
if(chars < 5) {
|
if(chars < 5) {
|
||||||
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = TRUE)
|
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = TRUE, fixed = FALSE)
|
||||||
}
|
}
|
||||||
if(chars > 7) {
|
else if(chars > 7) {
|
||||||
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = TRUE)
|
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = TRUE)
|
found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = TRUE, fixed = FALSE)
|
||||||
}
|
}
|
||||||
found <- convertLogical0(found)
|
found <- convertLogical0(found)
|
||||||
return(found)
|
return(found)
|
||||||
|
|||||||
+73
-8
@@ -12,14 +12,79 @@
|
|||||||
<tag>strom</tag>
|
<tag>strom</tag>
|
||||||
</issue.green>
|
</issue.green>
|
||||||
|
|
||||||
<issue.test>
|
<issue.ukraine>
|
||||||
<tag>ein langer ausdruck</tag>
|
<tag>ukraine</tag>
|
||||||
<tag>binde-strich</tag>
|
<tag>euromaidan</tag>
|
||||||
<tag>fünfe</tag>
|
<tag>krim</tag>
|
||||||
<tag>achtacht</tag>
|
<tag>putin</tag>
|
||||||
<tag>fehlar</tag>
|
<tag>kiew</tag>
|
||||||
<tag>korrektur</tag>
|
</issue.ukraine>
|
||||||
</issue.test>
|
|
||||||
|
<issue.labour>
|
||||||
|
<tag>arbeitsmarkt</tag>
|
||||||
|
<tag>mindestlohn</tag>
|
||||||
|
<tag>arbeitslosigkeit</tag>
|
||||||
|
<tag>hartz4</tag>
|
||||||
|
<tag>arbeitslos</tag>
|
||||||
|
</issue.labour>
|
||||||
|
|
||||||
|
<issue.nsa>
|
||||||
|
<tag>nsa</tag>
|
||||||
|
<tag>snowden</tag>
|
||||||
|
<tag>bnd</tag>
|
||||||
|
<tag>gchq</tag>
|
||||||
|
<tag>überwachung</tag>
|
||||||
|
</issue.nsa>
|
||||||
|
|
||||||
|
<issue.wm2014>
|
||||||
|
<tag>wm2014</tag>
|
||||||
|
<tag>weltmeister</tag>
|
||||||
|
<tag>meister</tag>
|
||||||
|
<tag>finale</tag>
|
||||||
|
<tag>halbfinale</tag>
|
||||||
|
<tag>viertelfinale</tag>
|
||||||
|
<tag>achtelfinale</tag>
|
||||||
|
<tag>brager</tag>
|
||||||
|
<tag>gerbra</tag>
|
||||||
|
<tag>argger</tag>
|
||||||
|
<tag>gerarg</tag>
|
||||||
|
<tag>wm</tag>
|
||||||
|
<tag>stadion</tag>
|
||||||
|
</issue.wm2014>
|
||||||
|
|
||||||
|
<issue.middleeast>
|
||||||
|
<tag>israel</tag>
|
||||||
|
<tag>gaza</tag>
|
||||||
|
<tag>naher osten</tag>
|
||||||
|
<tag>nahen osten</tag>
|
||||||
|
<tag>nahost</tag>
|
||||||
|
</issue.middleeast>
|
||||||
|
|
||||||
|
<issue.iraq>
|
||||||
|
<tag>irak</tag>
|
||||||
|
<tag>isis</tag>
|
||||||
|
<tag>is</tag>
|
||||||
|
<tag>kalifat</tag>
|
||||||
|
</issue.iraq>
|
||||||
|
|
||||||
|
<issue.ebola>
|
||||||
|
<tag>ebola</tag>
|
||||||
|
</issue.ebola>
|
||||||
|
|
||||||
|
<issue.edathy>
|
||||||
|
<tag>edathy</tag>
|
||||||
|
<tag>kinderpornographie</tag>
|
||||||
|
<tag>kipo</tag>
|
||||||
|
<tag>pädophil</tag>
|
||||||
|
<tag>pädophilie</tag>
|
||||||
|
</issue.edathy>
|
||||||
|
|
||||||
|
<issue.conservative>
|
||||||
|
<tag>christ</tag>
|
||||||
|
<tag>christlich</tag>
|
||||||
|
<tag>christen</tag>
|
||||||
|
<tag>inflation</tag>
|
||||||
|
</issue.conservative>
|
||||||
|
|
||||||
<issue.control>
|
<issue.control>
|
||||||
<tag>pillepalle</tag>
|
<tag>pillepalle</tag>
|
||||||
|
|||||||
@@ -0,0 +1,29 @@
|
|||||||
|
<issuelist>
|
||||||
|
|
||||||
|
<issue.green>
|
||||||
|
<tag>umwelt</tag>
|
||||||
|
<tag>energie</tag>
|
||||||
|
<tag>energiewende</tag>
|
||||||
|
<tag>atomkraft</tag>
|
||||||
|
<tag>windkraft</tag>
|
||||||
|
<tag>wasserkraft</tag>
|
||||||
|
<tag>solarstrom</tag>
|
||||||
|
<tag>kraftwerk</tag>
|
||||||
|
<tag>strom</tag>
|
||||||
|
</issue.green>
|
||||||
|
|
||||||
|
<issue.test>
|
||||||
|
<tag>ein langer ausdruck</tag>
|
||||||
|
<tag>binde-strich</tag>
|
||||||
|
<tag>fünfe</tag>
|
||||||
|
<tag>achtacht</tag>
|
||||||
|
<tag>fehlar</tag>
|
||||||
|
<tag>korrektur</tag>
|
||||||
|
</issue.test>
|
||||||
|
|
||||||
|
<issue.control>
|
||||||
|
<tag>pillepalle</tag>
|
||||||
|
<tag>schundluder</tag>
|
||||||
|
<tag>whatthefuck</tag>
|
||||||
|
</issue.control>
|
||||||
|
</issuelist>
|
||||||
Reference in New Issue
Block a user