fixed some pattern detection bugs

This commit is contained in:
2015-01-12 15:36:14 +01:00
parent fd4ea4a47e
commit efcdfae80e
4 changed files with 127 additions and 36 deletions
+21 -24
View File
@@ -9,8 +9,11 @@ date_start <- as.Date("2014-01-01")
date_end <- as.Date("2014-12-31")
drange <- as.integer(date_end - date_start)
drange <- date_start + days(0:drange)
issues <- data.frame(date = drange)
# MATCH TWEETS ------------------------------------------------------------
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
@@ -28,44 +31,38 @@ for(d in 1:nrow(issues)) {
curtext <- as.character(tweets_curday$text[t])
curtext <- str_replace_all(curtext, "#", "")
# Now test each single issue (not tag!)
for(i in 1:length(issuelist)) {
curtags <- as.character(issuelist[[i]])
curissue <- names(issuelist)[i]
curtags <- str_c("\\W", curtags, "\\W")
tags_found <- str_detect(curtext, sprintf("%s", curtags))
tags_found <- any(tags_found)
######
# Test all tags in ONE issue
# Now test all tags of a single issue
for(t in 1:length(curtags)) {
curtag <- curtags[t]
curchars <- nchar(curtag, type = "chars")
curtag <- str_c("\\W", curtags[t], "\\W")
curchars <- nchar(curtag, type = "chars") - 4
# Match the current tweet against the tag. Tags with >= 5 letters may differ by 1 letter, tags with >= 8 letters by 2 (Levenshtein distance)
tags_found <- smartPatternMatch(curtext, curtag, curchars)
if(tags_found == 1) {
cat("Text contains at least the tag:", curtag, "\n")
#cat("Matched", curtag, "with", curtext,"\n")
issues[d,curissue] <- issues[d,curissue] + 1
break
}
}
######
if(tags_found) {
#cat("Positive in", curissue,"from",as.character(drange[d]),"\n")
issues[d,curissue] <- issues[d,curissue] + 1
}
else {
#cat("Nothing found\n")
}
else {
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
# WEEKLY INTERVALS --------------------------------------------------------
## Use weekly intervals instead of daily ones
wrange <- (as.integer(date_end - date_start) / 7)