better matching, now with plural forms and less distance

This commit is contained in:
2015-01-21 12:27:09 +01:00
parent e9c5fc7d8d
commit a8987936c4
4 changed files with 488 additions and 452 deletions
+34 -3
View File
@@ -29,6 +29,8 @@ issues[issueheads] <- 0
# Reset the issue and tags columns of tweets to empty strings
tweets$issue <- ""
tweets$tags <- ""
# Suffixes that get appended to a tag so that possible plural forms match too;
# the empty first entry leaves the tag itself unchanged
tagexpand <- c("", "s", "n", "en")
for(d in 1:nrow(issues)) {
# Go through every day
curdate <- issues$date[d]
@@ -61,10 +63,23 @@ for(d in 1:nrow(issues)) {
} else {
curacro <- FALSE
}
# Now expand the current tag by possible suffixes that may be plural forms.
# Acronyms (curacro == TRUE) are left untouched.
if(!curacro) {
  # str_c is vectorised over tagexpand, so one call builds every variant.
  # As long as tagexpand starts with "", the unmodified tag stays in
  # curtag[1], which later code uses to restore the base tag.
  curtag <- str_c(curtag[1], tagexpand)
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
if(tags_found == 1) {
# Match the current tweet against every variant of the tag (base form plus
# any expanded plural forms). Tags with >= 5 letters may differ by 1 changed
# letter (Levenshtein distance); tags with >= 8 letters are also limited to 1.
tags_found <- NULL
# seq_along avoids the 1:0 trap should curtag ever be empty
for(e in seq_along(curtag)) {
  tags_found[e] <- smartPatternMatch(curtext, curtag[e], curchars, curacro)
}
# A hit on any single variant counts as a match for the tag
tags_found <- any(tags_found)
# Drop the expanded variants again and keep only the base tag
curtag <- curtag[1]
if(tags_found == TRUE) {
# Raise number of findings on this day for this issue by 1
issues[d,curissue] <- issues[d,curissue] + 1
@@ -117,6 +132,22 @@ g1
rm(g1, r)
# Show party percentage of twitter users
# Draws two pie charts: each party's share of seats in the parliament and
# each party's share of the Twitter accounts listed in acc_df.
acc_parties <- data.frame(party = c("cducsu", "spd", "linke", "gruene"))
acc_parties$btw13 <- c(49.3, 30.6, 10.1, 10.0) # seats of party / 631 seats, in percent
acc_parties$twitter <- 0
for(p in seq_len(nrow(acc_parties))) {
  # Rows of acc_df belonging to this party, as a rounded percentage of 280.
  # NOTE(review): 280 is presumably the total number of accounts in acc_df -
  # confirm, and consider nrow(acc_df) if that is the intent.
  acc_parties$twitter[p] <- round(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100)
}
pie(acc_parties$btw13, col = c("black", "red", "purple", "green"),
    labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = TRUE,
    main = "Seats of parties in the parliament")
pie(acc_parties$twitter, col = c("black", "red", "purple", "green"),
    labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = TRUE,
    main = "Percentage of parties' MdBs of all Twitter accounts")
# Clean up the temporary objects of this section
rm(acc_parties, p)
# VISUALS -----------------------------------------------------------------