fix wrong acronym check, add hashtag check

master
mxmehl 8 years ago
parent a32447a27b
commit 6e83b8b6a1

@ -74,27 +74,36 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
curchars <- nchar(curtag, type = "chars")
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if(curchars <= 4) {
curacro <- checkAcronym(string = curtag, chars = curchars)
curacro <- checkAcronym(string = curtag)
# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
if(str_detect(curtag, "^#")) {
curacro <- FALSE # hashtags like #WM2014 are also written as #wm2014, so we need case-insensitivity
curhash <- TRUE # But we need to mark it as hashtag, so it doesn't get extended or Levenshtein distance > 0
curtag <- str_replace(curtag, "#", "")
curchars <- curchars - 1
} else {
curacro <- FALSE
curhash <- FALSE
}
# Now expand the current tag by possible suffixes that may be plural forms
if(!curacro) {
# Only do this if it is neither an acronym nor a specific hashtag
if(!curacro && !curhash) {
for(e in 1:length(tagexpand)) {
curtag[e] <- str_c(curtag[1], tagexpand[e])
}
}
# Set Levenshtein distance depending on char length
if(curchars <= 4) {
# Set Levenshtein distance depending on char length, acronym and hashtag status
if(curchars <= 4 || curacro || curhash) {
curdistance <- 0
} else {
curdistance <- 1
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance)
# Match current tweet with tag.
# Allow a Levenshtein distance of 1 if the tag has >= 5 letters and is neither a hashtag nor an acronym
# Make it case-sensitive if the tag is an acronym
tags_found <- NULL
# Match the tweet with each variation of tagexpand
for(e in 1:length(curtag)) {

@ -71,10 +71,10 @@ viewMatchingTweets <- function(date, issue, folder) {
}
checkAcronym <- function(string, chars) {
curtag_up <- str_replace_all(string = curtag, pattern = "[[:lower:]]", replacement = "")
curchars_up <- nchar(curtag_up, type = "chars")
if(curchars_up == curchars) {
checkAcronym <- function(string) {
curtag_up <- str_replace_all(string = string, pattern = "[[:lower:]]", replacement = "")
#curchars_up <- nchar(curtag_up, type = "chars")
if(curtag_up == string) {
return(TRUE)
}
else {

Loading…
Cancel
Save