|
|
|
@ -74,27 +74,36 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
|
|
|
|
|
curchars <- nchar(curtag, type = "chars")
|
|
|
|
|
|
|
|
|
|
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
|
|
|
|
|
if(curchars <= 4) {
|
|
|
|
|
curacro <- checkAcronym(string = curtag, chars = curchars)
|
|
|
|
|
curacro <- checkAcronym(string = curtag)
|
|
|
|
|
# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
|
|
|
|
|
if(str_detect(curtag, "^#")) {
|
|
|
|
|
curacro <- FALSE # hashtags like #WM2014 are also written as #wm2014, so we need case-insensitivity
|
|
|
|
|
curhash <- TRUE # But we need to mark it as hashtag, so it doesn't get extended or Levenshtein distance > 0
|
|
|
|
|
curtag <- str_replace(curtag, "#", "")
|
|
|
|
|
curchars <- curchars - 1
|
|
|
|
|
} else {
|
|
|
|
|
curacro <- FALSE
|
|
|
|
|
curhash <- FALSE
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Now expand the current tag by possible suffixes that may be plural forms
|
|
|
|
|
if(!curacro) {
|
|
|
|
|
# Only expand the tag if it is neither an acronym nor a specific hashtag
|
|
|
|
|
if(!curacro && !curhash) {
|
|
|
|
|
for(e in 1:length(tagexpand)) {
|
|
|
|
|
curtag[e] <- str_c(curtag[1], tagexpand[e])
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Set Levenshtein distance depending on char length
|
|
|
|
|
if(curchars <= 4) {
|
|
|
|
|
# Set Levenshtein distance depending on char length, acronym and hashtag status
|
|
|
|
|
if(curchars <= 4 || curacro || curhash) {
|
|
|
|
|
curdistance <- 0
|
|
|
|
|
} else {
|
|
|
|
|
curdistance <- 1
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance)
|
|
|
|
|
# Match current tweet with tag.
|
|
|
|
|
# Allow 1 Levenshtein distance if tag is >= 5 letters and no hashtag or acronym
|
|
|
|
|
# Make it case-sensitive if tag is an acronym
|
|
|
|
|
|
|
|
|
|
tags_found <- NULL
|
|
|
|
|
# Match the tweet with each variation of tagexpand
|
|
|
|
|
for(e in 1:length(curtag)) {
|
|
|
|
|