From 6e83b8b6a12e9987f830409f2c42d7b68359e580 Mon Sep 17 00:00:00 2001 From: mxmehl Date: Thu, 26 Feb 2015 01:27:02 +0100 Subject: [PATCH] fix wrong acronym check, add hashtag check --- issuecomp-2-analysis.R | 23 ++++++++++++++++------- issuecomp-functions.R | 8 ++++---- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/issuecomp-2-analysis.R b/issuecomp-2-analysis.R index 4033d27..835ee60 100644 --- a/issuecomp-2-analysis.R +++ b/issuecomp-2-analysis.R @@ -74,27 +74,36 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% { curchars <- nchar(curtag, type = "chars") # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch - if(curchars <= 4) { - curacro <- checkAcronym(string = curtag, chars = curchars) + curacro <- checkAcronym(string = curtag) + # Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either + if(str_detect(curtag, "^#")) { + curacro <- FALSE # hashtags like #WM2014 are also written as #wm2014, so we need case-insensitivity + curhash <- TRUE # But we need to mark it as hashtag, so it doesn't get expanded or matched with a Levenshtein distance > 0 + curtag <- str_replace(curtag, "#", "") + curchars <- curchars - 1 } else { - curacro <- FALSE + curhash <- FALSE } # Now expand the current tag by possible suffixes that may be plural forms - if(!curacro) { + # Only do this if it isn't an acronym or specific hashtag + if(!curacro && !curhash) { for(e in 1:length(tagexpand)) { curtag[e] <- str_c(curtag[1], tagexpand[e]) } } - # Set Levenshtein distance depending on char length - if(curchars <= 4) { + # Set Levenshtein distance depending on char length, acronym and hashtag status + if(curchars <= 4 || curacro || curhash) { curdistance <- 0 } else { curdistance <- 1 } - # Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) + # Match current tweet with tag. 
+ # Allow a Levenshtein distance of 1 if the tag has >= 5 letters and is neither a hashtag nor an acronym + # Make it case-sensitive if tag is an acronym + tags_found <- NULL # Match the tweet with each variation of tagexpand for(e in 1:length(curtag)) { diff --git a/issuecomp-functions.R b/issuecomp-functions.R index f9c639a..33303a9 100644 --- a/issuecomp-functions.R +++ b/issuecomp-functions.R @@ -71,10 +71,10 @@ viewMatchingTweets <- function(date, issue, folder) { } -checkAcronym <- function(string, chars) { - curtag_up <- str_replace_all(string = curtag, pattern = "[[:lower:]]", replacement = "") - curchars_up <- nchar(curtag_up, type = "chars") - if(curchars_up == curchars) { +checkAcronym <- function(string) { + curtag_up <- str_replace_all(string = string, pattern = "[[:lower:]]", replacement = "") + #curchars_up <- nchar(curtag_up, type = "chars") + if(curtag_up == string) { return(TRUE) } else {