From 6e83b8b6a12e9987f830409f2c42d7b68359e580 Mon Sep 17 00:00:00 2001
From: mxmehl <mail@mehl.mx>
Date: Thu, 26 Feb 2015 01:27:02 +0100
Subject: [PATCH] fix wrong acronym check, add hashtag check

---
 issuecomp-2-analysis.R | 23 ++++++++++++++++-------
 issuecomp-functions.R  |  8 ++++----
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/issuecomp-2-analysis.R b/issuecomp-2-analysis.R
index 4033d27..835ee60 100644
--- a/issuecomp-2-analysis.R
+++ b/issuecomp-2-analysis.R
@@ -74,27 +74,36 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
         curchars <- nchar(curtag, type = "chars")
         
         # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
-        if(curchars <= 4) {
-          curacro <- checkAcronym(string = curtag, chars = curchars)
+        curacro <- checkAcronym(string = curtag)
+        # Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
+        if(str_detect(curtag, "^#")) {
+          curacro <- FALSE   # hashtags like #WM2014 are also written as #wm2014, so we need case-insensitivity
+          curhash <- TRUE    # But we need to mark it as hashtag, so it doesn't get extended or Levenshtein distance > 0
+          curtag <- str_replace(curtag, "#", "")
+          curchars <- curchars - 1
         } else {
-          curacro <- FALSE
+          curhash <- FALSE
         }
         
         # Now expand the current tag by possible suffixes that may be plural forms
-        if(!curacro) {
+        # Only do this if it isn't an acronym or a specific hashtag
+        if(!curacro && !curhash) {
           for(e in 1:length(tagexpand)) {
             curtag[e] <- str_c(curtag[1], tagexpand[e])
           }
         }
         
-        # Set Levenshtein distance depending on char length
-        if(curchars <= 4) {
+        # Set Levenshtein distance depending on char length, acronym and hashtag status
+        if(curchars <= 4 || curacro || curhash) {
           curdistance <- 0
         } else {
           curdistance <- 1
         }
         
-        # Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance)
+        # Match current tweet with tag. 
+          # Allow a Levenshtein distance of 1 if the tag has >= 5 letters and is neither a hashtag nor an acronym
+          # Make it case-sensitive if the tag is an acronym
+        
         tags_found <- NULL
         # Match the tweet with each variation of tagexpand
         for(e in 1:length(curtag)) {
diff --git a/issuecomp-functions.R b/issuecomp-functions.R
index f9c639a..33303a9 100644
--- a/issuecomp-functions.R
+++ b/issuecomp-functions.R
@@ -71,10 +71,10 @@ viewMatchingTweets <- function(date, issue, folder) {
 }
 
 
-checkAcronym <- function(string, chars) {
-  curtag_up <- str_replace_all(string = curtag, pattern = "[[:lower:]]", replacement = "")
-  curchars_up <- nchar(curtag_up, type = "chars")
-  if(curchars_up == curchars) {
+checkAcronym <- function(string) {
+  curtag_up <- str_replace_all(string = string, pattern = "[[:lower:]]", replacement = "")
+  #curchars_up <- nchar(curtag_up, type = "chars")
+  if(curtag_up == string) {
     return(TRUE)
   }
   else {