Browse Source

fix wrong acronym check, add hashtag check

mxmehl 5 years ago
parent
commit
6e83b8b6a1
2 changed files with 20 additions and 11 deletions
  1. 16
    7
      issuecomp-2-analysis.R
  2. 4
    4
      issuecomp-functions.R

+ 16
- 7
issuecomp-2-analysis.R View File

@@ -74,27 +74,36 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
74 74
         curchars <- nchar(curtag, type = "chars")
75 75
         
76 76
         # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
77
-        if(curchars <= 4) {
78
-          curacro <- checkAcronym(string = curtag, chars = curchars)
77
+        curacro <- checkAcronym(string = curtag)
78
+        # Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
79
+        if(str_detect(curtag, "^#")) {
80
+          curacro <- FALSE   # hashtags like #WM2014 are also written as #wm2014, so we need case-insensitivity
81
+          curhash <- TRUE    # But we need to mark it as hashtag, so it doesn't get extended or Levenshtein distance > 0
82
+          curtag <- str_replace(curtag, "#", "")
83
+          curchars <- curchars - 1
79 84
         } else {
80
-          curacro <- FALSE
85
+          curhash <- FALSE
81 86
         }
82 87
         
83 88
         # Now expand the current tag by possible suffixes that may be plural forms
84
-        if(!curacro) {
89
+        # Only do this if it isn't an acronym or a specific hashtag
90
+        if(!curacro && !curhash) {
85 91
           for(e in 1:length(tagexpand)) {
86 92
             curtag[e] <- str_c(curtag[1], tagexpand[e])
87 93
           }
88 94
         }
89 95
         
90
-        # Set Levenshtein distance depending on char length
91
-        if(curchars <= 4) {
96
+        # Set Levenshtein distance depending on char length, acronym and hashtag status
97
+        if(curchars <= 4 || curacro || curhash) {
92 98
           curdistance <- 0
93 99
         } else {
94 100
           curdistance <- 1
95 101
         }
96 102
         
97
-        # Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance)
103
+        # Match current tweet with tag. 
104
+          # Allow 1 Levenshtein distance if tag is >= 5 letters and no hashtag or acronym
105
+          # Make it case-sensitive if tag is an acronym
106
+        
98 107
         tags_found <- NULL
99 108
         # Match the tweet with each variation of tagexpand
100 109
         for(e in 1:length(curtag)) {

+ 4
- 4
issuecomp-functions.R View File

@@ -71,10 +71,10 @@ viewMatchingTweets <- function(date, issue, folder) {
71 71
 }
72 72
 
73 73
 
74
-checkAcronym <- function(string, chars) {
75
-  curtag_up <- str_replace_all(string = curtag, pattern = "[[:lower:]]", replacement = "")
76
-  curchars_up <- nchar(curtag_up, type = "chars")
77
-  if(curchars_up == curchars) {
74
+checkAcronym <- function(string) {
75
+  curtag_up <- str_replace_all(string = string, pattern = "[[:lower:]]", replacement = "")
76
+  #curchars_up <- nchar(curtag_up, type = "chars")
77
+  if(curtag_up == string) {
78 78
     return(TRUE)
79 79
   }
80 80
   else {

Loading…
Cancel
Save