From 54d1cd79aaf0a3b60a670d7917a8bea6ee6f36c8 Mon Sep 17 00:00:00 2001 From: mxmehl Date: Wed, 21 Jan 2015 13:17:24 +0100 Subject: [PATCH] better pattern matching logic --- .Rhistory | 400 +++++++++++++++++++++--------------------- issuecomp-analysis.R | 10 +- issuecomp-functions.R | 33 ++-- 3 files changed, 228 insertions(+), 215 deletions(-) diff --git a/.Rhistory b/.Rhistory index 565f556..8e00e3e 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,203 +1,3 @@ -dir.create(id_folder) -issues <- data.frame(date = drange) -issuelist <- xmlToList("issues.xml") -issueheads <- names(issuelist) -issues[issueheads] <- 0 -tweets$issue <- "" -tweets$tags <- "" -for(d in 1:nrow(issues)) { -# Go through every day -curdate <- issues$date[d] -cat(as.character(curdate),"\n") -# Put all tweets from specific day in a temporary DF -tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] -for(t in 1:nrow(tweets_curday)){ -# Select tweet's text, make it lowercase and remove hashtag indicators (#) -curtext <- as.character(tweets_curday$text[t]) -curtext <- str_replace_all(curtext, "#", "") -curid <- as.character(tweets_curday$id_str[t]) -# Now test each single issue (not tag!) -for(i in 1:length(issueheads)) { -curissue <- issueheads[i] -curtags <- as.character(issuelist[[curissue]]) -curfile <- str_c(id_folder,"/",curissue,".csv") -# Now test all tags of a single issue -for(s in 1:length(curtags)) { -curtag <- curtags[s] -curchars <- nchar(curtag, type = "chars") -# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch -if(curchars <= 4) { -curacro <- checkAcronym(string = curtag, chars = curchars) -} else { -curacro <- FALSE -} -# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance) -tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro) -if(tags_found == 1) { -# Raise number of findings on this day for this issue by 1 -issues[d,curissue] <- issues[d,curissue] + 1 -# Add issue and first matched tag of tweet to tweets-DF -oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] -tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") -oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] -tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") -# Add information to file for function viewPatternMatching -write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) -break -} -else { -#cat("Nothing found\n") -} -} # /for curtags -} # /for issuelist -} # /for tweets_curday -} # /for drange -View(tweets) -require(lubridate) -require(XML) -require(ggplot2) -require(reshape2) -require(stringr) -smartPatternMatch("bla bla Matching bla bla", "matching", 8, FALSE) -smartPatternMatch("bla bla Matching bla bla", "mating", 8, FALSE) -source("issuecomp-functions.R") -smartPatternMatch("bla bla Matching bla bla", "mating", 8, FALSE) -test <- c("matching", "matccing", "matxxing") -smartPatternMatch("bla bla Matching bla bla", "matching", 8, FALSE) -smartPatternMatch("bla bla Matching bla bla", "matccing", 8, FALSE) -smartPatternMatch <- function(string, pattern, chars, acronym) { -patternrex <- str_c("\\b", pattern, "\\b") -if(chars <= 4) { # 4 or less -found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE) -} -else if(chars >= 8) { # 8 or more -found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) -# # Give longer words a chance by ignoring word boundaries \\b -# if(convertLogical0(found) == 0) { -# found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE) -# } -} -else { # 5,6,7 -found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) -} -found <- convertLogical0(found) -return(found) -} -smartPatternMatch("bla bla Matching bla bla", "matccing", 8, FALSE) -smartPatternMatch("bla bla Matching bla bla", "matxxing", 8, FALSE) -smartPatternMatch("bla bla Matching bla bla", sprintf(), 8, FALSE) -sprintf("%s", test) -smartPatternMatch("bla bla Matching bla bla", sprintf("%s", test), 8, FALSE) -for(i in 1:length(test)) { smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE)} -for(i in 1:length(test)) { cat(smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE))} -for(i in 1:length(test)) { tags_found[i] (smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE))} -for(i in 1:length(test)) { tags_found[i] <- (smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE))} -tags_found -length(tags_found) -any(tags_found) -smartPatternMatch <- function(string, pattern, chars, acronym) { -patternrex <- str_c("\\b", pattern, "\\b") -if(chars <= 4) { # 4 or less -found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE) -} -else if(chars >= 8) { # 8 or more -found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) -# # Give longer words a chance by ignoring word boundaries \\b -# if(convertLogical0(found) == 0) { -# found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE) -# } -} -else { # 5,6,7 -found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) -} -found <- convertLogical0(found) -if(found == 1) { -found <- TRUE -} else { -found <- FALSE -} -return(found) -} -for(i in 1:length(test)) { tags_found[i] <- (smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE))} -any(tags_found) -tags_found <- NULL -rm(tags_found) -for(i in 1:length(test)) { tags_found[i] <- (smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE))} -tags_found <- NULL -for(i in 1:length(test)) { tags_found[i] <- (smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE))} -tags_found <- NULL -for(i in 1:length(test)) { tags_found[i] <- (smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE))} -any(tags_found) -curtag -tagexpand <- c("s", "n", "en") -curtag -curtag[2] <- "bla" -curtag -curtag[2] <- NULL -curtag[2] <- "" -curtag -rm(curtag[2]) -curtag <- "Tomate" -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[e], tagexpand[e]) -} -curtag -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag, tagexpand[e]) -} -curtag <- "Tomate" -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag, tagexpand[e]) -} -curtag -curtag <- "Tomate" -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -curtag -tagexpand <- c("", "s", "n", "en") -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -curtag <- "Tomate" -for(e in 1:length(tagexpand)) { -curtag[e] <- str_c(curtag[1], tagexpand[e]) -} -curtag -smartPatternMatch <- function(string, pattern, chars, acronym) { -patternrex <- str_c("\\b", pattern, "\\b") -if(chars <= 4) { # 4 or less -found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE) -} -else if(chars >= 8) { # 8 or more -found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) -# # Give longer words a chance by ignoring word boundaries \\b -# if(convertLogical0(found) == 0) { -# found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE) -# } -} -else { # 5,6,7 -found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) -} -found <- convertLogical0(found) -if(found == 1) { -found <- TRUE -} else { -found <- FALSE -} -return(found) -} -# MATCH TWEETS ------------------------------------------------------------ -id_folder <- "matched-ids" -unlink(id_folder, recursive = TRUE) -dir.create(id_folder) -issues <- data.frame(date = drange) -issuelist <- xmlToList("issues.xml") -issueheads <- names(issuelist) -issues[issueheads] <- 0 -tweets$issue <- "" -tweets$tags <- "" -tagexpand <- c("", "s", "n", "en") for(d in 1:nrow(issues)) { # Go through every day curdate <- issues$date[d] @@ -510,3 +310,203 @@ main = "Seats of parties in the parliament") pie(acc_parties$twitter, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = T, main = "Percentage of parties' MdBs of all Twitter accounts") rm(acc_parties) +require(lubridate) +require(XML) +require(ggplot2) +require(reshape2) +require(stringr) +source("issuecomp-functions.R") +curchars +curchars <- 7 +curchars >= 5 && curchars <= 7 +curchars <- 10 +curchars >= 5 && curchars <= 7 +curchars <- 4 +curchars >= 5 && curchars <= 7 +if(curchars <= 4) { +curdistance <- 0 +} +else if {curchars >= 5} { +curdistance <- 1 +} +if(curchars <= 4) { +curdistance <- 0 +} else if {curchars >= 5} { +curdistance <- 1 +} +if(curchars <= 4) { +curdistance <- 0 +} else { +curdistance <- 1 +} +curdistance +source("issuecomp-functions.R") +smartPatternMatch("bla bla Tomate bla", "tomaten", 0, F) +smartPatternMatch("bla bla Tomate bla", "tomaten", 1, F) +smartPatternMatch("bla bla Tomate bla", "tomatens", 1, F) +smartPatternMatch("bla bla Tomate bla", "tomatens", 2, F) +rm(list=ls()) +require(lubridate) +require(XML) +require(ggplot2) +require(reshape2) +require(stringr) +source("issuecomp-functions.R") +load(file = "tweets_untagged.RData") +date_start <- as.Date("2014-01-01") +date_end <- as.Date("2014-12-31") +drange <- as.integer(date_end - date_start) +drange <- date_start + days(0:drange) +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +tweets$issue <- "" +tweets$tags <- "" +tagexpand <- c("", "s", "n", "en") +for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(as.character(curdate),"\n") +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issueheads)) { +curissue <- issueheads[i] +curtags <- as.character(issuelist[[curissue]]) +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Now expand the current tag by possible suffixes that may be plural forms +if(!curacro) { +for(e in 1:length(tagexpand)) { +curtag[e] <- str_c(curtag[1], tagexpand[e]) +} +} +# Set Levenshtein distance depending on char length +if(curchars <= 4) { +curdistance <- 0 +} else { +curdistance <- 1 +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) +tags_found <- NULL +# Match the tweet with each variation of tagexpand +for(e in 1:length(curtag)) { +tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) +} +tags_found <- any(tags_found) +curtag <- curtag[1] +if(tags_found == TRUE) { +# Raise number of findings on this day for this issue by 1 +issues[d,curissue] <- issues[d,curissue] + 1 +# Add issue and first matched tag of tweet to tweets-DF +oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] +tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") +oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] +tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") +# Add information to file for function viewPatternMatching +write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) +break +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +tweets$issue <- "" +tweets$tags <- "" +tagexpand <- c("", "s", "n", "en") +for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(as.character(curdate),"\n") +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issueheads)) { +curissue <- issueheads[i] +curtags <- as.character(issuelist[[curissue]]) +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Now expand the current tag by possible suffixes that may be plural forms +if(!curacro) { +for(e in 1:length(tagexpand)) { +curtag[e] <- str_c(curtag[1], tagexpand[e]) +} +} +# Set Levenshtein distance depending on char length +if(curchars <= 4) { +curdistance <- 0 +} else { +curdistance <- 1 +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) +tags_found <- NULL +# Match the tweet with each variation of tagexpand +for(e in 1:length(curtag)) { +tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) +} +tags_found <- any(tags_found) +curtag <- curtag[1] +if(tags_found == TRUE) { +# Raise number of findings on this day for this issue by 1 +issues[d,curissue] <- issues[d,curissue] + 1 +# Add issue and first matched tag of tweet to tweets-DF +oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] +tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") +oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] +tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") +# Add information to file for function viewPatternMatching +write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) +break +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +View(issues) diff --git a/issuecomp-analysis.R b/issuecomp-analysis.R index 44d2c41..527ab98 100644 --- a/issuecomp-analysis.R +++ b/issuecomp-analysis.R @@ -70,11 +70,19 @@ for(d in 1:nrow(issues)) { curtag[e] <- str_c(curtag[1], tagexpand[e]) } } + + # Set Levenshtein distance depending on char length + if(curchars <= 4) { + curdistance <- 0 + } else { + curdistance <- 1 + } # Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance) tags_found <- NULL + # Match the tweet with each variation of tagexpand for(e in 1:length(curtag)) { - tags_found[e] <- smartPatternMatch(curtext, curtag[e], curchars, curacro) + tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro) } tags_found <- any(tags_found) curtag <- curtag[1] diff --git a/issuecomp-functions.R b/issuecomp-functions.R index 4ab73da..f9c639a 100644 --- a/issuecomp-functions.R +++ b/issuecomp-functions.R @@ -26,22 +26,27 @@ convertLogical0 <- function(var) { return(var) } -smartPatternMatch <- function(string, pattern, chars, acronym) { +smartPatternMatch <- function(string, pattern, dist, acronym) { patternrex <- str_c("\\b", pattern, "\\b") - if(chars <= 4) { # 4 or less - found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE) - } - else if(chars >= 8) { # 8 or more - found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) -# # Give longer words a chance by ignoring word boundaries \\b -# if(convertLogical0(found) == 0) { -# found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE) -# } - } - else { # 5,6,7 - found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) - } + found <- agrep(patternrex, string, max.distance = list(all = dist), ignore.case = !acronym, fixed = FALSE) + +# if(chars <= 4) { # 4 or less +# found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE) +# } +# else if(chars >= 8) { # 8 or more +# found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) +# # # Give longer words a chance by ignoring word boundaries \\b +# # if(convertLogical0(found) == 0) { +# # found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE) +# # } +# } +# else { # 5,6,7 +# found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) +# } +# + + # Convert 0/1 to F/T found <- convertLogical0(found) if(found == 1) { found <- TRUE