better matching, now with plural forms and less distance

This commit is contained in:
2015-01-21 12:27:09 +01:00
parent e9c5fc7d8d
commit a8987936c4
4 changed files with 488 additions and 452 deletions
+442 -442
View File
@@ -1,417 +1,3 @@
tweets$tags <- ""
for(d in 1:nrow(issues)) {
# Go through every day
curdate <- issues$date[d]
cat(as.character(curdate),"\n")
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
for(t in 1:nrow(tweets_curday)){
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character(tweets_curday$text[t])
curtext <- str_replace_all(curtext, "#", "")
curid <- as.character(tweets_curday$id_str[t])
# Now test each single issue (not tag!)
for(i in 1:length(issuelist)) {
curtags <- as.character(issuelist[[i]])
curissue <- names(issuelist)[i]
curfile <- str_c(id_folder,"/",curissue,".csv")
# Now test all tags of a single issue
for(s in 1:length(curtags)) {
curtag <- curtags[s]
curchars <- nchar(curtag, type = "chars")
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if(curchars <= 4) {
curacro <- checkAcronym(string = curtag, chars = curchars)
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
if(tags_found == 1) {
# Raise number of findings on this day for this issue by 1
issues[d,curissue] <- issues[d,curissue] + 1
# Add issue and first matched tag of tweet to tweets-DF
oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
# Add information to file for function viewPatternMatching
write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
break
}
else {
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
View(issues)
# MATCH TWEETS ------------------------------------------------------------
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""
for(d in 1:nrow(issues)) {
# Go through every day
curdate <- issues$date[d]
cat(as.character(curdate),"\n")
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
for(t in 1:nrow(tweets_curday)){
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character(tweets_curday$text[t])
curtext <- str_replace_all(curtext, "#", "")
curid <- as.character(tweets_curday$id_str[t])
# Now test each single issue (not tag!)
for(i in 1:length(issuelist)) {
curtags <- as.character(issuelist[[i]])
curissue <- names(issuelist)[i]
curfile <- str_c(id_folder,"/",curissue,".csv")
# Now test all tags of a single issue
for(s in 1:length(curtags)) {
curtag <- curtags[s]
curchars <- nchar(curtag, type = "chars")
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if(curchars <= 4) {
curacro <- checkAcronym(string = curtag, chars = curchars)
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
if(tags_found == 1) {
# Raise number of findings on this day for this issue by 1
issues[d,curissue] <- issues[d,curissue] + 1
# Add issue and first matched tag of tweet to tweets-DF
oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
# Add information to file for function viewPatternMatching
write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
break
}
else {
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found)
View(issues)
save(issues, "issues.RData")
save(issues, file="issues.RData")
readYN <- function(question) {
n <- readline(prompt=question)
n <- as.character(n)
return(n)
}
checkIssue <- function(string, issuelist) {
status <- any(str_detect(string, issuelist))
return(status)
}
checkAllIssues <- function(string, issuelist) {
status <- NULL
for(i in 1:length(string)) {
if(checkIssue(string[i], issuelist)) {
status[i] <- TRUE
}
else {
cat("Issue",string[i],"does not exist. Please try again.\n")
status[i] <- FALSE
}
}
return(status)
}
require(stringr)
require(XML)
require(stringr)
require(XML)
# FUNCTIONS ---------------------------------------------------------------
readYN <- function(question) {
n <- readline(prompt=question)
n <- as.character(n)
return(n)
}
checkIssue <- function(string, issuelist) {
status <- any(str_detect(string, issuelist))
return(status)
}
checkAllIssues <- function(string, issuelist) {
status <- NULL
for(i in 1:length(string)) {
if(checkIssue(string[i], issuelist)) {
status[i] <- TRUE
}
else {
cat("Issue",string[i],"does not exist. Please try again.\n")
status[i] <- FALSE
}
}
return(status)
}
c_issues <- data.frame(date = drange)
c_issuelist <- xmlToList("issues.xml")
c_issueheads <- names(issuelist)
c_issues[issueheads] <- 0
source("issuecomp-codingsample-function.R")
c_tweets <- tweets
View(c_tweets)
source("issuecomp-codingsample-function.R")
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)
smartPatternMatch <- function(string, pattern, chars, acronym) {
pattern <- str_c("\\b", pattern, "\\b")
if(chars <= 4) {
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) {
cat("bla")
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
}
else {
found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
}
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)
smartPatternMatch <- function(string, pattern, chars, acronym) {
pattern <- str_c("\\b", pattern, "\\b")
if(chars <= 4) {
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) {
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
}
else {
found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
}
smartPatternMatch <- function(string, pattern, chars, acronym) {
pattern <- str_c("\\b", pattern, "\\b")
if(chars <= 4) {
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) {
found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
}
else {
found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
}
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenracht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschen-recht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschen-Rechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschen-Rechte. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
smartPatternMatch("Bla bla Tomate ", "\\Tomate\\b", 6, FALSE)
smartPatternMatch("Bla bla Tomaten bla bla", "\\Tomate\\b", 6, FALSE)
smartPatternMatch <- function(string, pattern, chars, acronym) {
pattern <- str_c("\\b", pattern, "\\b")
if(chars <= 4) {
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) {
found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
}
else {
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
}
smartPatternMatch("Bla bla Tomaten bla bla", "\\Tomate\\b", 6, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Menschen\\b", 8, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Menschen\\b", 7, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Mensch\\b", 7, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Mensch\\b", 8, FALSE)
smartPatternMatch("Bla bla Nazis bla bla", "\\Nazis\\b", 8, FALSE)
smartPatternMatch("Bla bla Nazis bla bla", "\\Nazis\\b", 5, FALSE)
smartPatternMatch("Bla bla Nazis bla bla", "\\Nazi\\b", 4, FALSE)
smartPatternMatch("Bla bla Nazi bla bla", "\\Nazis\\b", 5, FALSE)
source("issuecomp-codingsample-function.R")
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
str_detect("Der kleine Flüchtlingsjunge war", pattern = "\\bFlüchtling\\b")
str_detect("Der kleine Flüchtlingsjunge war", pattern = "Flüchtling")
str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")
smartPatternMatch <- function(string, pattern, chars, acronym) {
pattern <- str_c("\\b", pattern, "\\b")
if(chars <= 4) { # 4 or less
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) { # 8 or more
found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
cat(found)
}
else { # 5,6,7
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
}
str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
smartPatternMatch <- function(string, pattern, chars, acronym) {
pattern <- str_c("\\b", pattern, "\\b")
if(chars <= 4) { # 4 or less
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) { # 8 or more
found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
cat("it's",found)
}
else { # 5,6,7
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
}
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")
str_detect("Der kleine Flüchtlingsjunge war", pattern = "Flüchtling")
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlinge war", "\\bFlüchtling\\b", 9, FALSE)
grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=TRUE)
grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("Flüchtling","Der kleine Flücht-lingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
smartPatternMatch <- function(string, pattern, chars, acronym) {
patternrex <- str_c("\\b", pattern, "\\b")
if(chars <= 4) { # 4 or less
found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) { # 8 or more
found <- agrep(patternrex, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
if(convertLogical0(found) == 0) {
found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE)
}
}
else { # 5,6,7
found <- agrep(patternrex, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
}
smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 7, FALSE)
c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
for(r in 1:nrow(c_errors)) {
c_errcode <- as.character(c_errors$code[r])
c_errissue <- as.character(c_errors$issue[r])
c_errtags <- as.character(c_errors$tags[r])
c_errtext <- as.character(c_errors$text[r])
c_errid <- as.character(c_errors$str_id[r])
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
source("issuecomp-codingsample-function2.R")
}
View(c_errors)
viewMatchingTweets(date = "2014-05-10", issue = "agrar.204", id_folder)
viewMatchingTweets(date = "2014-05-10", issue = "agrar.402", id_folder)
viewMatchingTweets(date = "2014-01-10", issue = "agrar.402", id_folder)
viewMatchingTweets(date = "2014-01-20", issue = "agrar.402", id_folder)
viewMatchingTweets(date = "2014-01-10", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-04-10", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-05-10", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-02-11", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-08-01", issue = "agrar.403", id_folder)
issuelist <- xmlToList("issues.xml")
issuelist
issuelist[[1]]
xmlTreeParse(file = "issues.xml")
View(issues)
issuelist
issueheads
issuelist[[1]]
issuelist2 <- xmlTreeParse(file = "issues.xml")
issuelist2[[1]]
issuelist2[[2]]
issuelist2[[1,2]]
issuelist2[1
issuelist2[1]
issuelist2$doc$file
issuelist2$doc$version
xmlParse("issues.xml")
issuelist2 <- xmlParse("issues.xml")
issuelist2[1]
issuelist2[2]
issuelist2
issuelist
issuelist$edu.606
issuelist$edu.606[1]
issuelist$edu.606[2]
issuelist$edu.606[3]
issueheads
issuelist$macro.100
length(issuelist$macro.100)
length(issuelist$macro.101)
length(issuelist$macro.103)
length(issuelist$macro.105)
issuelist$macro.105
issuelist$macro.105[2]
issueheads
as.character(issuelist[[1]])
as.character(issuelist[[2]])
test <- issueheads[1]
test
as.character(issuelist$test)
as.character(issuelist$macro.100)
as.character(issuelist[test])
as.character(issuelist[test,1])
as.character(issuelist[1,test])
as.character(issuelist[test])
issuelist[test]
issuelist[test]
length(issuelist[test])
length(issuelist$macro.100)
issuelist$macro.100
test
issuelist[test]
issuelist[,test]
issuelist[,as.character(test)]
issuelist[[test]]
issuelist[,test]
issuelist[test]
issuelist[[test]]
length(issuelist[[test]])
issuelist[[test]]
issuelist[[test]][1]
as.character(issuelist[[test]][1])
as.character(issuelist[[test]])
issueheads
issueheads[2]
as.character(issuelist[[i]])
as.character(issuelist[[1]])
as.character(issuelist[[test]])
i <- 1
curissue <- issueheads[i]
curtags <- as.character(issuelist[[curissue]])
curfile <- str_c(id_folder,"/",curissue,".csv")
curissue
curtags
curfile
curtags[2]
# MATCH TWEETS ------------------------------------------------------------
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
@@ -466,10 +52,142 @@ else {
} # /for issuelist
} # /for tweets_curday
} # /for drange
smartPatternMatch(string = "er ist pädophil ", pattern = "pädophilie", chars = 10, acronym = FALSE)
smartPatternMatch(string = "er ist pädophiler ", pattern = "pädophilie", chars = 10, acronym = FALSE)
smartPatternMatch(string = "er ist pädophiler ", pattern = "Pädophilie", chars = 10, acronym = FALSE)
smartPatternMatch(string = "er ist pädophiles ", pattern = "Pädophilie", chars = 10, acronym = FALSE)
View(tweets)
require(lubridate)
require(XML)
require(ggplot2)
require(reshape2)
require(stringr)
smartPatternMatch("bla bla Matching bla bla", "matching", 8, FALSE)
smartPatternMatch("bla bla Matching bla bla", "mating", 8, FALSE)
source("issuecomp-functions.R")
smartPatternMatch("bla bla Matching bla bla", "mating", 8, FALSE)
test <- c("matching", "matccing", "matxxing")
smartPatternMatch("bla bla Matching bla bla", "matching", 8, FALSE)
smartPatternMatch("bla bla Matching bla bla", "matccing", 8, FALSE)
smartPatternMatch <- function(string, pattern, chars, acronym) {
patternrex <- str_c("\\b", pattern, "\\b")
if(chars <= 4) { # 4 or less
found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) { # 8 or more
found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
# # Give longer words a chance by ignoring word boundaries \\b
# if(convertLogical0(found) == 0) {
# found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE)
# }
}
else { # 5,6,7
found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
}
smartPatternMatch("bla bla Matching bla bla", "matccing", 8, FALSE)
smartPatternMatch("bla bla Matching bla bla", "matxxing", 8, FALSE)
smartPatternMatch("bla bla Matching bla bla", sprintf(), 8, FALSE)
sprintf("%s", test)
smartPatternMatch("bla bla Matching bla bla", sprintf("%s", test), 8, FALSE)
for(i in 1:length(test)) { smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE)}
for(i in 1:length(test)) { cat(smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE))}
for(i in 1:length(test)) { tags_found[i] (smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE))}
for(i in 1:length(test)) { tags_found[i] <- (smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE))}
tags_found
length(tags_found)
any(tags_found)
smartPatternMatch <- function(string, pattern, chars, acronym) {
patternrex <- str_c("\\b", pattern, "\\b")
if(chars <= 4) { # 4 or less
found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) { # 8 or more
found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
# # Give longer words a chance by ignoring word boundaries \\b
# if(convertLogical0(found) == 0) {
# found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE)
# }
}
else { # 5,6,7
found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
if(found == 1) {
found <- TRUE
} else {
found <- FALSE
}
return(found)
}
for(i in 1:length(test)) { tags_found[i] <- (smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE))}
any(tags_found)
tags_found <- NULL
rm(tags_found)
for(i in 1:length(test)) { tags_found[i] <- (smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE))}
tags_found <- NULL
for(i in 1:length(test)) { tags_found[i] <- (smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE))}
tags_found <- NULL
for(i in 1:length(test)) { tags_found[i] <- (smartPatternMatch("bla bla Matching bla bla", test[i], 8, FALSE))}
any(tags_found)
curtag
tagexpand <- c("s", "n", "en")
curtag
curtag[2] <- "bla"
curtag
curtag[2] <- NULL
curtag[2] <- ""
curtag
rm(curtag[2])
curtag <- "Tomate"
for(e in 1:length(tagexpand)) {
curtag[e] <- str_c(curtag[e], tagexpand[e])
}
curtag
for(e in 1:length(tagexpand)) {
curtag[e] <- str_c(curtag, tagexpand[e])
}
curtag <- "Tomate"
for(e in 1:length(tagexpand)) {
curtag[e] <- str_c(curtag, tagexpand[e])
}
curtag
curtag <- "Tomate"
for(e in 1:length(tagexpand)) {
curtag[e] <- str_c(curtag[1], tagexpand[e])
}
curtag
tagexpand <- c("", "s", "n", "en")
for(e in 1:length(tagexpand)) {
curtag[e] <- str_c(curtag[1], tagexpand[e])
}
curtag <- "Tomate"
for(e in 1:length(tagexpand)) {
curtag[e] <- str_c(curtag[1], tagexpand[e])
}
curtag
smartPatternMatch <- function(string, pattern, chars, acronym) {
patternrex <- str_c("\\b", pattern, "\\b")
if(chars <= 4) { # 4 or less
found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) { # 8 or more
found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
# # Give longer words a chance by ignoring word boundaries \\b
# if(convertLogical0(found) == 0) {
# found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE)
# }
}
else { # 5,6,7
found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
if(found == 1) {
found <- TRUE
} else {
found <- FALSE
}
return(found)
}
# MATCH TWEETS ------------------------------------------------------------
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
@@ -479,34 +197,316 @@ issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""
issueheads
issuelist <- xmlToList("issues.xml")
issuelist
issueheads
View(issues)
issuelist$text
issuelist$macro.100
issuelist$macro.101
issuelist$text
issuelist$text <- NULL
issueheads <- names(issuelist)
issueheads
issuelist
issuelist$text <- ""
issuelist
issuelist$text <- NA
issuelist
issuelist$text
issuelist$text[1]
issuelist$text[2]
issuelist$text[6]
issuelist$text[10]
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
tagexpand <- c("", "s", "n", "en")
for(d in 1:nrow(issues)) {
# Go through every day
curdate <- issues$date[d]
cat(as.character(curdate),"\n")
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
for(t in 1:nrow(tweets_curday)){
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character(tweets_curday$text[t])
curtext <- str_replace_all(curtext, "#", "")
curid <- as.character(tweets_curday$id_str[t])
# Now test each single issue (not tag!)
for(i in 1:length(issueheads)) {
curissue <- issueheads[i]
curtags <- as.character(issuelist[[curissue]])
curfile <- str_c(id_folder,"/",curissue,".csv")
# Now test all tags of a single issue
for(s in 1:length(curtags)) {
curtag <- curtags[s]
curchars <- nchar(curtag, type = "chars")
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if(curchars <= 4) {
curacro <- checkAcronym(string = curtag, chars = curchars)
} else {
curacro <- FALSE
}
# Now expand the current tag by possible suffixes that may be plural forms
if(!curacro) {
for(e in 1:length(tagexpand)) {
curtag[e] <- str_c(curtag[1], tagexpand[e])
}
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance)
tags_found <- NULL
tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
if(tags_found == 1) {
# Raise number of findings on this day for this issue by 1
issues[d,curissue] <- issues[d,curissue] + 1
# Add issue and first matched tag of tweet to tweets-DF
oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
# Add information to file for function viewPatternMatching
write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
break
}
else {
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found)
warnings()
tags_found <- NULL
for(e in 1:length(curtag)) {
tags_found[e] <- smartPatternMatch(curtext, curtag[e], curchars, curacro)
}
tags_found
curtext
curtag
any(tags_found)
# MATCH TWEETS ------------------------------------------------------------
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""
tagexpand <- c("", "s", "n", "en")
for(d in 1:nrow(issues)) {
# Go through every day
curdate <- issues$date[d]
cat(as.character(curdate),"\n")
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
for(t in 1:nrow(tweets_curday)){
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character(tweets_curday$text[t])
curtext <- str_replace_all(curtext, "#", "")
curid <- as.character(tweets_curday$id_str[t])
# Now test each single issue (not tag!)
for(i in 1:length(issueheads)) {
curissue <- issueheads[i]
curtags <- as.character(issuelist[[curissue]])
curfile <- str_c(id_folder,"/",curissue,".csv")
# Now test all tags of a single issue
for(s in 1:length(curtags)) {
curtag <- curtags[s]
curchars <- nchar(curtag, type = "chars")
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if(curchars <= 4) {
curacro <- checkAcronym(string = curtag, chars = curchars)
} else {
curacro <- FALSE
}
# Now expand the current tag by possible suffixes that may be plural forms
if(!curacro) {
for(e in 1:length(tagexpand)) {
curtag[e] <- str_c(curtag[1], tagexpand[e])
}
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance)
tags_found <- NULL
for(e in 1:length(curtag)) {
tags_found[e] <- smartPatternMatch(curtext, curtag[e], curchars, curacro)
}
tags_found <- any(tags_found)
if(tags_found == TRUE) {
# Raise number of findings on this day for this issue by 1
issues[d,curissue] <- issues[d,curissue] + 1
# Add issue and first matched tag of tweet to tweets-DF
oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
# Add information to file for function viewPatternMatching
write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
break
}
else {
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found)
curtag
curtag <- curtag[1]
curtag
# MATCH TWEETS ------------------------------------------------------------
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""
tagexpand <- c("", "s", "n", "en")
for(d in 1:nrow(issues)) {
# Go through every day
curdate <- issues$date[d]
cat(as.character(curdate),"\n")
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
for(t in 1:nrow(tweets_curday)){
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character(tweets_curday$text[t])
curtext <- str_replace_all(curtext, "#", "")
curid <- as.character(tweets_curday$id_str[t])
# Now test each single issue (not tag!)
for(i in 1:length(issueheads)) {
curissue <- issueheads[i]
curtags <- as.character(issuelist[[curissue]])
curfile <- str_c(id_folder,"/",curissue,".csv")
# Now test all tags of a single issue
for(s in 1:length(curtags)) {
curtag <- curtags[s]
curchars <- nchar(curtag, type = "chars")
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if(curchars <= 4) {
curacro <- checkAcronym(string = curtag, chars = curchars)
} else {
curacro <- FALSE
}
# Now expand the current tag by possible suffixes that may be plural forms
if(!curacro) {
for(e in 1:length(tagexpand)) {
curtag[e] <- str_c(curtag[1], tagexpand[e])
}
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance)
tags_found <- NULL
for(e in 1:length(curtag)) {
tags_found[e] <- smartPatternMatch(curtext, curtag[e], curchars, curacro)
}
tags_found <- any(tags_found)
curtag <- curtag[1]
if(tags_found == TRUE) {
# Raise number of findings on this day for this issue by 1
issues[d,curissue] <- issues[d,curissue] + 1
# Add issue and first matched tag of tweet to tweets-DF
oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
# Add information to file for function viewPatternMatching
write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
break
}
else {
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
View(tweets)
require(jsonlite)
require(stringr)
require(devtools)
require(RTwitterAPI)
acc_df <- read.csv("MdB-twitter.csv")
delrow <- NULL
for(r in 1:nrow(acc_df)) {
acc <- as.character(acc_df$twitter_acc[r])
if(!nzchar(acc)) {
delrow <- c(delrow, r)
}
}
acc_df <- acc_df[-delrow, ]
rm(delrow, r, acc)
acc_df$row.names <- NULL
row.names(acc_df) <- NULL
View(acc_df)
acc_df(acc_df$party == "linke")
acc_df[acc_df$party == "linke"]
acc_df[, acc_df$party == "linke"]
acc_df[acc_df$party == "linke", ]
length(acc_df[acc_df$party == "linke", ])
nrow(acc_df[acc_df$party == "linke", ])
nrow(acc_df[acc_df$party == "linke", ]) / 280
nrow(acc_df[acc_df$party == "gruene", ]) / 280
nrow(acc_df[acc_df$party == "cducsu", ]) / 280
nrow(acc_df[acc_df$party == "spd", ]) / 280
test <- c("linke", "gruene")
nrow(acc_df[acc_df$party == sprintf("%s", test), ]) / 280
test
nrow(acc_df[acc_df$party == sprintf("%s", test), ]) / 280
acc_parties <- c("cducsu", "spd", "linke", "gruene")
acc_parties <- data.frame(party = c("cducsu", "spd", "linke", "gruene"))
View(acc_parties)
acc_parties$btw13 <- c(41.5, 25.7, 8.6, 8.4)
View(acc_parties)
acc_parties$twitter <- 0
View(acc_parties)
for(p in 1:length(acc_parties)) {
acc_parties$twitter[p] <- as.numeric(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280)
}
View(acc_parties)
as.numeric(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100)
round(14.64282, digits = 1)
round(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280), digits=1)
nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280)
nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280
nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100
round(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100)
round(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100, digits=1)
for(p in 1:length(acc_parties)) {
acc_parties$twitter[p] <- round(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100, digits=1)
}
View(acc_parties)
View(acc_parties)
acc_parties$twitter <- 0
for(p in 1:length(acc_parties)) {
acc_parties$twitter[p] <- round(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100, digits=1)
}
View(acc_parties)
nrow(acc_df[acc_df$party == "gruene", ]) / 280
as.character(acc_parties$party[4])
acc_parties <- data.frame(party = c("cducsu", "spd", "linke", "gruene"))
acc_parties$btw13 <- c(41.5, 25.7, 8.6, 8.4)
acc_parties$twitter <- 0
for(p in 1:length(acc_parties)) {
acc_parties$twitter[p] <- round(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100)
}
View(acc_parties)
round(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100)
p
acc_parties
acc_parties <- data.frame(party = c("cducsu", "spd", "linke", "gruene"))
acc_parties$btw13 <- c(41.5, 25.7, 8.6, 8.4)
acc_parties$twitter <- 0
for(p in 1:nrow(acc_parties)) {
acc_parties$twitter[p] <- round(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100)
}
View(acc_parties)
acc_parties <- data.frame(party = c("cducsu", "spd", "linke", "gruene"))
acc_parties$btw13 <- c(49.3, 30.6, 10.1, 10.0)
acc_parties$twitter <- 0
for(p in 1:nrow(acc_parties)) {
acc_parties$twitter[p] <- round(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100)
}
View(acc_parties)
pie(acc_parties$btw13)
pie(acc_parties$btw13, col=c("black", "red", "purple", "green"))
pie(acc_parties$btw13, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"))
pie(acc_parties$twitter, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"))
pie(acc_parties$twitter, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = T)
pie(acc_btw13$twitter, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = T)
pie(acc_parties$btw13, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = T)
acc_parties <- data.frame(party = c("cducsu", "spd", "linke", "gruene"))
acc_parties$btw13 <- c(49.3, 30.6, 10.1, 10.0) # seats of party / 631 seats
acc_parties$twitter <- 0
for(p in 1:nrow(acc_parties)) {
acc_parties$twitter[p] <- round(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100)
}
pie(acc_parties$btw13, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = T)
pie(acc_parties$twitter, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = T)
pie(acc_parties$btw13, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = T,
main = "Seats of parties in the parliament")
pie(acc_parties$twitter, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = T,
main = "Percentage of parties' MdBs of all Twitter accounts")
rm(acc_parties)
+34 -3
View File
@@ -29,6 +29,8 @@ issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""
tagexpand <- c("", "s", "n", "en")
for(d in 1:nrow(issues)) {
# Go through every day
curdate <- issues$date[d]
@@ -61,10 +63,23 @@ for(d in 1:nrow(issues)) {
} else {
curacro <- FALSE
}
# Now expand the current tag by possible suffixes that may be plural forms
if(!curacro) {
for(e in 1:length(tagexpand)) {
curtag[e] <- str_c(curtag[1], tagexpand[e])
}
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
if(tags_found == 1) {
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance)
tags_found <- NULL
for(e in 1:length(curtag)) {
tags_found[e] <- smartPatternMatch(curtext, curtag[e], curchars, curacro)
}
tags_found <- any(tags_found)
curtag <- curtag[1]
if(tags_found == TRUE) {
# Raise number of findings on this day for this issue by 1
issues[d,curissue] <- issues[d,curissue] + 1
@@ -117,6 +132,22 @@ g1
rm(g1, r)
# Show party percentage of twitter users
acc_parties <- data.frame(party = c("cducsu", "spd", "linke", "gruene"))
acc_parties$btw13 <- c(49.3, 30.6, 10.1, 10.0) # seats of party / 631 seats
acc_parties$twitter <- 0
for(p in 1:nrow(acc_parties)) {
acc_parties$twitter[p] <- round(nrow(acc_df[acc_df$party == as.character(acc_parties$party[p]), ]) / 280 * 100)
}
pie(acc_parties$btw13, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = T,
main = "Seats of parties in the parliament")
pie(acc_parties$twitter, col=c("black", "red", "purple", "green"), labels = c("CDU/CSU", "SPD", "Die LINKE", "Bündnis 90/Grüne"), clockwise = T,
main = "Percentage of parties' MdBs of all Twitter accounts")
rm(acc_parties, p)
# VISUALS -----------------------------------------------------------------
+11 -5
View File
@@ -33,16 +33,22 @@ smartPatternMatch <- function(string, pattern, chars, acronym) {
found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) { # 8 or more
found <- agrep(patternrex, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
# Give longer words a chance by ignoring word boundaries \\b
if(convertLogical0(found) == 0) {
found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE)
}
found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
# # Give longer words a chance by ignoring word boundaries \\b
# if(convertLogical0(found) == 0) {
# found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE)
# }
}
else { # 5,6,7
found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
if(found == 1) {
found <- TRUE
} else {
found <- FALSE
}
return(found)
}
+1 -2
View File
@@ -24,7 +24,7 @@
<macro.105>
<tag>Staatsverschuldung</tag>
<tag>Schuldenquote</tag>
<tag>Haushaltskürzungen</tag>
<tag>Haushaltskürzung</tag>
<tag>Staatsdefizit</tag>
<tag>Finanzpolitik</tag>
<tag>Haushaltspolitik</tag>
@@ -95,7 +95,6 @@
</civil.208>
<civil.209>
<tag>Extremismus</tag>
<tag>Spione</tag>
<tag>Spion</tag>
<tag>linksradikal</tag>
<tag>rechtsradikal</tag>