before changing fuzzy matching because it sucks

2015-01-18 23:35:47 +01:00
parent 4693f8571e
commit e9c5fc7d8d
3 changed files with 404 additions and 383 deletions
@@ -1,381 +1,3 @@
 View(c_errors)
 for(r in 1:nrow(c_errors)) {
 c_errcode <- as.character(c_errors$code[r])
 c_errissue <- as.character(c_errors$issue[r])
 c_errtags <- as.character(c_errors$tags[r])
 c_errtext <- as.character(c_errors$text[r])
 c_errid <- as.character(c_errors$str_id[r])
 cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
 source("issuecomp-codingsample-function2.R")
 }
 for(r in 1:nrow(c_errors)) {
 c_errcode <- as.character(c_errors$code[r])
 c_errissue <- as.character(c_errors$issue[r])
 c_errtags <- as.character(c_errors$tags[r])
 c_errtext <- as.character(c_errors$text[r])
 c_errid <- as.character(c_errors$str_id[r])
 cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
 source("issuecomp-codingsample-function2.R")
 }
 c_curissue
 c_curtags
 c_errissue
 c_errtags
 c_errid
 delrow <- NULL
 for(r in 1:nrow(tweets)) {
 if(format(tweets$created_at[r], "%Y") != "2014") {
 delrow <- c(delrow, r)
 }
 curtext <- as.character(tweets$text[r])
 curtext <- str_replace_all(curtext, "$", " ")
 curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
 }
 r
 require(stringr)
 View(tweets)
 df <- data.frame(x= c("zeile1","zeile2"))
 View(df)
 df$x[1] <- "blabla"
 View(df)
 df <- data.frame(x= c("zeile1","zeile2"))
 test <- "bla bla"
 df$x[1] <- test
 View(df)
 df$x[1] <- as.character(test)
 class(df$x)
 df$x[1] <- as.factor(test)
 head(tweet)
 head(tweets)
 df <- head(tweets)
 View(df)
 df$text[1] <- "test"
 View(tweets)
 View(df)
 for(r in 1:nrow(tweets)) {
 #   if(format(tweets$created_at[r], "%Y") != "2014") {
 #     delrow <- c(delrow, r)
 #   }
 curtext <- as.character(tweets$text[r])
 curtext <- str_replace_all(curtext, "$", " ")
 curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
 tweets$text[r] <- curtext
 }
 View(tweets)
 View(c_tweets)
 rm(delrow, r)
 save(tweets, file="tweets_untagged.RData")
 row.names(tweets) <- NULL
 write.csv(tweets, "tweets.csv")
 save(tweets, file="tweets.RData")
 c_tweets <- read.csv("tweets.csv")
 c_tweets$X <- NULL
 View(c_tweets)
 viewMatchingTweets
 c_tweets <- read.csv("tweets.csv", colClasses="character")
 c_tweets$X <- NULL
 View(c_tweets)
 View(c_issues)
 c_errtags
 for(r in 1:nrow(c_errors)) {
 c_errcode <- as.character(c_errors$code[r])
 c_errissue <- as.character(c_errors$issue[r])
 c_errtags <- as.character(c_errors$tags[r])
 c_errtext <- as.character(c_errors$text[r])
 c_errid <- as.character(c_errors$str_id[r])
 cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
 source("issuecomp-codingsample-function2.R")
 }
 curissue
 c_curissue
 str_join(c_curissue)
 str_join(c_curissue,collapse = NULL)
 str_join(c_curissue,sep=";",collapse = NULL)
 paste(c_curissue,sep = "")
 paste(c_curissue,sep = '')
 length(paste(c_curissue,sep = ''))
 str_join(c_curissue,sep=";",collapse = "")
 str_join(c_curissue,sep=";",collapse = "w")
 str_join(c_curissue,collapse = ";")
 for(r in 1:nrow(c_errors)) {
 c_errcode <- as.character(c_errors$code[r])
 c_errissue <- as.character(c_errors$issue[r])
 c_errtags <- as.character(c_errors$tags[r])
 c_errtext <- as.character(c_errors$text[r])
 c_errid <- as.character(c_errors$str_id[r])
 cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
 source("issuecomp-codingsample-function2.R")
 }
 require(jsonlite)
 require(stringr)
 require(devtools)
 require(RTwitterAPI)
 setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
 source("issuecomp-functions.R")
 acc_df <- read.csv("MdB-twitter.csv")
 delrow <- NULL
 for(r in 1:nrow(acc_df)) {
 acc <- as.character(acc_df$twitter_acc[r])
 if(!nzchar(acc)) {
 delrow <- c(delrow, r)
 }
 }
 acc_df <- acc_df[-delrow, ]
 rm(delrow, r, acc)
 acc_df$row.names <- NULL
 row.names(acc_df) <- NULL
 require(lubridate)
 require(XML)
 require(ggplot2)
 require(reshape2)
 require(stringr)
 source("issuecomp-functions.R")
 load("tweets.RData")
 View(tweets)
 View(tweets)
 View(tweets)
 date_start <- as.Date("2014-01-01")
 date_start + days
 date_start + days(1)
 date_start + days(0)
 date_start + days(0:2)
 date_start <- as.Date("2014-01-01")
 date_end <- as.Date("2014-12-31")
 drange <- as.integer(date_end - date_start)
 drange <- date_start + days(0:drange)
 curdate <- date_start + days(2)
 curdate
 tweets[tweets[, "created_at"] == curdate, "msg_id"]
 View(tweets)
 curdate
 tweets[tweets[, "created_at"] == "2014-01-01", "msg_id"]
 tweets[tweets[, "created_at"] == curdate, "id_str"]
 drange
 length(tweets[tweets[, "created_at"] == curdate, "id_str"])
 length(tweets[tweets[, "created_at"] == curdate+1, "id_str"])
 length(tweets[tweets[, "created_at"] == curdate+15, "id_str"])
 stats <- data.frame(x=NULL)
 View(stats)
 stats <- data.frame(date=drange)
 View(stats)
 stats$tpd <- NULL
 stats$tpd <- ""
 stats$tpd <- NULL
 stats$tpd[1] <- 2
 View(stats)
 stats$tpd[2] <- 3
 View(stats)
 stats$tpd <- ""
 stats$tpd <- NULL
 stats$tpd <- ""
 stats$tpd[1] <- 2
 View(tweets)
 View(stats)
 stats <- data.frame(date=drange)
 stats$tpd <- ""
 # Total number of tweets per day over time
 for(r in 1:length(drange)) {
 stats$tpd[r] <- length(tweets[tweets[, "created_at"] == curdate, "id_str"])
 }
 View(stats)
 drange[2]
 stats <- data.frame(date=drange)
 stats$tpd <- ""
 # Total number of tweets per day over time
 for(r in 1:length(drange)) {
 stats$tpd[r] <- length(tweets[tweets[, "created_at"] == drange[r], "id_str"])
 }
 View(stats)
 plot.ts(x = stats$tpd, y=stats$date)
 plot.ts(x = stats$date, y=stats$tpd)
 g1 <- ggplot(stats, aes(date,tpd))
 g1 <- g1 + geom_histogram(fill="steelblue", stat="identity")
 g1 <- g1 + stat_smooth(size=1,colour="red",method="loess", se=FALSE)
 g1 <- gg1 + ggtitle("Zeitliche Entwicklung von plötzlichen Medienfokussen") + xlab("Einzelne Monate") + ylab("Plötzliche Medienfokusse")
 g1
 g1 <- ggplot(stats, aes(date,tpd))
 g1 <- g1 + geom_histogram(fill="steelblue", stat="identity")
 g1
 g1 <- g1 + stat_smooth(size=1,colour="red",method="loess", se=FALSE)
 g1
 g1 <- ggplot(stats, aes(date,tpd))
 g1 <- g1 + geom_line()
 g1
 g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd, color=black))
 g1
 g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd, color="black"))
 g1
 g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd))
 g1
 ggplot(stats, aes(date, tpd)) + geom_line() +
 scale_x_date(format = "%b-%Y") + xlab("") + ylab("Daily Views")
 ggplot(stats, aes(date, tpd)) + geom_line() +
 scale_x_date() + xlab("") + ylab("Daily Views")
 lapply(stats, class)
 stats$tpd <- 0
 lapply(stats, class)
 stats <- data.frame(date=drange)
 stats$tpd <- 0
 # Total number of tweets per day over time
 for(r in 1:length(drange)) {
 stats$tpd[r] <- length(tweets[tweets[, "created_at"] == drange[r], "id_str"])
 }
 View(stats)
 plot.ts(x = stats$date, y=stats$tpd)
 g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd))
 g1
 ggplot(stats, aes(date, tpd)) + geom_line() +
 scale_x_date() + xlab("") + ylab("Daily Views")
 g1 <- ggplot() +
 geom_line(data = stats, aes(x=date,y=tpd)) +
 stats_smooth(size=1,colour="red",method="loess", se=FALSE)
 g1
 g1 <- ggplot() +
 geom_line(data = stats, aes(x=date,y=tpd)) +
 stat_smooth(size=1,colour="red",method="loess", se=FALSE)
 g1
 g1 <- ggplot() +
 geom_line(data = stats, aes(x=date,y=tpd)) +
 stat_smooth(colour="red",method="loess", se=FALSE)
 g1
 g1 <- ggplot() +
 geom_line(data = stats, aes(x=date,y=tpd)) +
 stat_smooth(colour="red",method="lm", se=FALSE)
 g1
 g1 <- ggplot() +
 geom_line(data = stats, aes(x=date,y=tpd)) +
 geom_smooth(colour="red",method="lm", se=FALSE)
 g1
 g1 <- ggplot() +
 geom_line(data = stats, aes(x=date,y=tpd)) +
 geom_smooth(colour="red",method="loess", se=FALSE)
 g1
 g1 <- ggplot() +
 geom_line(data = stats, aes(x=date,y=tpd)) +
 geom_smooth(size=1,colour="red",method="loess", se=FALSE)
 g1
 g1 <- g1 + geom_smooth(size=1,colour="red", method="loess", se=FALSE)
 g1
 g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd))
 g1 <- g1 + geom_smooth(size=1,colour="red", method="loess", se=FALSE)
 g1
 g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd))
 g1
 g1 + geom_smooth()
 g1 + geom_smqwd
 g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd))
 g1
 g1 <- g1 + geom_smooth(size=1,colour="red", method="loess", se=FALSE)
 g1
 g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd))
 g1 <- g1 + geom_smooth(size=1,method="loess", se=FALSE)
 g1
 g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd))
 g1 <- g1 + geom_smooth(size=1,method="loess", se=FALSE, aes(group=1))
 g1
 g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd)) +
 geom_smooth(size=1,method="loess", se=FALSE, aes(group=1))
 g1
 g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd)) +
 geom_smooth(size=1,method="loess", se=FALSE, aes(x = date, y=tpd))
 g1
 geom_smooth(size=1,method="loess", se=FALSE)
 g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd)) +
 geom_smooth(size=1,method="loess", se=FALSE)
 g1
 plot.ts(x = stats$date, y=stats$tpd)
 g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd, color=group)) +
 geom_smooth(size=1,method="loess", se=FALSE)
 g1
 install.packages(c("BH", "bibtex", "devtools", "dplyr", "httr", "jsonlite", "lazyeval", "manipulate", "RCurl", "ROAuth", "rstudioapi", "sp", "stringi"))
 g1 <- ggplot(data = stats, aes(x=date,y=tpd, color=variable, group=variable)) +
 geom_line() +
 geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE)
 g1
 stats_melt <- melt(stats, id="date")
 View(stats_melt)
 View(stats_melt)
 stats_melt <- melt(stats, id="date")
 g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color=variable, group=variable)) +
 geom_line() +
 geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE)
 g1
 g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color=1, group=variable)) +
 geom_line() +
 geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE)
 g1
 g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color=1, group=variable)) +
 geom_line() +
 geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, colors="red")
 g1
 g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color=1, group=variable)) +
 geom_line() +
 geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
 g1
 g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color="black", group=variable)) +
 geom_line() +
 geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
 g1
 g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color="yellow", group=variable)) +
 geom_line() +
 geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
 g1
 g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour="black", group=variable)) +
 geom_line() +
 geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
 g1
 g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour="#FFFFFF", group=variable)) +
 geom_line() +
 geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
 g1
 g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour="vqwdqw", group=variable)) +
 geom_line() +
 geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
 g1
 g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour=variable, group=variable)) +
 geom_line() +
 geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
 g1
 g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour=1, group=variable)) +
 geom_line() +
 geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
 g1
 g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour=variable, group=variable)) +
 geom_line() +
 geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color=1)
 g1
 require(lubridate)
 require(XML)
 require(ggplot2)
 require(reshape2)
 require(stringr)
 source("issuecomp-functions.R")
 rm(curdate)
 rm(date_end, date_start)
 rm(g1, r, )
 rm(g1, r)
 id_folder <- "matched-ids"
 unlink(id_folder, recursive = TRUE)
 dir.create(id_folder)
 issues <- data.frame(date = drange)
 View(issues)
 issuelist <- xmlToList("issues.xml")
 issuelist
 issueheads <- names(issuelist)
 issueheads
 issues[issueheads] <- 0
 tweets$issue <- ""
 tweets$tags <- ""
 View(issues)
 # MATCH TWEETS ------------------------------------------------------------
 id_folder <- "matched-ids"
 unlink(id_folder, recursive = TRUE)
 dir.create(id_folder)
 issues <- data.frame(date = drange)
 issuelist <- xmlToList("issues.xml")
 issueheads <- names(issuelist)
 issues[issueheads] <- 0
 tweets$issue <- ""
 tweets$tags <- ""
 for(d in 1:nrow(issues)) {
 # Go through every day
@@ -510,3 +132,381 @@ return(status)
 }
 require(stringr)
 require(XML)
 require(stringr)
 require(XML)
 # FUNCTIONS ---------------------------------------------------------------
 readYN <- function(question) {
 n <- readline(prompt=question)
 n <- as.character(n)
 return(n)
 }
 checkIssue <- function(string, issuelist) {
 status <- any(str_detect(string, issuelist))
 return(status)
 }
 checkAllIssues <- function(string, issuelist) {
 status <- NULL
 for(i in 1:length(string)) {
 if(checkIssue(string[i], issuelist)) {
 status[i] <- TRUE
 }
 else {
 cat("Issue",string[i],"does not exist. Please try again.\n")
 status[i] <- FALSE
 }
 }
 return(status)
 }
 c_issues <- data.frame(date = drange)
 c_issuelist <- xmlToList("issues.xml")
 c_issueheads <- names(issuelist)
 c_issues[issueheads] <- 0
 source("issuecomp-codingsample-function.R")
 c_tweets <- tweets
 View(c_tweets)
 source("issuecomp-codingsample-function.R")
 smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
 smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)
 smartPatternMatch <- function(string, pattern, chars, acronym) {
 pattern <- str_c("\\b", pattern, "\\b")
 if(chars <= 4) {
 found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
 }
 else if(chars >= 8) {
 cat("bla")
 found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
 }
 else {
 found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
 }
 found <- convertLogical0(found)
 return(found)
 }
 smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)
 smartPatternMatch <- function(string, pattern, chars, acronym) {
 pattern <- str_c("\\b", pattern, "\\b")
 if(chars <= 4) {
 found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
 }
 else if(chars >= 8) {
 found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
 }
 else {
 found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
 }
 found <- convertLogical0(found)
 return(found)
 }
 smartPatternMatch <- function(string, pattern, chars, acronym) {
 pattern <- str_c("\\b", pattern, "\\b")
 if(chars <= 4) {
 found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
 }
 else if(chars >= 8) {
 found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
 }
 else {
 found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
 }
 found <- convertLogical0(found)
 return(found)
 }
 smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)
 smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
 smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenracht\\b", 13, FALSE)
 smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschen-recht\\b", 13, FALSE)
 smartPatternMatch("Höflich, aber klares Statement zu Menschen-Rechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
 smartPatternMatch("Höflich, aber klares Statement zu Menschen-Rechte. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
 smartPatternMatch("Bla bla Tomate ", "\\Tomate\\b", 6, FALSE)
 smartPatternMatch("Bla bla Tomaten bla bla", "\\Tomate\\b", 6, FALSE)
 smartPatternMatch <- function(string, pattern, chars, acronym) {
 pattern <- str_c("\\b", pattern, "\\b")
 if(chars <= 4) {
 found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
 }
 else if(chars >= 8) {
 found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
 }
 else {
 found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
 }
 found <- convertLogical0(found)
 return(found)
 }
 smartPatternMatch("Bla bla Tomaten bla bla", "\\Tomate\\b", 6, FALSE)
 smartPatternMatch("Bla bla Menschen bla bla", "\\Menschen\\b", 8, FALSE)
 smartPatternMatch("Bla bla Menschen bla bla", "\\Menschen\\b", 7, FALSE)
 smartPatternMatch("Bla bla Menschen bla bla", "\\Mensch\\b", 7, FALSE)
 smartPatternMatch("Bla bla Menschen bla bla", "\\Mensch\\b", 8, FALSE)
 smartPatternMatch("Bla bla Nazis bla bla", "\\Nazis\\b", 8, FALSE)
 smartPatternMatch("Bla bla Nazis bla bla", "\\Nazis\\b", 5, FALSE)
 smartPatternMatch("Bla bla Nazis bla bla", "\\Nazi\\b", 4, FALSE)
 smartPatternMatch("Bla bla Nazi bla bla", "\\Nazis\\b", 5, FALSE)
 source("issuecomp-codingsample-function.R")
 smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
 str_detect("Der kleine Flüchtlingsjunge war", pattern = "\\bFlüchtling\\b")
 str_detect("Der kleine Flüchtlingsjunge war", pattern = "Flüchtling")
 str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")
 smartPatternMatch <- function(string, pattern, chars, acronym) {
 pattern <- str_c("\\b", pattern, "\\b")
 if(chars <= 4) { # 4 or less
 found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
 }
 else if(chars >= 8) { # 8 or more
 found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
 cat(found)
 }
 else { # 5,6,7
 found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
 }
 found <- convertLogical0(found)
 return(found)
 }
 str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")
 smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
 smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
 smartPatternMatch <- function(string, pattern, chars, acronym) {
 pattern <- str_c("\\b", pattern, "\\b")
 if(chars <= 4) { # 4 or less
 found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
 }
 else if(chars >= 8) { # 8 or more
 found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
 cat("it's",found)
 }
 else { # 5,6,7
 found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
 }
 found <- convertLogical0(found)
 return(found)
 }
 smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
 str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")
 str_detect("Der kleine Flüchtlingsjunge war", pattern = "Flüchtling")
 smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
 smartPatternMatch("Der kleine Flüchtlinge war", "\\bFlüchtling\\b", 9, FALSE)
 grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
 grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
 grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=TRUE)
 grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
 grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
 grep("Flüchtling","Der kleine Flücht-lingsjunge war", ignore.case = TRUE, fixed=FALSE)
 grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
 smartPatternMatch <- function(string, pattern, chars, acronym) {
 patternrex <- str_c("\\b", pattern, "\\b")
 if(chars <= 4) { # 4 or less
 found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
 }
 else if(chars >= 8) { # 8 or more
 found <- agrep(patternrex, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
 if(convertLogical0(found) == 0) {
 found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE)
 }
 }
 else { # 5,6,7
 found <- agrep(patternrex, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
 }
 found <- convertLogical0(found)
 return(found)
 }
 smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 9, FALSE)
 smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 9, FALSE)
 smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 7, FALSE)
 c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
 names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
 for(r in 1:nrow(c_errors)) {
 c_errcode <- as.character(c_errors$code[r])
 c_errissue <- as.character(c_errors$issue[r])
 c_errtags <- as.character(c_errors$tags[r])
 c_errtext <- as.character(c_errors$text[r])
 c_errid <- as.character(c_errors$str_id[r])
 cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
 source("issuecomp-codingsample-function2.R")
 }
 View(c_errors)
 viewMatchingTweets(date = "2014-05-10", issue = "agrar.204", id_folder)
 viewMatchingTweets(date = "2014-05-10", issue = "agrar.402", id_folder)
 viewMatchingTweets(date = "2014-01-10", issue = "agrar.402", id_folder)
 viewMatchingTweets(date = "2014-01-20", issue = "agrar.402", id_folder)
 viewMatchingTweets(date = "2014-01-10", issue = "agrar.403", id_folder)
 viewMatchingTweets(date = "2014-04-10", issue = "agrar.403", id_folder)
 viewMatchingTweets(date = "2014-05-10", issue = "agrar.403", id_folder)
 viewMatchingTweets(date = "2014-02-11", issue = "agrar.403", id_folder)
 viewMatchingTweets(date = "2014-08-01", issue = "agrar.403", id_folder)
 issuelist <- xmlToList("issues.xml")
 issuelist
 issuelist[[1]]
 xmlTreeParse(file = "issues.xml")
 View(issues)
 issuelist
 issueheads
 issuelist[[1]]
 issuelist2 <- xmlTreeParse(file = "issues.xml")
 issuelist2[[1]]
 issuelist2[[2]]
 issuelist2[[1,2]]
 issuelist2[1
 issuelist2[1]
 issuelist2$doc$file
 issuelist2$doc$version
 xmlParse("issues.xml")
 issuelist2 <- xmlParse("issues.xml")
 issuelist2[1]
 issuelist2[2]
 issuelist2
 issuelist
 issuelist$edu.606
 issuelist$edu.606[1]
 issuelist$edu.606[2]
 issuelist$edu.606[3]
 issueheads
 issuelist$macro.100
 length(issuelist$macro.100)
 length(issuelist$macro.101)
 length(issuelist$macro.103)
 length(issuelist$macro.105)
 issuelist$macro.105
 issuelist$macro.105[2]
 issueheads
 as.character(issuelist[[1]])
 as.character(issuelist[[2]])
 test <- issueheads[1]
 test
 as.character(issuelist$test)
 as.character(issuelist$macro.100)
 as.character(issuelist[test])
 as.character(issuelist[test,1])
 as.character(issuelist[1,test])
 as.character(issuelist[test])
 issuelist[test]
 issuelist[test]
 length(issuelist[test])
 length(issuelist$macro.100)
 issuelist$macro.100
 test
 issuelist[test]
 issuelist[,test]
 issuelist[,as.character(test)]
 issuelist[[test]]
 issuelist[,test]
 issuelist[test]
 issuelist[[test]]
 length(issuelist[[test]])
 issuelist[[test]]
 issuelist[[test]][1]
 as.character(issuelist[[test]][1])
 as.character(issuelist[[test]])
 issueheads
 issueheads[2]
 as.character(issuelist[[i]])
 as.character(issuelist[[1]])
 as.character(issuelist[[test]])
 i <- 1
 curissue <- issueheads[i]
 curtags <- as.character(issuelist[[curissue]])
 curfile <- str_c(id_folder,"/",curissue,".csv")
 curissue
 curtags
 curfile
 curtags[2]
 # MATCH TWEETS ------------------------------------------------------------
 id_folder <- "matched-ids"
 unlink(id_folder, recursive = TRUE)
 dir.create(id_folder)
 issues <- data.frame(date = drange)
 issuelist <- xmlToList("issues.xml")
 issueheads <- names(issuelist)
 issues[issueheads] <- 0
 tweets$issue <- ""
 tweets$tags <- ""
 for(d in 1:nrow(issues)) {
 # Go through every day
 curdate <- issues$date[d]
 cat(as.character(curdate),"\n")
 # Put all tweets from specific day in a temporary DF
 tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
 for(t in 1:nrow(tweets_curday)){
 # Select tweet's text, make it lowercase and remove hashtag indicators (#)
 curtext <- as.character(tweets_curday$text[t])
 curtext <- str_replace_all(curtext, "#", "")
 curid <- as.character(tweets_curday$id_str[t])
 # Now test each single issue (not tag!)
 for(i in 1:length(issueheads)) {
 curissue <- issueheads[i]
 curtags <- as.character(issuelist[[curissue]])
 curfile <- str_c(id_folder,"/",curissue,".csv")
 # Now test all tags of a single issue
 for(s in 1:length(curtags)) {
 curtag <- curtags[s]
 curchars <- nchar(curtag, type = "chars")
 # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
 if(curchars <= 4) {
 curacro <- checkAcronym(string = curtag, chars = curchars)
 } else {
 curacro <- FALSE
 }
 # Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
 tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
 if(tags_found == 1) {
 # Raise number of findings on this day for this issue by 1
 issues[d,curissue] <- issues[d,curissue] + 1
 # Add issue and first matched tag of tweet to tweets-DF
 oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
 tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
 oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
 tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
 # Add information to file for function viewPatternMatching
 write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
 break
 }
 else {
 #cat("Nothing found\n")
 }
 } # /for curtags
 } # /for issuelist
 } # /for tweets_curday
 } # /for drange
 smartPatternMatch(string = "er ist pädophil ", pattern = "pädophilie", chars = 10, acronym = FALSE)
 smartPatternMatch(string = "er ist pädophiler ", pattern = "pädophilie", chars = 10, acronym = FALSE)
 smartPatternMatch(string = "er ist pädophiler ", pattern = "Pädophilie", chars = 10, acronym = FALSE)
 smartPatternMatch(string = "er ist pädophiles ", pattern = "Pädophilie", chars = 10, acronym = FALSE)
 id_folder <- "matched-ids"
 unlink(id_folder, recursive = TRUE)
 dir.create(id_folder)
 issues <- data.frame(date = drange)
 issuelist <- xmlToList("issues.xml")
 issueheads <- names(issuelist)
 issues[issueheads] <- 0
 tweets$issue <- ""
 tweets$tags <- ""
 issueheads
 issuelist <- xmlToList("issues.xml")
 issuelist
 issueheads
 View(issues)
 issuelist$text
 issuelist$macro.100
 issuelist$macro.101
 issuelist$text
 issuelist$text <- NULL
 issueheads <- names(issuelist)
 issueheads
 issuelist
 issuelist$text <- ""
 issuelist
 issuelist$text <- NA
 issuelist
 issuelist$text
 issuelist$text[1]
 issuelist$text[2]
 issuelist$text[6]
 issuelist$text[10]
 issues <- data.frame(date = drange)
 issuelist <- xmlToList("issues.xml")
 issues <- data.frame(date = drange)
 issuelist <- xmlToList("issues.xml")
 issueheads <- names(issuelist)
 issues[issueheads] <- 0
 tweets$issue <- ""
 tweets$tags <- ""
 View(tweets)
@@ -33,14 +33,14 @@ smartPatternMatch <- function(string, pattern, chars, acronym) {
    found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
  }
  else if(chars >= 8) { # 8 or more
-    found <- agrep(patternrex, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
+    found <- agrep(patternrex, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
    # Give longer words a chance by ignoring word boundaries \\b
    if(convertLogical0(found) == 0) {
      found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE)
    }
  }
  else { # 5,6,7
-    found <- agrep(patternrex, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
+    found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
  }
  found <- convertLogical0(found)
  return(found)
@@ -1,5 +1,4 @@
 <issuelist>
 # Makroökonomie
 <macro.100>
 	<tag>Wirtschaft</tag>
 	<tag>Wirtschaftswachstum</tag>
@@ -38,7 +37,7 @@
 	<tag>Solidaritätszuschlag</tag>
 	<tag>Erbschaftssteuer</tag>
 	<tag>Erbsteuer</tag>
-KFZ-Steuer
+    <tag>KFZ-Steuer</tag>
 	<tag>Alkoholsteuer</tag>
 	<tag>Steueroase</tag>
 </macro.107>
@@ -73,7 +72,7 @@ KFZ-Steuer
 </civil.201>
 <civil.206>
 	<tag>Wahlrecht</tag>
-NPD-Verbot
+    <tag>NPD-Verbot</tag>
 </civil.206>
 <civil.207>
 	<tag>Versammlungsfreiheit</tag>
@@ -439,4 +438,26 @@ NPD-Verbot
 	<tag>Rechtschreibreform</tag>
 	<tag>neue deutsche Rechtschreibung</tag>
 </edu.699>
 <sens.nsa>
    <tag>NSA</tag>
    <tag>Überwachung</tag>
    <tag>Snowden</tag>
    <tag>GCHQ</tag>
    <tag>BND</tag>
 </sens.nsa>
 <sens.is>
    <tag>Irak</tag>
    <tag>ISIS</tag>
    <tag>IS</tag>
    <tag>Kalifat</tag>
 </sens.is>
 <sens.ebola>
    <tag>ebola</tag>
 </sens.ebola>
 <sens.edathy>
    <tag>Edathy</tag>
    <tag>Kinderpornografie</tag>
    <tag>Kinderporno</tag>
    <tag>Pädophilie</tag>
 </sens.edathy>
 </issuelist>