From e9c5fc7d8dfe5c0d6566a50c10ef74a0dffa308e Mon Sep 17 00:00:00 2001 From: mxmehl Date: Sun, 18 Jan 2015 23:35:47 +0100 Subject: [PATCH] before changing fuzzy matching because it sucks --- .Rhistory | 756 +++++++++++++++++++++--------------------- issuecomp-functions.R | 4 +- issues.xml | 27 +- 3 files changed, 404 insertions(+), 383 deletions(-) diff --git a/.Rhistory b/.Rhistory index 7d95a31..f6bbcbd 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,381 +1,3 @@ -View(c_errors) -for(r in 1:nrow(c_errors)) { -c_errcode <- as.character(c_errors$code[r]) -c_errissue <- as.character(c_errors$issue[r]) -c_errtags <- as.character(c_errors$tags[r]) -c_errtext <- as.character(c_errors$text[r]) -c_errid <- as.character(c_errors$str_id[r]) -cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="") -source("issuecomp-codingsample-function2.R") -} -for(r in 1:nrow(c_errors)) { -c_errcode <- as.character(c_errors$code[r]) -c_errissue <- as.character(c_errors$issue[r]) -c_errtags <- as.character(c_errors$tags[r]) -c_errtext <- as.character(c_errors$text[r]) -c_errid <- as.character(c_errors$str_id[r]) -cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="") -source("issuecomp-codingsample-function2.R") -} -c_curissue -c_curtags -c_errissue -c_errtags -c_errid -delrow <- NULL -for(r in 1:nrow(tweets)) { -if(format(tweets$created_at[r], "%Y") != "2014") { -delrow <- c(delrow, r) -} -curtext <- as.character(tweets$text[r]) -curtext <- str_replace_all(curtext, "$", " ") -curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ") -} -r -require(stringr) -View(tweets) -df <- data.frame(x= c("zeile1","zeile2")) -View(df) -df$x[1] <- "blabla" -View(df) -df <- data.frame(x= c("zeile1","zeile2")) -test <- "bla bla" -df$x[1] <- test -View(df) -df$x[1] <- as.character(test) -class(df$x) -df$x[1] <- as.factor(test) -head(tweet) -head(tweets) -df <- head(tweets) -View(df) -df$text[1] <- "test" -View(tweets) -View(df) -for(r in 1:nrow(tweets)) { -# if(format(tweets$created_at[r], "%Y") != "2014") { -# delrow <- c(delrow, r) -# } -curtext <- as.character(tweets$text[r]) -curtext <- str_replace_all(curtext, "$", " ") -curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ") -tweets$text[r] <- curtext -} -View(tweets) -View(c_tweets) -rm(delrow, r) -save(tweets, file="tweets_untagged.RData") -row.names(tweets) <- NULL -write.csv(tweets, "tweets.csv") -save(tweets, file="tweets.RData") -c_tweets <- read.csv("tweets.csv") -c_tweets$X <- NULL -View(c_tweets) -viewMatchingTweets -c_tweets <- read.csv("tweets.csv", colClasses="character") -c_tweets$X <- NULL -View(c_tweets) -View(c_issues) -c_errtags -for(r in 1:nrow(c_errors)) { -c_errcode <- as.character(c_errors$code[r]) -c_errissue <- as.character(c_errors$issue[r]) -c_errtags <- as.character(c_errors$tags[r]) -c_errtext <- as.character(c_errors$text[r]) -c_errid <- as.character(c_errors$str_id[r]) -cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="") -source("issuecomp-codingsample-function2.R") -} -curissue -c_curissue -str_join(c_curissue) -str_join(c_curissue,collapse = NULL) -str_join(c_curissue,sep=";",collapse = NULL) -paste(c_curissue,sep = "") -paste(c_curissue,sep = '') -length(paste(c_curissue,sep = '')) -str_join(c_curissue,sep=";",collapse = "") -str_join(c_curissue,sep=";",collapse = "w") -str_join(c_curissue,collapse = ";") -for(r in 1:nrow(c_errors)) { -c_errcode <- as.character(c_errors$code[r]) -c_errissue <- as.character(c_errors$issue[r]) -c_errtags <- as.character(c_errors$tags[r]) -c_errtext <- as.character(c_errors$text[r]) -c_errid <- as.character(c_errors$str_id[r]) -cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="") -source("issuecomp-codingsample-function2.R") -} -require(jsonlite) -require(stringr) -require(devtools) -require(RTwitterAPI) -setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp") -source("issuecomp-functions.R") -acc_df <- read.csv("MdB-twitter.csv") -delrow <- NULL -for(r in 1:nrow(acc_df)) { -acc <- as.character(acc_df$twitter_acc[r]) -if(!nzchar(acc)) { -delrow <- c(delrow, r) -} -} -acc_df <- acc_df[-delrow, ] -rm(delrow, r, acc) -acc_df$row.names <- NULL -row.names(acc_df) <- NULL -require(lubridate) -require(XML) -require(ggplot2) -require(reshape2) -require(stringr) -source("issuecomp-functions.R") -load("tweets.RData") -View(tweets) -View(tweets) -View(tweets) -date_start <- as.Date("2014-01-01") -date_start + days -date_start + days(1) -date_start + days(0) -date_start + days(0:2) -date_start <- as.Date("2014-01-01") -date_end <- as.Date("2014-12-31") -drange <- as.integer(date_end - date_start) -drange <- date_start + days(0:drange) -curdate <- date_start + days(2) -curdate -tweets[tweets[, "created_at"] == curdate, "msg_id"] -View(tweets) -curdate -tweets[tweets[, "created_at"] == "2014-01-01", "msg_id"] -tweets[tweets[, "created_at"] == curdate, "id_str"] -drange -length(tweets[tweets[, "created_at"] == curdate, "id_str"]) -length(tweets[tweets[, "created_at"] == curdate+1, "id_str"]) -length(tweets[tweets[, "created_at"] == curdate+15, "id_str"]) -stats <- data.frame(x=NULL) -View(stats) -stats <- data.frame(date=drange) -View(stats) -stats$tpd <- NULL -stats$tpd <- "" -stats$tpd <- NULL -stats$tpd[1] <- 2 -View(stats) -stats$tpd[2] <- 3 -View(stats) -stats$tpd <- "" -stats$tpd <- NULL -stats$tpd <- "" -stats$tpd[1] <- 2 -View(tweets) -View(stats) -stats <- data.frame(date=drange) -stats$tpd <- "" -# Total number of tweets per day over time -for(r in 1:length(drange)) { -stats$tpd[r] <- length(tweets[tweets[, "created_at"] == curdate, "id_str"]) -} -View(stats) -drange[2] -stats <- data.frame(date=drange) -stats$tpd <- "" -# Total number of tweets per day over time -for(r in 1:length(drange)) { -stats$tpd[r] <- length(tweets[tweets[, "created_at"] == drange[r], "id_str"]) -} -View(stats) -plot.ts(x = stats$tpd, y=stats$date) -plot.ts(x = stats$date, y=stats$tpd) -g1 <- ggplot(stats, aes(date,tpd)) -g1 <- g1 + geom_histogram(fill="steelblue", stat="identity") -g1 <- g1 + stat_smooth(size=1,colour="red",method="loess", se=FALSE) -g1 <- gg1 + ggtitle("Zeitliche Entwicklung von plötzlichen Medienfokussen") + xlab("Einzelne Monate") + ylab("Plötzliche Medienfokusse") -g1 -g1 <- ggplot(stats, aes(date,tpd)) -g1 <- g1 + geom_histogram(fill="steelblue", stat="identity") -g1 -g1 <- g1 + stat_smooth(size=1,colour="red",method="loess", se=FALSE) -g1 -g1 <- ggplot(stats, aes(date,tpd)) -g1 <- g1 + geom_line() -g1 -g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd, color=black)) -g1 -g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd, color="black")) -g1 -g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd)) -g1 -ggplot(stats, aes(date, tpd)) + geom_line() + -scale_x_date(format = "%b-%Y") + xlab("") + ylab("Daily Views") -ggplot(stats, aes(date, tpd)) + geom_line() + -scale_x_date() + xlab("") + ylab("Daily Views") -lapply(stats, class) -stats$tpd <- 0 -lapply(stats, class) -stats <- data.frame(date=drange) -stats$tpd <- 0 -# Total number of tweets per day over time -for(r in 1:length(drange)) { -stats$tpd[r] <- length(tweets[tweets[, "created_at"] == drange[r], "id_str"]) -} -View(stats) -plot.ts(x = stats$date, y=stats$tpd) -g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd)) -g1 -ggplot(stats, aes(date, tpd)) + geom_line() + -scale_x_date() + xlab("") + ylab("Daily Views") -g1 <- ggplot() + -geom_line(data = stats, aes(x=date,y=tpd)) + -stats_smooth(size=1,colour="red",method="loess", se=FALSE) -g1 -g1 <- ggplot() + -geom_line(data = stats, aes(x=date,y=tpd)) + -stat_smooth(size=1,colour="red",method="loess", se=FALSE) -g1 -g1 <- ggplot() + -geom_line(data = stats, aes(x=date,y=tpd)) + -stat_smooth(colour="red",method="loess", se=FALSE) -g1 -g1 <- ggplot() + -geom_line(data = stats, aes(x=date,y=tpd)) + -stat_smooth(colour="red",method="lm", se=FALSE) -g1 -g1 <- ggplot() + -geom_line(data = stats, aes(x=date,y=tpd)) + -geom_smooth(colour="red",method="lm", se=FALSE) -g1 -g1 <- ggplot() + -geom_line(data = stats, aes(x=date,y=tpd)) + -geom_smooth(colour="red",method="loess", se=FALSE) -g1 -g1 <- ggplot() + -geom_line(data = stats, aes(x=date,y=tpd)) + -geom_smooth(size=1,colour="red",method="loess", se=FALSE) -g1 -g1 <- g1 + geom_smooth(size=1,colour="red", method="loess", se=FALSE) -g1 -g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd)) -g1 <- g1 + geom_smooth(size=1,colour="red", method="loess", se=FALSE) -g1 -g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd)) -g1 -g1 + geom_smooth() -g1 + geom_smqwd -g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd)) -g1 -g1 <- g1 + geom_smooth(size=1,colour="red", method="loess", se=FALSE) -g1 -g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd)) -g1 <- g1 + geom_smooth(size=1,method="loess", se=FALSE) -g1 -g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd)) -g1 <- g1 + geom_smooth(size=1,method="loess", se=FALSE, aes(group=1)) -g1 -g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd)) + -geom_smooth(size=1,method="loess", se=FALSE, aes(group=1)) -g1 -g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd)) + -geom_smooth(size=1,method="loess", se=FALSE, aes(x = date, y=tpd)) -g1 -geom_smooth(size=1,method="loess", se=FALSE) -g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd)) + -geom_smooth(size=1,method="loess", se=FALSE) -g1 -plot.ts(x = stats$date, y=stats$tpd) -g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd, color=group)) + -geom_smooth(size=1,method="loess", se=FALSE) -g1 -install.packages(c("BH", "bibtex", "devtools", "dplyr", "httr", "jsonlite", "lazyeval", "manipulate", "RCurl", "ROAuth", "rstudioapi", "sp", "stringi")) -g1 <- ggplot(data = stats, aes(x=date,y=tpd, color=variable, group=variable)) + -geom_line() + -geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE) -g1 -stats_melt <- melt(stats, id="date") -View(stats_melt) -View(stats_melt) -stats_melt <- melt(stats, id="date") -g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color=variable, group=variable)) + -geom_line() + -geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE) -g1 -g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color=1, group=variable)) + -geom_line() + -geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE) -g1 -g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color=1, group=variable)) + -geom_line() + -geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, colors="red") -g1 -g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color=1, group=variable)) + -geom_line() + -geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red") -g1 -g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color="black", group=variable)) + -geom_line() + -geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red") -g1 -g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color="yellow", group=variable)) + -geom_line() + -geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red") -g1 -g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour="black", group=variable)) + -geom_line() + -geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red") -g1 -g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour="#FFFFFF", group=variable)) + -geom_line() + -geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red") -g1 -g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour="vqwdqw", group=variable)) + -geom_line() + -geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red") -g1 -g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour=variable, group=variable)) + -geom_line() + -geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red") -g1 -g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour=1, group=variable)) + -geom_line() + -geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red") -g1 -g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour=variable, group=variable)) + -geom_line() + -geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color=1) -g1 -require(lubridate) -require(XML) -require(ggplot2) -require(reshape2) -require(stringr) -source("issuecomp-functions.R") -rm(curdate) -rm(date_end, date_start) -rm(g1, r, ) -rm(g1, r) -id_folder <- "matched-ids" -unlink(id_folder, recursive = TRUE) -dir.create(id_folder) -issues <- data.frame(date = drange) -View(issues) -issuelist <- xmlToList("issues.xml") -issuelist -issueheads <- names(issuelist) -issueheads -issues[issueheads] <- 0 -tweets$issue <- "" -tweets$tags <- "" -View(issues) -# MATCH TWEETS ------------------------------------------------------------ -id_folder <- "matched-ids" -unlink(id_folder, recursive = TRUE) -dir.create(id_folder) -issues <- data.frame(date = drange) -issuelist <- xmlToList("issues.xml") -issueheads <- names(issuelist) -issues[issueheads] <- 0 -tweets$issue <- "" tweets$tags <- "" for(d in 1:nrow(issues)) { # Go through every day @@ -510,3 +132,381 @@ return(status) } require(stringr) require(XML) +require(stringr) +require(XML) +# FUNCTIONS --------------------------------------------------------------- +readYN <- function(question) { +n <- readline(prompt=question) +n <- as.character(n) +return(n) +} +checkIssue <- function(string, issuelist) { +status <- any(str_detect(string, issuelist)) +return(status) +} +checkAllIssues <- function(string, issuelist) { +status <- NULL +for(i in 1:length(string)) { +if(checkIssue(string[i], issuelist)) { +status[i] <- TRUE +} +else { +cat("Issue",string[i],"does not exist. Please try again.\n") +status[i] <- FALSE +} +} +return(status) +} +c_issues <- data.frame(date = drange) +c_issuelist <- xmlToList("issues.xml") +c_issueheads <- names(issuelist) +c_issues[issueheads] <- 0 +source("issuecomp-codingsample-function.R") +c_tweets <- tweets +View(c_tweets) +source("issuecomp-codingsample-function.R") +smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE) +smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE) +smartPatternMatch <- function(string, pattern, chars, acronym) { +pattern <- str_c("\\b", pattern, "\\b") +if(chars <= 4) { +found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE) +} +else if(chars >= 8) { +cat("bla") +found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE) +} +else { +found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) +} +found <- convertLogical0(found) +return(found) +} +smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE) +smartPatternMatch <- function(string, pattern, chars, acronym) { +pattern <- str_c("\\b", pattern, "\\b") +if(chars <= 4) { +found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE) +} +else if(chars >= 8) { +found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE) +} +else { +found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) +} +found <- convertLogical0(found) +return(found) +} +smartPatternMatch <- function(string, pattern, chars, acronym) { +pattern <- str_c("\\b", pattern, "\\b") +if(chars <= 4) { +found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE) +} +else if(chars >= 8) { +found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE) +} +else { +found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) +} +found <- convertLogical0(found) +return(found) +} +smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE) +smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE) +smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenracht\\b", 13, FALSE) +smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschen-recht\\b", 13, FALSE) +smartPatternMatch("Höflich, aber klares Statement zu Menschen-Rechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE) +smartPatternMatch("Höflich, aber klares Statement zu Menschen-Rechte. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE) +smartPatternMatch("Bla bla Tomate ", "\\Tomate\\b", 6, FALSE) +smartPatternMatch("Bla bla Tomaten bla bla", "\\Tomate\\b", 6, FALSE) +smartPatternMatch <- function(string, pattern, chars, acronym) { +pattern <- str_c("\\b", pattern, "\\b") +if(chars <= 4) { +found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE) +} +else if(chars >= 8) { +found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE) +} +else { +found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE) +} +found <- convertLogical0(found) +return(found) +} +smartPatternMatch("Bla bla Tomaten bla bla", "\\Tomate\\b", 6, FALSE) +smartPatternMatch("Bla bla Menschen bla bla", "\\Menschen\\b", 8, FALSE) +smartPatternMatch("Bla bla Menschen bla bla", "\\Menschen\\b", 7, FALSE) +smartPatternMatch("Bla bla Menschen bla bla", "\\Mensch\\b", 7, FALSE) +smartPatternMatch("Bla bla Menschen bla bla", "\\Mensch\\b", 8, FALSE) +smartPatternMatch("Bla bla Nazis bla bla", "\\Nazis\\b", 8, FALSE) +smartPatternMatch("Bla bla Nazis bla bla", "\\Nazis\\b", 5, FALSE) +smartPatternMatch("Bla bla Nazis bla bla", "\\Nazi\\b", 4, FALSE) +smartPatternMatch("Bla bla Nazi bla bla", "\\Nazis\\b", 5, FALSE) +source("issuecomp-codingsample-function.R") +smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE) +str_detect("Der kleine Flüchtlingsjunge war", pattern = "\\bFlüchtling\\b") +str_detect("Der kleine Flüchtlingsjunge war", pattern = "Flüchtling") +str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling") +smartPatternMatch <- function(string, pattern, chars, acronym) { +pattern <- str_c("\\b", pattern, "\\b") +if(chars <= 4) { # 4 or less +found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE) +} +else if(chars >= 8) { # 8 or more +found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE) +cat(found) +} +else { # 5,6,7 +found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE) +} +found <- convertLogical0(found) +return(found) +} +str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling") +smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE) +smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE) +smartPatternMatch <- function(string, pattern, chars, acronym) { +pattern <- str_c("\\b", pattern, "\\b") +if(chars <= 4) { # 4 or less +found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE) +} +else if(chars >= 8) { # 8 or more +found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE) +cat("it's",found) +} +else { # 5,6,7 +found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE) +} +found <- convertLogical0(found) +return(found) +} +smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE) +str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling") +str_detect("Der kleine Flüchtlingsjunge war", pattern = "Flüchtling") +smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE) +smartPatternMatch("Der kleine Flüchtlinge war", "\\bFlüchtling\\b", 9, FALSE) +grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE) +grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE) +grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=TRUE) +grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE) +grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE) +grep("Flüchtling","Der kleine Flücht-lingsjunge war", ignore.case = TRUE, fixed=FALSE) +grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE) +smartPatternMatch <- function(string, pattern, chars, acronym) { +patternrex <- str_c("\\b", pattern, "\\b") +if(chars <= 4) { # 4 or less +found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE) +} +else if(chars >= 8) { # 8 or more +found <- agrep(patternrex, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE) +if(convertLogical0(found) == 0) { +found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE) +} +} +else { # 5,6,7 +found <- agrep(patternrex, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE) +} +found <- convertLogical0(found) +return(found) +} +smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 9, FALSE) +smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 9, FALSE) +smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 7, FALSE) +c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character") +names(c_errors) <- c("str_id", "code", "issue", "tags", "text") +for(r in 1:nrow(c_errors)) { +c_errcode <- as.character(c_errors$code[r]) +c_errissue <- as.character(c_errors$issue[r]) +c_errtags <- as.character(c_errors$tags[r]) +c_errtext <- as.character(c_errors$text[r]) +c_errid <- as.character(c_errors$str_id[r]) +cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="") +source("issuecomp-codingsample-function2.R") +} +View(c_errors) +viewMatchingTweets(date = "2014-05-10", issue = "agrar.204", id_folder) +viewMatchingTweets(date = "2014-05-10", issue = "agrar.402", id_folder) +viewMatchingTweets(date = "2014-01-10", issue = "agrar.402", id_folder) +viewMatchingTweets(date = "2014-01-20", issue = "agrar.402", id_folder) +viewMatchingTweets(date = "2014-01-10", issue = "agrar.403", id_folder) +viewMatchingTweets(date = "2014-04-10", issue = "agrar.403", id_folder) +viewMatchingTweets(date = "2014-05-10", issue = "agrar.403", id_folder) +viewMatchingTweets(date = "2014-02-11", issue = "agrar.403", id_folder) +viewMatchingTweets(date = "2014-08-01", issue = "agrar.403", id_folder) +issuelist <- xmlToList("issues.xml") +issuelist +issuelist[[1]] +xmlTreeParse(file = "issues.xml") +View(issues) +issuelist +issueheads +issuelist[[1]] +issuelist2 <- xmlTreeParse(file = "issues.xml") +issuelist2[[1]] +issuelist2[[2]] +issuelist2[[1,2]] +issuelist2[1 +issuelist2[1] +issuelist2$doc$file +issuelist2$doc$version +xmlParse("issues.xml") +issuelist2 <- xmlParse("issues.xml") +issuelist2[1] +issuelist2[2] +issuelist2 +issuelist +issuelist$edu.606 +issuelist$edu.606[1] +issuelist$edu.606[2] +issuelist$edu.606[3] +issueheads +issuelist$macro.100 +length(issuelist$macro.100) +length(issuelist$macro.101) +length(issuelist$macro.103) +length(issuelist$macro.105) +issuelist$macro.105 +issuelist$macro.105[2] +issueheads +as.character(issuelist[[1]]) +as.character(issuelist[[2]]) +test <- issueheads[1] +test +as.character(issuelist$test) +as.character(issuelist$macro.100) +as.character(issuelist[test]) +as.character(issuelist[test,1]) +as.character(issuelist[1,test]) +as.character(issuelist[test]) +issuelist[test] +issuelist[test] +length(issuelist[test]) +length(issuelist$macro.100) +issuelist$macro.100 +test +issuelist[test] +issuelist[,test] +issuelist[,as.character(test)] +issuelist[[test]] +issuelist[,test] +issuelist[test] +issuelist[[test]] +length(issuelist[[test]]) +issuelist[[test]] +issuelist[[test]][1] +as.character(issuelist[[test]][1]) +as.character(issuelist[[test]]) +issueheads +issueheads[2] +as.character(issuelist[[i]]) +as.character(issuelist[[1]]) +as.character(issuelist[[test]]) +i <- 1 +curissue <- issueheads[i] +curtags <- as.character(issuelist[[curissue]]) +curfile <- str_c(id_folder,"/",curissue,".csv") +curissue +curtags +curfile +curtags[2] +# MATCH TWEETS ------------------------------------------------------------ +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +tweets$issue <- "" +tweets$tags <- "" +for(d in 1:nrow(issues)) { +# Go through every day +curdate <- issues$date[d] +cat(as.character(curdate),"\n") +# Put all tweets from specific day in a temporary DF +tweets_curday <- tweets[tweets[, "created_at"] == curdate, ] +for(t in 1:nrow(tweets_curday)){ +# Select tweet's text, make it lowercase and remove hashtag indicators (#) +curtext <- as.character(tweets_curday$text[t]) +curtext <- str_replace_all(curtext, "#", "") +curid <- as.character(tweets_curday$id_str[t]) +# Now test each single issue (not tag!) +for(i in 1:length(issueheads)) { +curissue <- issueheads[i] +curtags <- as.character(issuelist[[curissue]]) +curfile <- str_c(id_folder,"/",curissue,".csv") +# Now test all tags of a single issue +for(s in 1:length(curtags)) { +curtag <- curtags[s] +curchars <- nchar(curtag, type = "chars") +# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch +if(curchars <= 4) { +curacro <- checkAcronym(string = curtag, chars = curchars) +} else { +curacro <- FALSE +} +# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance) +tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro) +if(tags_found == 1) { +# Raise number of findings on this day for this issue by 1 +issues[d,curissue] <- issues[d,curissue] + 1 +# Add issue and first matched tag of tweet to tweets-DF +oldissue <- tweets[tweets[, "id_str"] == curid, "issue"] +tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";") +oldtag <- tweets[tweets[, "id_str"] == curid, "tags"] +tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";") +# Add information to file for function viewPatternMatching +write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE) +break +} +else { +#cat("Nothing found\n") +} +} # /for curtags +} # /for issuelist +} # /for tweets_curday +} # /for drange +smartPatternMatch(string = "er ist pädophil ", pattern = "pädophilie", chars = 10, acronym = FALSE) +smartPatternMatch(string = "er ist pädophiler ", pattern = "pädophilie", chars = 10, acronym = FALSE) +smartPatternMatch(string = "er ist pädophiler ", pattern = "Pädophilie", chars = 10, acronym = FALSE) +smartPatternMatch(string = "er ist pädophiles ", pattern = "Pädophilie", chars = 10, acronym = FALSE) +id_folder <- "matched-ids" +unlink(id_folder, recursive = TRUE) +dir.create(id_folder) +issues <- data.frame(date = drange) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +tweets$issue <- "" +tweets$tags <- "" +issueheads +issuelist <- xmlToList("issues.xml") +issuelist +issueheads +View(issues) +issuelist$text +issuelist$macro.100 +issuelist$macro.101 +issuelist$text +issuelist$text <- NULL +issueheads <- names(issuelist) +issueheads +issuelist +issuelist$text <- "" +issuelist +issuelist$text <- NA +issuelist +issuelist$text +issuelist$text[1] +issuelist$text[2] +issuelist$text[6] +issuelist$text[10] +issues <- data.frame(date = drange) +issuelist <- xmlToList("issues.xml") +issues <- data.frame(date = drange) +issuelist <- xmlToList("issues.xml") +issueheads <- names(issuelist) +issues[issueheads] <- 0 +tweets$issue <- "" +tweets$tags <- "" +View(tweets) diff --git a/issuecomp-functions.R b/issuecomp-functions.R index a2d60dc..26be239 100644 --- a/issuecomp-functions.R +++ b/issuecomp-functions.R @@ -33,14 +33,14 @@ smartPatternMatch <- function(string, pattern, chars, acronym) { found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE) } else if(chars >= 8) { # 8 or more - found <- agrep(patternrex, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE) + found <- agrep(patternrex, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE) # Give longer words a chance by ignoring word boundaries \\b if(convertLogical0(found) == 0) { found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE) } } else { # 5,6,7 - found <- agrep(patternrex, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE) + found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE) } found <- convertLogical0(found) return(found) diff --git a/issues.xml b/issues.xml index aaaae8a..1fa1ee4 100644 --- a/issues.xml +++ b/issues.xml @@ -1,5 +1,4 @@ -# Makroökonomie Wirtschaft Wirtschaftswachstum @@ -38,7 +37,7 @@ Solidaritätszuschlag Erbschaftssteuer Erbsteuer -KFZ-Steuer + KFZ-Steuer Alkoholsteuer Steueroase @@ -73,7 +72,7 @@ KFZ-Steuer Wahlrecht -NPD-Verbot + NPD-Verbot Versammlungsfreiheit @@ -439,4 +438,26 @@ NPD-Verbot Rechtschreibreform neue deutsche Rechtschreibung + + NSA + Überwachung + Snowden + GCHQ + BND + + + Irak + ISIS + IS + Kalifat + + + ebola + + + Edathy + Kinderpornografie + Kinderporno + Pädophilie +