Checkpoint before reworking the fuzzy matching, whose current behaviour is unsatisfactory

This commit is contained in:
2015-01-18 23:35:47 +01:00
parent 4693f8571e
commit e9c5fc7d8d
3 changed files with 404 additions and 383 deletions
+378 -378
View File
@@ -1,381 +1,3 @@
View(c_errors)
for(r in 1:nrow(c_errors)) {
c_errcode <- as.character(c_errors$code[r])
c_errissue <- as.character(c_errors$issue[r])
c_errtags <- as.character(c_errors$tags[r])
c_errtext <- as.character(c_errors$text[r])
c_errid <- as.character(c_errors$str_id[r])
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
source("issuecomp-codingsample-function2.R")
}
for(r in 1:nrow(c_errors)) {
c_errcode <- as.character(c_errors$code[r])
c_errissue <- as.character(c_errors$issue[r])
c_errtags <- as.character(c_errors$tags[r])
c_errtext <- as.character(c_errors$text[r])
c_errid <- as.character(c_errors$str_id[r])
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
source("issuecomp-codingsample-function2.R")
}
c_curissue
c_curtags
c_errissue
c_errtags
c_errid
delrow <- NULL
# Flag non-2014 tweets for deletion and normalise each tweet's text.
# seq_len() instead of 1:nrow() so an empty data frame iterates zero times.
for(r in seq_len(nrow(tweets))) {
if(format(tweets$created_at[r], "%Y") != "2014") {
delrow <- c(delrow, r)
}
curtext <- as.character(tweets$text[r])
# Append a trailing space ("$" anchors end-of-string) so a URL at the end
# of the tweet is still followed by whitespace for the next pattern
curtext <- str_replace_all(curtext, "$", " ")
# Collapse links to the token "URL "
curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
# Bug fix: the cleaned text was computed but discarded; store it back
tweets$text[r] <- curtext
}
r
require(stringr)
View(tweets)
df <- data.frame(x= c("zeile1","zeile2"))
View(df)
df$x[1] <- "blabla"
View(df)
df <- data.frame(x= c("zeile1","zeile2"))
test <- "bla bla"
df$x[1] <- test
View(df)
df$x[1] <- as.character(test)
class(df$x)
df$x[1] <- as.factor(test)
head(tweet)
head(tweets)
df <- head(tweets)
View(df)
df$text[1] <- "test"
View(tweets)
View(df)
for(r in 1:nrow(tweets)) {
# if(format(tweets$created_at[r], "%Y") != "2014") {
# delrow <- c(delrow, r)
# }
curtext <- as.character(tweets$text[r])
curtext <- str_replace_all(curtext, "$", " ")
curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
tweets$text[r] <- curtext
}
View(tweets)
View(c_tweets)
rm(delrow, r)
save(tweets, file="tweets_untagged.RData")
row.names(tweets) <- NULL
write.csv(tweets, "tweets.csv")
save(tweets, file="tweets.RData")
c_tweets <- read.csv("tweets.csv")
c_tweets$X <- NULL
View(c_tweets)
viewMatchingTweets
c_tweets <- read.csv("tweets.csv", colClasses="character")
c_tweets$X <- NULL
View(c_tweets)
View(c_issues)
c_errtags
for(r in 1:nrow(c_errors)) {
c_errcode <- as.character(c_errors$code[r])
c_errissue <- as.character(c_errors$issue[r])
c_errtags <- as.character(c_errors$tags[r])
c_errtext <- as.character(c_errors$text[r])
c_errid <- as.character(c_errors$str_id[r])
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
source("issuecomp-codingsample-function2.R")
}
curissue
c_curissue
str_join(c_curissue)
str_join(c_curissue,collapse = NULL)
str_join(c_curissue,sep=";",collapse = NULL)
paste(c_curissue,sep = "")
paste(c_curissue,sep = '')
length(paste(c_curissue,sep = ''))
str_join(c_curissue,sep=";",collapse = "")
str_join(c_curissue,sep=";",collapse = "w")
str_join(c_curissue,collapse = ";")
for(r in 1:nrow(c_errors)) {
c_errcode <- as.character(c_errors$code[r])
c_errissue <- as.character(c_errors$issue[r])
c_errtags <- as.character(c_errors$tags[r])
c_errtext <- as.character(c_errors$text[r])
c_errid <- as.character(c_errors$str_id[r])
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
source("issuecomp-codingsample-function2.R")
}
require(jsonlite)
require(stringr)
require(devtools)
require(RTwitterAPI)
setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
source("issuecomp-functions.R")
acc_df <- read.csv("MdB-twitter.csv")
# Collect row indices of MdB entries that have no Twitter account
delrow <- NULL
for(r in seq_len(nrow(acc_df))) {
acc <- as.character(acc_df$twitter_acc[r])
if(!nzchar(acc)) {
delrow <- c(delrow, r)
}
}
# Bug fix: guard the deletion. When delrow is NULL (every row has an
# account), -NULL is integer(0) and acc_df[integer(0), ] selects ZERO
# rows, silently wiping the whole data frame.
if(length(delrow) > 0) {
acc_df <- acc_df[-delrow, ]
}
rm(delrow, r, acc)
acc_df$row.names <- NULL
row.names(acc_df) <- NULL
require(lubridate)
require(XML)
require(ggplot2)
require(reshape2)
require(stringr)
source("issuecomp-functions.R")
load("tweets.RData")
View(tweets)
View(tweets)
View(tweets)
date_start <- as.Date("2014-01-01")
date_start + days
date_start + days(1)
date_start + days(0)
date_start + days(0:2)
date_start <- as.Date("2014-01-01")
date_end <- as.Date("2014-12-31")
drange <- as.integer(date_end - date_start)
drange <- date_start + days(0:drange)
curdate <- date_start + days(2)
curdate
tweets[tweets[, "created_at"] == curdate, "msg_id"]
View(tweets)
curdate
tweets[tweets[, "created_at"] == "2014-01-01", "msg_id"]
tweets[tweets[, "created_at"] == curdate, "id_str"]
drange
length(tweets[tweets[, "created_at"] == curdate, "id_str"])
length(tweets[tweets[, "created_at"] == curdate+1, "id_str"])
length(tweets[tweets[, "created_at"] == curdate+15, "id_str"])
stats <- data.frame(x=NULL)
View(stats)
stats <- data.frame(date=drange)
View(stats)
stats$tpd <- NULL
stats$tpd <- ""
stats$tpd <- NULL
stats$tpd[1] <- 2
View(stats)
stats$tpd[2] <- 3
View(stats)
stats$tpd <- ""
stats$tpd <- NULL
stats$tpd <- ""
stats$tpd[1] <- 2
View(tweets)
View(stats)
stats <- data.frame(date=drange)
# Numeric 0 rather than "": a character column breaks plotting/arithmetic
# (the history itself later switches to 0 for exactly this reason)
stats$tpd <- 0
# Total number of tweets per day over time
for(r in seq_along(drange)) {
# Bug fix: count tweets of the day being iterated (drange[r]); the
# original compared against the loop-invariant `curdate`, filling every
# day with the same count
stats$tpd[r] <- length(tweets[tweets[, "created_at"] == drange[r], "id_str"])
}
View(stats)
drange[2]
stats <- data.frame(date=drange)
stats$tpd <- ""
# Total number of tweets per day over time
for(r in 1:length(drange)) {
stats$tpd[r] <- length(tweets[tweets[, "created_at"] == drange[r], "id_str"])
}
View(stats)
plot.ts(x = stats$tpd, y=stats$date)
plot.ts(x = stats$date, y=stats$tpd)
g1 <- ggplot(stats, aes(date,tpd))
g1 <- g1 + geom_histogram(fill="steelblue", stat="identity")
g1 <- g1 + stat_smooth(size=1,colour="red",method="loess", se=FALSE)
g1 <- gg1 + ggtitle("Zeitliche Entwicklung von plötzlichen Medienfokussen") + xlab("Einzelne Monate") + ylab("Plötzliche Medienfokusse")
g1
g1 <- ggplot(stats, aes(date,tpd))
g1 <- g1 + geom_histogram(fill="steelblue", stat="identity")
g1
g1 <- g1 + stat_smooth(size=1,colour="red",method="loess", se=FALSE)
g1
g1 <- ggplot(stats, aes(date,tpd))
g1 <- g1 + geom_line()
g1
g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd, color=black))
g1
g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd, color="black"))
g1
g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd))
g1
ggplot(stats, aes(date, tpd)) + geom_line() +
scale_x_date(format = "%b-%Y") + xlab("") + ylab("Daily Views")
ggplot(stats, aes(date, tpd)) + geom_line() +
scale_x_date() + xlab("") + ylab("Daily Views")
lapply(stats, class)
stats$tpd <- 0
lapply(stats, class)
stats <- data.frame(date=drange)
stats$tpd <- 0
# Total number of tweets per day over time
for(r in 1:length(drange)) {
stats$tpd[r] <- length(tweets[tweets[, "created_at"] == drange[r], "id_str"])
}
View(stats)
plot.ts(x = stats$date, y=stats$tpd)
g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd))
g1
ggplot(stats, aes(date, tpd)) + geom_line() +
scale_x_date() + xlab("") + ylab("Daily Views")
g1 <- ggplot() +
geom_line(data = stats, aes(x=date,y=tpd)) +
stats_smooth(size=1,colour="red",method="loess", se=FALSE)
g1
g1 <- ggplot() +
geom_line(data = stats, aes(x=date,y=tpd)) +
stat_smooth(size=1,colour="red",method="loess", se=FALSE)
g1
g1 <- ggplot() +
geom_line(data = stats, aes(x=date,y=tpd)) +
stat_smooth(colour="red",method="loess", se=FALSE)
g1
g1 <- ggplot() +
geom_line(data = stats, aes(x=date,y=tpd)) +
stat_smooth(colour="red",method="lm", se=FALSE)
g1
g1 <- ggplot() +
geom_line(data = stats, aes(x=date,y=tpd)) +
geom_smooth(colour="red",method="lm", se=FALSE)
g1
g1 <- ggplot() +
geom_line(data = stats, aes(x=date,y=tpd)) +
geom_smooth(colour="red",method="loess", se=FALSE)
g1
g1 <- ggplot() +
geom_line(data = stats, aes(x=date,y=tpd)) +
geom_smooth(size=1,colour="red",method="loess", se=FALSE)
g1
g1 <- g1 + geom_smooth(size=1,colour="red", method="loess", se=FALSE)
g1
g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd))
g1 <- g1 + geom_smooth(size=1,colour="red", method="loess", se=FALSE)
g1
g1 <- ggplot() + geom_line(data = stats, aes(x=date,y=tpd))
g1
g1 + geom_smooth()
g1 + geom_smqwd
g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd))
g1
g1 <- g1 + geom_smooth(size=1,colour="red", method="loess", se=FALSE)
g1
g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd))
g1 <- g1 + geom_smooth(size=1,method="loess", se=FALSE)
g1
g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd))
g1 <- g1 + geom_smooth(size=1,method="loess", se=FALSE, aes(group=1))
g1
g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd)) +
geom_smooth(size=1,method="loess", se=FALSE, aes(group=1))
g1
g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd)) +
geom_smooth(size=1,method="loess", se=FALSE, aes(x = date, y=tpd))
g1
geom_smooth(size=1,method="loess", se=FALSE)
g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd)) +
geom_smooth(size=1,method="loess", se=FALSE)
g1
plot.ts(x = stats$date, y=stats$tpd)
g1 <- ggplot() + geom_point(data = stats, aes(x=date,y=tpd, color=group)) +
geom_smooth(size=1,method="loess", se=FALSE)
g1
install.packages(c("BH", "bibtex", "devtools", "dplyr", "httr", "jsonlite", "lazyeval", "manipulate", "RCurl", "ROAuth", "rstudioapi", "sp", "stringi"))
g1 <- ggplot(data = stats, aes(x=date,y=tpd, color=variable, group=variable)) +
geom_line() +
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE)
g1
stats_melt <- melt(stats, id="date")
View(stats_melt)
View(stats_melt)
stats_melt <- melt(stats, id="date")
g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color=variable, group=variable)) +
geom_line() +
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE)
g1
g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color=1, group=variable)) +
geom_line() +
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE)
g1
g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color=1, group=variable)) +
geom_line() +
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, colors="red")
g1
g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color=1, group=variable)) +
geom_line() +
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
g1
g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color="black", group=variable)) +
geom_line() +
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
g1
g1 <- ggplot(data = stats_melt, aes(x=date,y=value,color="yellow", group=variable)) +
geom_line() +
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
g1
g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour="black", group=variable)) +
geom_line() +
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
g1
g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour="#FFFFFF", group=variable)) +
geom_line() +
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
g1
g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour="vqwdqw", group=variable)) +
geom_line() +
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
g1
g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour=variable, group=variable)) +
geom_line() +
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
g1
g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour=1, group=variable)) +
geom_line() +
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color="red")
g1
g1 <- ggplot(data = stats_melt, aes(x=date,y=value,colour=variable, group=variable)) +
geom_line() +
geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color=1)
g1
require(lubridate)
require(XML)
require(ggplot2)
require(reshape2)
require(stringr)
source("issuecomp-functions.R")
rm(curdate)
rm(date_end, date_start)
rm(g1, r, )
rm(g1, r)
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
View(issues)
issuelist <- xmlToList("issues.xml")
issuelist
issueheads <- names(issuelist)
issueheads
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""
View(issues)
# MATCH TWEETS ------------------------------------------------------------
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""
for(d in 1:nrow(issues)) {
# Go through every day
@@ -510,3 +132,381 @@ return(status)
}
require(stringr)
require(XML)
require(stringr)
require(XML)
# FUNCTIONS ---------------------------------------------------------------
readYN <- function(question) {
# Prompt the user with `question` and return the typed reply as character.
as.character(readline(prompt = question))
}
checkIssue <- function(string, issuelist) {
# TRUE if `string` matches at least one pattern in `issuelist`.
# str_detect() is vectorised over the patterns, so every issue pattern
# is tested against the single input string.
any(str_detect(string, issuelist))
}
checkAllIssues <- function(string, issuelist) {
# Validate each issue name in `string` against the known issue patterns.
# Returns a logical vector parallel to `string` (TRUE = issue exists);
# prints a hint for every unknown issue.
# Fixes: seq_along() instead of 1:length() so an empty input yields an
# empty result instead of iterating over c(1, 0); the result vector is
# preallocated instead of grown element by element.
status <- logical(length(string))
for(i in seq_along(string)) {
if(checkIssue(string[i], issuelist)) {
status[i] <- TRUE
} else {
cat("Issue",string[i],"does not exist. Please try again.\n")
status[i] <- FALSE
}
}
return(status)
}
c_issues <- data.frame(date = drange)
c_issuelist <- xmlToList("issues.xml")
c_issueheads <- names(issuelist)
c_issues[issueheads] <- 0
source("issuecomp-codingsample-function.R")
c_tweets <- tweets
View(c_tweets)
source("issuecomp-codingsample-function.R")
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)
# Fuzzy-match a tag `pattern` of length `chars` against `string`.
# The allowed Levenshtein distance grows with tag length; acronyms are
# matched case-sensitively (ignore.case = !acronym).
smartPatternMatch <- function(string, pattern, chars, acronym) {
pattern <- str_c("\\b", pattern, "\\b")
if(chars <= 4) {
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) {
# Bug fix: removed leftover debug cat("bla") that printed on every
# long-tag comparison
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
}
else {
found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
}
# convertLogical0 (defined elsewhere) maps agrep's index result to 0/1
found <- convertLogical0(found)
return(found)
}
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)
# Historical iteration of smartPatternMatch (console snapshot): same as the
# previous definition but without the debug cat(); edit distances 0/2/1 by
# tag length. Superseded by the later definitions below.
smartPatternMatch <- function(string, pattern, chars, acronym) {
pattern <- str_c("\\b", pattern, "\\b")
if(chars <= 4) {
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) {
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
}
else {
found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
}
# Historical iteration: raises the long-tag (>= 8 chars) edit distance
# from 2 to 3. Superseded by the later definitions below.
smartPatternMatch <- function(string, pattern, chars, acronym) {
pattern <- str_c("\\b", pattern, "\\b")
if(chars <= 4) {
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) {
found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
}
else {
found <- agrep(pattern, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
}
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrechte\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenracht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschen-recht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschen-Rechten. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
smartPatternMatch("Höflich, aber klares Statement zu Menschen-Rechte. Der Bundespräsident macht das gut! #China #XiJinping URL ", "\\bMenschenrecht\\b", 13, FALSE)
smartPatternMatch("Bla bla Tomate ", "\\Tomate\\b", 6, FALSE)
smartPatternMatch("Bla bla Tomaten bla bla", "\\Tomate\\b", 6, FALSE)
# Historical iteration: mid-length tags (5-7 chars) now allowed distance 2
# instead of 1. Superseded by the later definitions below.
smartPatternMatch <- function(string, pattern, chars, acronym) {
pattern <- str_c("\\b", pattern, "\\b")
if(chars <= 4) {
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) {
found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
}
else {
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
}
smartPatternMatch("Bla bla Tomaten bla bla", "\\Tomate\\b", 6, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Menschen\\b", 8, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Menschen\\b", 7, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Mensch\\b", 7, FALSE)
smartPatternMatch("Bla bla Menschen bla bla", "\\Mensch\\b", 8, FALSE)
smartPatternMatch("Bla bla Nazis bla bla", "\\Nazis\\b", 8, FALSE)
smartPatternMatch("Bla bla Nazis bla bla", "\\Nazis\\b", 5, FALSE)
smartPatternMatch("Bla bla Nazis bla bla", "\\Nazi\\b", 4, FALSE)
smartPatternMatch("Bla bla Nazi bla bla", "\\Nazis\\b", 5, FALSE)
source("issuecomp-codingsample-function.R")
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
str_detect("Der kleine Flüchtlingsjunge war", pattern = "\\bFlüchtling\\b")
str_detect("Der kleine Flüchtlingsjunge war", pattern = "Flüchtling")
str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")
# Historical iteration: adds branch comments and a debug cat(found) in the
# long-tag branch. NOTE(review): the cat() prints the raw agrep result to
# the console on every long-tag match attempt — debugging leftover, removed
# in the final revision further down.
smartPatternMatch <- function(string, pattern, chars, acronym) {
pattern <- str_c("\\b", pattern, "\\b")
if(chars <= 4) { # 4 or less
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) { # 8 or more
found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
# debugging leftover — see note above
cat(found)
}
else { # 5,6,7
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
}
str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
# Historical iteration: like the previous one but with a labelled debug
# print. NOTE(review): cat("it's",found) is a debugging leftover, removed
# in the final revision further down.
smartPatternMatch <- function(string, pattern, chars, acronym) {
pattern <- str_c("\\b", pattern, "\\b")
if(chars <= 4) { # 4 or less
found <- agrep(pattern, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) { # 8 or more
found <- agrep(pattern, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
# debugging leftover — see note above
cat("it's",found)
}
else { # 5,6,7
found <- agrep(pattern, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
}
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
str_detect("Der kleine Flücht lingsjunge war", pattern = "Flüchtling")
str_detect("Der kleine Flüchtlingsjunge war", pattern = "Flüchtling")
smartPatternMatch("Der kleine Flüchtlingsjunge war", "\\bFlüchtling\\b", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlinge war", "\\bFlüchtling\\b", 9, FALSE)
grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=TRUE)
grep("\\bFlüchtling\\b","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("Flüchtling","Der kleine Flücht-lingsjunge war", ignore.case = TRUE, fixed=FALSE)
grep("Flüchtling","Der kleine Flüchtlingsjunge war", ignore.case = TRUE, fixed=FALSE)
smartPatternMatch <- function(string, pattern, chars, acronym) {
# Final revision: fuzzy-match tag `pattern` (length `chars`) against
# `string`, allowing a Levenshtein edit budget that grows with tag length.
# Acronyms are matched case-sensitively (ignore.case = !acronym).
patternrex <- str_c("\\b", pattern, "\\b")
# 0 edits for very short tags (<= 4), 3 for long tags (>= 8), else 2
allowed <- if(chars <= 4) 0 else if(chars >= 8) 3 else 2
found <- agrep(patternrex, string, max.distance = list(all = allowed), ignore.case = !acronym, fixed = FALSE)
if(chars >= 8 && convertLogical0(found) == 0) {
# Long tags: retry without word boundaries so compound words
# (e.g. "Fluechtlingsjunge" vs "Fluechtling") still count as hits
found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE)
}
# convertLogical0 (defined elsewhere) maps the index result to 0/1
found <- convertLogical0(found)
return(found)
}
smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 9, FALSE)
smartPatternMatch("Der kleine Flüchtlingsjunge war", "Flüchtling", 7, FALSE)
# Load the manually coded error sample (headerless CSV) and walk through
# every entry interactively via the sourced review script.
c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
# NOTE(review): 1:nrow(c_errors) iterates c(1, 0) when the frame is empty;
# seq_len(nrow(c_errors)) would be safer
for(r in 1:nrow(c_errors)) {
c_errcode <- as.character(c_errors$code[r])
c_errissue <- as.character(c_errors$issue[r])
c_errtags <- as.character(c_errors$tags[r])
c_errtext <- as.character(c_errors$text[r])
c_errid <- as.character(c_errors$str_id[r])
cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
# The sourced script reads the c_err* globals set above
source("issuecomp-codingsample-function2.R")
}
View(c_errors)
viewMatchingTweets(date = "2014-05-10", issue = "agrar.204", id_folder)
viewMatchingTweets(date = "2014-05-10", issue = "agrar.402", id_folder)
viewMatchingTweets(date = "2014-01-10", issue = "agrar.402", id_folder)
viewMatchingTweets(date = "2014-01-20", issue = "agrar.402", id_folder)
viewMatchingTweets(date = "2014-01-10", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-04-10", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-05-10", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-02-11", issue = "agrar.403", id_folder)
viewMatchingTweets(date = "2014-08-01", issue = "agrar.403", id_folder)
issuelist <- xmlToList("issues.xml")
issuelist
issuelist[[1]]
xmlTreeParse(file = "issues.xml")
View(issues)
issuelist
issueheads
issuelist[[1]]
issuelist2 <- xmlTreeParse(file = "issues.xml")
issuelist2[[1]]
issuelist2[[2]]
issuelist2[[1,2]]
issuelist2[1
issuelist2[1]
issuelist2$doc$file
issuelist2$doc$version
xmlParse("issues.xml")
issuelist2 <- xmlParse("issues.xml")
issuelist2[1]
issuelist2[2]
issuelist2
issuelist
issuelist$edu.606
issuelist$edu.606[1]
issuelist$edu.606[2]
issuelist$edu.606[3]
issueheads
issuelist$macro.100
length(issuelist$macro.100)
length(issuelist$macro.101)
length(issuelist$macro.103)
length(issuelist$macro.105)
issuelist$macro.105
issuelist$macro.105[2]
issueheads
as.character(issuelist[[1]])
as.character(issuelist[[2]])
test <- issueheads[1]
test
as.character(issuelist$test)
as.character(issuelist$macro.100)
as.character(issuelist[test])
as.character(issuelist[test,1])
as.character(issuelist[1,test])
as.character(issuelist[test])
issuelist[test]
issuelist[test]
length(issuelist[test])
length(issuelist$macro.100)
issuelist$macro.100
test
issuelist[test]
issuelist[,test]
issuelist[,as.character(test)]
issuelist[[test]]
issuelist[,test]
issuelist[test]
issuelist[[test]]
length(issuelist[[test]])
issuelist[[test]]
issuelist[[test]][1]
as.character(issuelist[[test]][1])
as.character(issuelist[[test]])
issueheads
issueheads[2]
as.character(issuelist[[i]])
as.character(issuelist[[1]])
as.character(issuelist[[test]])
i <- 1
curissue <- issueheads[i]
curtags <- as.character(issuelist[[curissue]])
curfile <- str_c(id_folder,"/",curissue,".csv")
curissue
curtags
curfile
curtags[2]
# MATCH TWEETS ------------------------------------------------------------
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""
# Main matching pass: for every day in `issues`, fuzzy-match every tweet of
# that day against every tag of every issue, tallying per-day counts in
# `issues` and annotating `tweets` with matched issue/tag names.
# NOTE(review): all three inner 1:nrow()/1:length() loops misbehave on empty
# input (they iterate c(1, 0)); e.g. a day with zero tweets would index
# tweets_curday out of bounds — seq_len()/seq_along() would be safer.
for(d in 1:nrow(issues)) {
# Go through every day
curdate <- issues$date[d]
cat(as.character(curdate),"\n")
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
for(t in 1:nrow(tweets_curday)){
# Select tweet's text and remove hashtag indicators (#)
# (despite the original note, the text is NOT lowercased here; matching
# relies on ignore.case inside smartPatternMatch instead)
curtext <- as.character(tweets_curday$text[t])
curtext <- str_replace_all(curtext, "#", "")
curid <- as.character(tweets_curday$id_str[t])
# Now test each single issue (not tag!)
for(i in 1:length(issueheads)) {
curissue <- issueheads[i]
curtags <- as.character(issuelist[[curissue]])
curfile <- str_c(id_folder,"/",curissue,".csv")
# Now test all tags of a single issue
for(s in 1:length(curtags)) {
curtag <- curtags[s]
curchars <- nchar(curtag, type = "chars")
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if(curchars <= 4) {
curacro <- checkAcronym(string = curtag, chars = curchars)
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
if(tags_found == 1) {
# Raise number of findings on this day for this issue by 1
issues[d,curissue] <- issues[d,curissue] + 1
# Add issue and first matched tag of tweet to tweets-DF
oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
# Add information to file for function viewPatternMatching
write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
# Stop after the FIRST tag hit for this issue; continue with next issue
break
}
else {
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
smartPatternMatch(string = "er ist pädophil ", pattern = "pädophilie", chars = 10, acronym = FALSE)
smartPatternMatch(string = "er ist pädophiler ", pattern = "pädophilie", chars = 10, acronym = FALSE)
smartPatternMatch(string = "er ist pädophiler ", pattern = "Pädophilie", chars = 10, acronym = FALSE)
smartPatternMatch(string = "er ist pädophiles ", pattern = "Pädophilie", chars = 10, acronym = FALSE)
id_folder <- "matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""
issueheads
issuelist <- xmlToList("issues.xml")
issuelist
issueheads
View(issues)
issuelist$text
issuelist$macro.100
issuelist$macro.101
issuelist$text
issuelist$text <- NULL
issueheads <- names(issuelist)
issueheads
issuelist
issuelist$text <- ""
issuelist
issuelist$text <- NA
issuelist
issuelist$text
issuelist$text[1]
issuelist$text[2]
issuelist$text[6]
issuelist$text[10]
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issues <- data.frame(date = drange)
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0
tweets$issue <- ""
tweets$tags <- ""
View(tweets)
+2 -2
View File
@@ -33,14 +33,14 @@ smartPatternMatch <- function(string, pattern, chars, acronym) {
found <- agrep(patternrex, string, max.distance = list(all = 0), ignore.case = !acronym, fixed = FALSE)
}
else if(chars >= 8) { # 8 or more
found <- agrep(patternrex, string, max.distance = list(all = 3), ignore.case = !acronym, fixed = FALSE)
found <- agrep(patternrex, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
# Give longer words a chance by ignoring word boundaries \\b
if(convertLogical0(found) == 0) {
found <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE)
}
}
else { # 5,6,7
found <- agrep(patternrex, string, max.distance = list(all = 2), ignore.case = !acronym, fixed = FALSE)
found <- agrep(patternrex, string, max.distance = list(all = 1), ignore.case = !acronym, fixed = FALSE)
}
found <- convertLogical0(found)
return(found)
+24 -3
View File
@@ -1,5 +1,4 @@
<issuelist>
# Makroökonomie
<macro.100>
<tag>Wirtschaft</tag>
<tag>Wirtschaftswachstum</tag>
@@ -38,7 +37,7 @@
<tag>Solidaritätszuschlag</tag>
<tag>Erbschaftssteuer</tag>
<tag>Erbsteuer</tag>
KFZ-Steuer
<tag>KFZ-Steuer</tag>
<tag>Alkoholsteuer</tag>
<tag>Steueroase</tag>
</macro.107>
@@ -73,7 +72,7 @@ KFZ-Steuer
</civil.201>
<civil.206>
<tag>Wahlrecht</tag>
NPD-Verbot
<tag>NPD-Verbot</tag>
</civil.206>
<civil.207>
<tag>Versammlungsfreiheit</tag>
@@ -439,4 +438,26 @@ NPD-Verbot
<tag>Rechtschreibreform</tag>
<tag>neue deutsche Rechtschreibung</tag>
</edu.699>
<sens.nsa>
<tag>NSA</tag>
<tag>Überwachung</tag>
<tag>Snowden</tag>
<tag>GCHQ</tag>
<tag>BND</tag>
</sens.nsa>
<sens.is>
<tag>Irak</tag>
<tag>ISIS</tag>
<tag>IS</tag>
<tag>Kalifat</tag>
</sens.is>
<sens.ebola>
<tag>ebola</tag>
</sens.ebola>
<sens.edathy>
<tag>Edathy</tag>
<tag>Kinderpornografie</tag>
<tag>Kinderporno</tag>
<tag>Pädophilie</tag>
</sens.edathy>
</issuelist>