@ -1,261 +1,178 @@
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
all ( test )
test <- NULL
View ( c_errors )
c_errors <- read.csv ( " issuecomp-codingsample-error.csv" , header = F , sep = " ," , colClasses = " character" )
names ( c_errors ) <- c ( " str_id" , " code" , " tags" , " text" )
# Review every manually coded error case: print the tweet and its assigned
# issue tags, then run the interactive correction script, which reads the
# c_err* variables set here.
# seq_len() (rather than 1:nrow) iterates zero times when c_errors is empty.
for (r in seq_len(nrow(c_errors))) {
  c_errcode <- as.character(c_errors$code[r])  # error category entered by the coder
  c_errtags <- as.character(c_errors$tags[r])  # issue tags assigned to the tweet
  c_errtext <- as.character(c_errors$text[r])  # the tweet text itself
  cat(" ===============\n\n[TWEET]: ", c_errtext, " \n[ISSUES]: ", c_errtags, " \n", sep = " ")
  source(" issuecomp-codingsample-function2.R")
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n")
issues [d , curissue ] <- issues [d , curissue ] + 1
write ( str_c ( curdate , " ;\"" , curid , " \"" ) , curfile , append = TRUE )
status
# Review every manually coded error case: print the tweet and its assigned
# issue tags, then run the interactive correction script, which reads the
# c_err* variables set here.
# seq_len() (rather than 1:nrow) iterates zero times when c_errors is empty.
for (r in seq_len(nrow(c_errors))) {
  c_errcode <- as.character(c_errors$code[r])  # error category entered by the coder
  c_errtags <- as.character(c_errors$tags[r])  # issue tags assigned to the tweet
  c_errtext <- as.character(c_errors$text[r])  # the tweet text itself
  cat(" ===============\n\n[TWEET]: ", c_errtext, " \n[ISSUES]: ", c_errtags, " \n", sep = " ")
  source(" issuecomp-codingsample-function2.R")
}
# Error code "1" = one of the automatically assigned issues is wrong.
if ( c_errcode == " 1" ) {
# Keep asking until the coder names only issues that actually exist.
#cat("Which issue is incorrect?\n")
repeat {
c_tag <- readYN ( " Which issue is incorrect?: " )
# The answer may contain several issue names separated by ";".
c_tag <- unlist ( str_split ( c_tag , " ;" ) )
# Validate each entered name against the known issue heads.
# NOTE(review): `status` is not reset before this loop, so stale TRUE entries
# from a previous round could survive when the new answer has fewer parts —
# confirm this is intended.
for ( i in 1 : length ( c_tag ) ) {
if ( checkIssue ( c_tag [i ] , c_issueheads ) ) { status [i ] <- TRUE } else { cat ( " Issue" , c_tag [i ] , " does not exist. Please try again.\n" ) }
}
# Leave the repeat loop only once every named issue was recognised.
if ( all ( status ) ) {
break
}
else {
#cat("Nothing found\n")
}
} # /repeat
} # /if c_errcode == "1"
} # /for tweets_curday
} # /for drange
smartPatternMatch ( " kerTips: Riker workplace tip: Flirt when no one else is looking. http" , " IS" , 2 , TRUE )
smartPatternMatch ( " kerTips: Riker workplace tip: Flirt when no one else is looking. http" , " is" , 2 , TRUE )
viewMatchingTweets ( " 2014-01-06" , " issue.iraq" , id_folder )
# MATCH TWEETS ------------------------------------------------------------
# Reset the per-issue ID folder and build the day-by-issue counting frame.
id_folder <- " matched-ids"
# Wipe results of any previous run before recreating the folder.
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
# One row per day of the analysis range; issue columns are added below.
issues <- data.frame ( date = drange )
# Issue definitions (issue name -> tag list) come from the XML file.
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
# Start every issue column at zero matches.
issues [issueheads ] <- 0
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
if ( c_errcode == " 1" ) {
#cat("Which issue is incorrect?\n")
repeat {
c_tag <- readYN ( " Which issue is incorrect?: " )
c_tag <- unlist ( str_split ( c_tag , " ;" ) )
for ( i in 1 : length ( c_tag ) ) {
if ( checkIssue ( c_tag [i ] , c_issueheads ) ) { status [i ] <- TRUE } else { cat ( " Issue" , c_tag [i ] , " does not exist. Please try again.\n" ) }
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n")
issues [d , curissue ] <- issues [d , curissue ] + 1
write ( str_c ( curdate , " ;\"" , curid , " \"" ) , curfile , append = TRUE )
if ( all ( status ) ) {
break
}
}
}
wdq
# Three consecutive review passes over the coded error cases (repeated runs
# from the original session, kept as-is), each printing the tweet plus its
# issue tags and then sourcing the interactive correction script.
# seq_len() makes each loop a no-op when c_errors has zero rows.
for (r in seq_len(nrow(c_errors))) {
  c_errcode <- as.character(c_errors$code[r])
  c_errtags <- as.character(c_errors$tags[r])
  c_errtext <- as.character(c_errors$text[r])
  cat(" ===============\n\n[TWEET]: ", c_errtext, " \n[ISSUES]: ", c_errtags, " \n", sep = " ")
  source(" issuecomp-codingsample-function2.R")
}
# Stand-alone re-run of the correction script for the last processed row.
source(" issuecomp-codingsample-function2.R")
for (r in seq_len(nrow(c_errors))) {
  c_errcode <- as.character(c_errors$code[r])
  c_errtags <- as.character(c_errors$tags[r])
  c_errtext <- as.character(c_errors$text[r])
  cat(" ===============\n\n[TWEET]: ", c_errtext, " \n[ISSUES]: ", c_errtags, " \n", sep = " ")
  source(" issuecomp-codingsample-function2.R")
}
for (r in seq_len(nrow(c_errors))) {
  c_errcode <- as.character(c_errors$code[r])
  c_errtags <- as.character(c_errors$tags[r])
  c_errtext <- as.character(c_errors$text[r])
  cat(" ===============\n\n[TWEET]: ", c_errtext, " \n[ISSUES]: ", c_errtags, " \n", sep = " ")
  source(" issuecomp-codingsample-function2.R")
}
# Split a ";"-separated string of issue names and validate each one against
# `issuelist`. Returns a logical vector with one entry per name: TRUE where
# the name is a known issue, FALSE otherwise (with a console hint for
# unknown names).
#
# Fixes vs. original: `status` is initialised locally (the original
# read/wrote a variable from the calling environment) and the result is
# returned explicitly (the original implicitly returned the for-loop's NULL).
checkAllIssues <- function(string, issuelist) {
  status <- NULL
  string <- unlist(str_split(string, " ;"))
  for (i in seq_along(string)) {
    if (checkIssue(string[i], issuelist)) {
      status[i] <- TRUE
    } else {
      cat(" Issue", string[i], " does not exist. Please try again.\n")
      status[i] <- FALSE
    }
  }
  return(status)
}
} # /for tweets_curday
} # /for drange
source ( " issuecomp-functions.R" )
# MATCH TWEETS ------------------------------------------------------------
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
t ags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n" )
issues [d , curissue ] <- issues [d , curissue ] + 1
write( str_c ( curdate , " ;\"" , curid , " \";" curtag ) , curfile , append = TRUE )
break
}
test
# Split a ";"-separated string of issue names and validate each one against
# `issuelist`. Returns a logical vector with one entry per name: TRUE where
# the name is a known issue, FALSE otherwise (with a console hint for
# unknown names).
#
# Fixes vs. original: `status` is initialised locally (the original
# read/wrote a variable from the calling environment) and the result is
# returned explicitly (the original implicitly returned the for-loop's NULL).
checkAllIssues <- function(string, issuelist) {
  status <- NULL
  string <- unlist(str_split(string, " ;"))
  for (i in seq_along(string)) {
    if (checkIssue(string[i], issuelist)) {
      status[i] <- TRUE
    } else {
      cat(" Issue", string[i], " does not exist. Please try again.\n")
      status[i] <- FALSE
    }
  }
  return(status)
}
} # /for tweets_curday
} # /for drange
# MATCH TWEETS ------------------------------------------------------------
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n")
issues [d , curissue ] <- issues [d , curissue ] + 1
write ( str_c ( curdate , " ;\"" , curid , " \";" , curtag ) , curfile , append = TRUE )
break
return ( status )
}
test <- " issue.edathy"
checkAllIssues ( test , c_issueheads )
test <- " issue.edathy"
checkAllIssues ( test , c_issueheads )
rm ( status )
checkAllIssues ( test , c_issueheads )
# Split a ";"-separated string of issue names and validate each one against
# `issuelist`. Returns a logical vector with one entry per name: TRUE where
# the name is a known issue, FALSE otherwise (with a console hint for
# unknown names).
#
# Fix vs. original: the result is now returned explicitly — the original had
# no return statement, so it returned the for-loop's NULL and callers never
# saw the computed vector.
checkAllIssues <- function(string, issuelist) {
  status <- NULL
  string <- unlist(str_split(string, " ;"))
  for (i in seq_along(string)) {
    if (checkIssue(string[i], issuelist)) {
      status[i] <- TRUE
    } else {
      cat(" Issue", string[i], " does not exist. Please try again.\n")
      status[i] <- FALSE
    }
  }
  return(status)
}
} # /for tweets_curday
} # /for drange
source ( " issuecomp-functions.R" )
viewMatchingTweets ( " 2014-01-06" , " issue.iraq" , id_folder )
viewMatchingTweets ( " 2014-01-07" , " issue.iraq" , id_folder )
viewMatchingTweets ( " 2014-01-09" , " issue.iraq" , id_folder )
curtext <- " Willkürlich Menschen an ihrer #Versammlungsfreiheit zu hindern ist eindeutig rechtswidrig. http://t.co/A7IQfISIhP #Gefahrengebiet #Hamburg"
str_replace_all ( curtext , " http://.+\\W" , " " )
str_replace_all ( curtext , " http://.+?\\W" , " " )
str_replace_all ( curtext , " http://.+?\\s" , " " )
str_replace_all ( curtext , " http://.+?\\s" , " " )
curtext <- " test http://google.de haha http://nsa.gov eqiuhe"
str_replace_all ( curtext , " http://.+?\\s" , " " )
str_replace_all ( curtext , " http://.+?\\s" , " URL" )
str_replace_all ( curtext , " http://.+?\\s" , " URL " )
viewMatchingTweets ( " 2014-01-09" , " issue.iraq" , id_folder )
# MATCH TWEETS ------------------------------------------------------------
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
curtext <- str_replace_all ( curtext , " http://.+?\\s" , " URL " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n")
issues [d , curissue ] <- issues [d , curissue ] + 1
write ( str_c ( curdate , " ;\"" , curid , " \";" , curtag ) , curfile , append = TRUE )
break
return ( status )
}
checkAllIssues ( test , c_issueheads )
checkAllIssues ( " wdjqaowd" , c_issueheads )
test <- checkAllIssues ( " wdjqaowd" , c_issueheads )
test
test <- checkAllIssues ( " wdjqaow;wiqud" , c_issueheads )
test
test <- checkAllIssues ( " wdjqaow;issue.edathy" , c_issueheads )
test
# Split a ";"-separated string of issue names and validate each one against
# `issuelist`. Returns a logical vector with one entry per name: TRUE where
# the name is a known issue, FALSE otherwise (with a console hint for
# unknown names).
#
# Fix vs. original: the result is now returned explicitly — the original had
# no return statement, so it returned the for-loop's NULL and callers never
# saw the computed vector.
checkAllIssues <- function(string, issuelist) {
  status <- NULL
  string <- unlist(str_split(string, " ;"))
  for (i in seq_along(string)) {
    if (checkIssue(string[i], issuelist)) {
      status[i] <- TRUE
    } else {
      cat(" Issue", string[i], " does not exist. Please try again.\n")
      status[i] <- FALSE
    }
  }
  return(status)
}
} # /for tweets_curday
} # /for drange
viewMatchingTweets ( " 2014-01-09" , " issue.iraq" , id_folder )
viewMatchingTweets ( " 2014-01-08" , " issue.iraq" , id_folder )
viewMatchingTweets ( " 2014-01-10" , " issue.iraq" , id_folder )
curtext
str_replace_all ( curtext , " http://.+?\\>" , " URL " )
str_replace_all ( curtext , " http://.+?\\<" , " URL " )
curtext <- str_replace_all ( curtext , " http://.+?\\b" , " URL " )
str_replace_all ( curtext , " http://.+?\\b" , " URL " )
str_replace_all ( curtext , " http://.+?\\s" , " URL " )
curtext
curtext <- as.character ( tweets_curday $ text [t ] )
curtext
str_replace_all ( curtext , " http://.+?\\s" , " URL " )
str_replace_all ( curtext , " http://.+?\\b" , " URL " )
str_replace_all ( curtext , " http://.+?\\<" , " URL " )
str_replace_all ( curtext , " http://.+?\\>" , " URL " )
str_replace_all ( curtext , " http://.+?\\s" , " URL " )
str_replace_all ( curtext , " $" , " " )
curtext <- str_replace_all ( curtext , " $" , " " )
curtext
str_replace_all ( curtext , " http://.+?\\s" , " URL " )
viewMatchingTweets ( " 2014-01-10" , " issue.iraq" , id_folder )
}
return ( status )
}
# Review every recorded coding error: print the tweet and its assigned issue
# tags, then run the interactive correction script, which reads the c_err*
# variables set here.
# seq_len() (rather than 1:nrow) iterates zero times when c_errors is empty.
for (r in seq_len(nrow(c_errors))) {
  c_errcode <- as.character(c_errors$code[r])
  c_errtags <- as.character(c_errors$tags[r])
  c_errtext <- as.character(c_errors$text[r])
  cat(" ===============\n\n[TWEET]: ", c_errtext, " \n[ISSUES]: ", c_errtags, " \n", sep = " ")
  source(" issuecomp-codingsample-function2.R")
}
# Validate each entry of `string` against the known issues in `issuelist`.
# Unknown entries are reported on the console; returns a logical vector
# parallel to `string` (TRUE = known issue).
checkAllIssues <- function(string, issuelist) {
  status <- NULL
  for (idx in 1:length(string)) {
    known <- checkIssue(string[idx], issuelist)
    if (!known) {
      cat(" Issue", string[idx], " does not exist. Please try again.\n")
    }
    status[idx] <- known
  }
  return(status)
}
# Two consecutive review passes over the coded error cases (repeated runs
# from the original session), each printing the tweet plus its issue tags
# and then sourcing the interactive correction script.
# seq_len() makes each loop a no-op when c_errors has zero rows.
for (r in seq_len(nrow(c_errors))) {
  c_errcode <- as.character(c_errors$code[r])
  c_errtags <- as.character(c_errors$tags[r])
  c_errtext <- as.character(c_errors$text[r])
  cat(" ===============\n\n[TWEET]: ", c_errtext, " \n[ISSUES]: ", c_errtags, " \n", sep = " ")
  source(" issuecomp-codingsample-function2.R")
}
for (r in seq_len(nrow(c_errors))) {
  c_errcode <- as.character(c_errors$code[r])
  c_errtags <- as.character(c_errors$tags[r])
  c_errtext <- as.character(c_errors$text[r])
  cat(" ===============\n\n[TWEET]: ", c_errtext, " \n[ISSUES]: ", c_errtags, " \n", sep = " ")
  source(" issuecomp-codingsample-function2.R")
}
View ( c_issues )
View ( tweets )
tweets $ tagged <- NULL
View ( c_tweets )
View ( tweets )
# MATCH TWEETS ------------------------------------------------------------
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
@ -264,6 +181,8 @@ issues <- data.frame(date = drange)
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
tweets $ issue <- " "
tweets $ tags <- " "
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
@ -295,8 +214,14 @@ curacro <- FALSE
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
# cat("Matched", curtag, "with", curtext,"\n")
# Raise number of findings on this day for this issue by 1
issues [d , curissue ] <- issues [d , curissue ] + 1
# Add issue and first matched tag of tweet to tweets-DF
oldissue <- tweets [tweets [ , " id_str" ] == curid , " issue" ]
tweets [tweets [ , " id_str" ] == curid , " issue" ] <- str_c ( oldissue , curissue , " ;" )
oldtag <- tweets [tweets [ , " id_str" ] == curid , " tags" ]
tweets [tweets [ , " id_str" ] == curid , " tags" ] <- str_c ( oldtag , curtag , " ;" )
# Add information to file for function viewPatternMatching
write ( str_c ( curdate , " ;\"" , curid , " \";" , curtag ) , curfile , append = TRUE )
break
}
@ -307,7 +232,10 @@ else {
} # /for issuelist
} # /for tweets_curday
} # /for drange
viewMatchingTweets ( " 2014-01-10" , " issue.iraq" , id_folder )
# Build `drange`: one Date entry per day of the analysis period,
# endpoints inclusive (days() comes from lubridate).
date_start <- as.Date(" 2014-01-01")
date_end <- as.Date(" 2014-12-31")
drange <- as.integer(date_end - date_start)
drange <- date_start + days(0:drange)
# MATCH TWEETS ------------------------------------------------------------
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
@ -316,6 +244,8 @@ issues <- data.frame(date = drange)
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
tweets $ issue <- " "
tweets $ tags <- " "
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
@ -347,8 +277,14 @@ curacro <- FALSE
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
# cat("Matched", curtag, "with", curtext,"\n")
# Raise number of findings on this day for this issue by 1
issues [d , curissue ] <- issues [d , curissue ] + 1
# Add issue and first matched tag of tweet to tweets-DF
oldissue <- tweets [tweets [ , " id_str" ] == curid , " issue" ]
tweets [tweets [ , " id_str" ] == curid , " issue" ] <- str_c ( oldissue , curissue , " ;" )
oldtag <- tweets [tweets [ , " id_str" ] == curid , " tags" ]
tweets [tweets [ , " id_str" ] == curid , " tags" ] <- str_c ( oldtag , curtag , " ;" )
# Add information to file for function viewPatternMatching
write ( str_c ( curdate , " ;\"" , curid , " \";" , curtag ) , curfile , append = TRUE )
break
}
@ -359,32 +295,9 @@ else {
} # /for issuelist
} # /for tweets_curday
} # /for drange
View ( issues )
viewMatchingTweets ( " 2014-12-18" , " issue.edathy" , id_folder )
issues_melt <- melt ( issues , id = " date" )
ggplot ( issues_melt , aes ( x = date , y = value , colour = variable , group = variable ) ) + geom_line ( size = 1 )
ggplot ( issues_melt , aes ( x = date , y = value , colour = variable , group = variable ) ) + geom_smooth ( size = 1 , method = " loess" , formula = y ~ x , se = FALSE )
ggplot ( issues_melt , aes ( x = date , y = value , colour = variable , group = variable ) ) + geom_line ( size = 1 )
ggplot ( issues_melt , aes ( x = date , y = value , colour = variable , group = variable ) ) + geom_line ( size = 1 )
ggplot ( issues_melt , aes ( x = date , y = value , colour = variable , group = variable ) ) + geom_smooth ( size = 1 , method = " loess" , formula = y ~ x , se = FALSE )
viewMatchingTweets ( " 2014-12-18" , " issue.conservative" , id_folder )
agrep ( " christ" , " Jungparlamentarier gleich Schriftführerdienst hat" , max.distance = list ( all = 2 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Jungparlamentarier gleich Schriftführerdienst hat" , max.distance = list ( all = 2 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Christ bla" , max.distance = list ( all = 2 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Christus bla" , max.distance = list ( all = 2 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Christu bla" , max.distance = list ( all = 2 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Christus bla" , max.distance = list ( all = 2 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Christus bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla christus bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla christus bla" , max.distance = list ( all = 2 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla christen bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Antichrist bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Christian bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Christian bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE , value = TRUE )
agrep ( " \\bchrist\\b" , " Bla Christi bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE , value = TRUE )
agrep ( " \\bchrist\\b" , " Bla Christi bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bIS\\b" , " Wir sind bei ISN Network" , max.distance = list ( all = 0 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bIS\\b" , " Wir sind bei ISN Network" , max.distance = list ( all = 0 ) , ignore.case = F , fixed = FALSE )
View ( tweets )
View ( tweets )
# MATCH TWEETS ------------------------------------------------------------
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
@ -392,6 +305,8 @@ issues <- data.frame(date = drange)
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
tweets $ issue <- " "
tweets $ tags <- " "
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
@ -423,8 +338,14 @@ curacro <- FALSE
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
# cat("Matched", curtag, "with", curtext,"\n")
# Raise number of findings on this day for this issue by 1
issues [d , curissue ] <- issues [d , curissue ] + 1
# Add issue and first matched tag of tweet to tweets-DF
oldissue <- tweets [tweets [ , " id_str" ] == curid , " issue" ]
tweets [tweets [ , " id_str" ] == curid , " issue" ] <- str_c ( oldissue , curissue , " ;" )
oldtag <- tweets [tweets [ , " id_str" ] == curid , " tags" ]
tweets [tweets [ , " id_str" ] == curid , " tags" ] <- str_c ( oldtag , curtag , " ;" )
# Add information to file for function viewPatternMatching
write ( str_c ( curdate , " ;\"" , curid , " \";" , curtag ) , curfile , append = TRUE )
break
}
@ -435,78 +356,157 @@ else {
} # /for issuelist
} # /for tweets_curday
} # /for drange
issues_melt <- melt ( issues , id = " date" )
ggplot ( issues_melt , aes ( x = date , y = value , colour = variable , group = variable ) ) + geom_smooth ( size = 1 , method = " loess" , formula = y ~ x , se = FALSE )
viewMatchingTweets ( " 2014-12-18" , " issue.conservative" , id_folder )
pattern
agrep ( " \\bchrist\\b" , " RT @christophheyes: Morgen in der Presse: Oppermann - Briefkasten gestohlen! Gabriel - Poesiealbum nicht mehr auffindbar! #edathy #hartmann" , max.distance = list ( all = 1 ) , ignore.case = TRUE , fixed = FALSE )
smartPatternMatch
source ( " issuecomp-functions.R" )
smartPatternMatch
# MATCH TWEETS ------------------------------------------------------------
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
View ( tweets )
View ( c_errors )
View ( tweets )
# Prompt the user on the console with `question` and return the typed
# answer as a character scalar.
readYN <- function(question) {
  answer <- as.character(readline(prompt = question))
  return(answer)
}
# TRUE if `string` matches any entry of `issuelist`
# (stringr::str_detect over the list, collapsed with any()).
checkIssue <- function(string, issuelist) {
  return(any(str_detect(string, issuelist)))
}
# Returns TRUE/FALSE per element of `string` — TRUE when the issue name
# exists in `issuelist`; prints a retry hint for every unknown name.
checkAllIssues <- function(string, issuelist) {
  status <- NULL
  for (i in 1:length(string)) {
    hit <- checkIssue(string[i], issuelist)
    status[i] <- hit
    if (!hit) cat(" Issue", string[i], " does not exist. Please try again.\n")
  }
  return(status)
}
View ( tweets )
View ( tweets )
# Persist the tagged tweets (CSV for the coding sample, RData for R reuse).
write.csv ( tweets , " tweets.csv" )
save ( tweets , file = " tweets.RData" )
# Repeated run from the original session.
write.csv ( tweets , " tweets.csv" )
save ( tweets , file = " tweets.RData" )
# Reload the CSV as the coding-sample copy; drop the row-name column X.
c_tweets <- read.csv ( " tweets.csv" )
View ( c_tweets )
c_tweets $ X <- NULL
# Read all issues from XML file
c_issues <- data.frame ( date = drange )
c_issuelist <- xmlToList ( " issues.xml" )
# NOTE(review): these two lines read `issuelist`/`issueheads` instead of the
# freshly created `c_issuelist` — works only if the non-prefixed globals
# still exist and are identical; confirm this is intended.
c_issueheads <- names ( issuelist )
c_issues [issueheads ] <- 0
source ( " issuecomp-codingsample-function.R" )
# Clean up the coding-sample working variables ...
rm ( c_err , c_result , c_samid , c_samno , c_samtags , c_samissue , c_samtext , c_yn )
rm ( c_samtag )
# ... and the tweet-matching working variables.
rm ( tweets_curday , curacro , curchars , curdate , curfile , curid , curissue , curtag , curtags , curtext , d , date_end , date_start , drange , i , id_folder , oldissue , oldtag , s , t , tags_found )
# Load the manually coded error cases (5 columns; header = F keeps V1..V5
# until names() assigns the real column names below).
c_errors <- read.csv ( " issuecomp-codingsample-error.csv" , header = F , sep = " ," , colClasses = " character" )
View ( c_errors )
names ( c_errors ) <- c ( " str_id" , " code" , " issue" , " tags" , " text" )
View ( c_errors )
# Two review passes over the coded error cases (5-column layout with issue
# and tweet id): print the tweet plus its issue tags, then source the
# interactive correction script, which reads the c_err* variables set here.
# seq_len() makes each loop a no-op when c_errors has zero rows.
for (r in seq_len(nrow(c_errors))) {
  c_errcode <- as.character(c_errors$code[r])
  c_errissue <- as.character(c_errors$issue[r])
  c_errtags <- as.character(c_errors$tags[r])
  c_errtext <- as.character(c_errors$text[r])
  c_errid <- as.character(c_errors$str_id[r])
  cat(" ===============\n\n[TWEET]: ", c_errtext, " \n[ISSUES]: ", c_errtags, " \n", sep = " ")
  source(" issuecomp-codingsample-function2.R")
}
for (r in seq_len(nrow(c_errors))) {
  c_errcode <- as.character(c_errors$code[r])
  c_errissue <- as.character(c_errors$issue[r])
  c_errtags <- as.character(c_errors$tags[r])
  c_errtext <- as.character(c_errors$text[r])
  c_errid <- as.character(c_errors$str_id[r])
  cat(" ===============\n\n[TWEET]: ", c_errtext, " \n[ISSUES]: ", c_errtags, " \n", sep = " ")
  source(" issuecomp-codingsample-function2.R")
}
c_curissue
c_curtags
c_errissue
c_errtags
c_errid
delrow <- NULL
for ( r in 1 : nrow ( tweets ) ) {
if ( format ( tweets $ created_at [r ] , " %Y" ) != " 2014" ) {
delrow <- c ( delrow , r )
}
curtext <- as.character ( tweets $ text [r ] )
curtext <- str_replace_all ( curtext , " $" , " " )
curtext <- str_replace_all ( curtext , " http://.+?\\s" , " URL " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n")
issues [d , curissue ] <- issues [d , curissue ] + 1
write ( str_c ( curdate , " ;\"" , curid , " \";" , curtag ) , curfile , append = TRUE )
break
r
require ( stringr )
View ( tweets )
df <- data.frame ( x = c ( " zeile1" , " zeile2" ) )
View ( df )
df $ x [1 ] <- " blabla"
View ( df )
df <- data.frame ( x = c ( " zeile1" , " zeile2" ) )
test <- " bla bla"
df $ x [1 ] <- test
View ( df )
df $ x [1 ] <- as.character ( test )
class ( df $ x )
df $ x [1 ] <- as.factor ( test )
head ( tweet )
head ( tweets )
df <- head ( tweets )
View ( df )
df $ text [1 ] <- " test"
View ( tweets )
View ( df )
# Clean every tweet text in place:
#  - " $" (end-of-string anchor) appends a trailing space, so a URL at the
#    very end of the text is followed by whitespace and matched below;
#  - every "http://..." run up to the next whitespace becomes " URL ".
# seq_len() iterates zero times if `tweets` has no rows (1:nrow would not).
for (r in seq_len(nrow(tweets))) {
  # if(format(tweets$created_at[r], "%Y") != "2014") {
  #   delrow <- c(delrow, r)
  # }
  curtext <- as.character(tweets$text[r])
  curtext <- str_replace_all(curtext, " $", " ")
  curtext <- str_replace_all(curtext, " http://.+?\\s", " URL ")
  tweets$text[r] <- curtext
}
else {
#cat("Nothing found\n")
View ( tweets )
View ( c_tweets )
rm ( delrow , r )
save ( tweets , file = " tweets_untagged.RData" )
row.names ( tweets ) <- NULL
write.csv ( tweets , " tweets.csv" )
save ( tweets , file = " tweets.RData" )
c_tweets <- read.csv ( " tweets.csv" )
c_tweets $ X <- NULL
View ( c_tweets )
viewMatchingTweets
c_tweets <- read.csv ( " tweets.csv" , colClasses = " character" )
c_tweets $ X <- NULL
View ( c_tweets )
View ( c_issues )
c_errtags
# Review pass over the coded error cases (5-column layout): print the tweet
# plus its issue tags, then source the interactive correction script, which
# reads the c_err* variables set here.
# seq_len() (rather than 1:nrow) iterates zero times when c_errors is empty.
for (r in seq_len(nrow(c_errors))) {
  c_errcode <- as.character(c_errors$code[r])
  c_errissue <- as.character(c_errors$issue[r])
  c_errtags <- as.character(c_errors$tags[r])
  c_errtext <- as.character(c_errors$text[r])
  c_errid <- as.character(c_errors$str_id[r])
  cat(" ===============\n\n[TWEET]: ", c_errtext, " \n[ISSUES]: ", c_errtags, " \n", sep = " ")
  source(" issuecomp-codingsample-function2.R")
}
curissue
c_curissue
str_join ( c_curissue )
str_join ( c_curissue , collapse = NULL )
str_join ( c_curissue , sep = " ;" , collapse = NULL )
paste ( c_curissue , sep = " " )
paste ( c_curissue , sep = ' ' )
length ( paste ( c_curissue , sep = ' ' ) )
str_join ( c_curissue , sep = " ;" , collapse = " " )
str_join ( c_curissue , sep = " ;" , collapse = " w" )
str_join ( c_curissue , collapse = " ;" )
# Review pass over the coded error cases (5-column layout): print the tweet
# plus its issue tags, then source the interactive correction script, which
# reads the c_err* variables set here.
# seq_len() (rather than 1:nrow) iterates zero times when c_errors is empty.
for (r in seq_len(nrow(c_errors))) {
  c_errcode <- as.character(c_errors$code[r])
  c_errissue <- as.character(c_errors$issue[r])
  c_errtags <- as.character(c_errors$tags[r])
  c_errtext <- as.character(c_errors$text[r])
  c_errid <- as.character(c_errors$str_id[r])
  cat(" ===============\n\n[TWEET]: ", c_errtext, " \n[ISSUES]: ", c_errtags, " \n", sep = " ")
  source(" issuecomp-codingsample-function2.R")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
issues_melt <- melt ( issues , id = " date" )
ggplot ( issues_melt , aes ( x = date , y = value , colour = variable , group = variable ) ) + geom_smooth ( size = 1 , method = " loess" , formula = y ~ x , se = FALSE )
viewMatchingTweets ( " 2014-12-18" , " issue.conservative" , id_folder )
viewMatchingTweets ( " 2014-05-18" , " issue.conservative" , id_folder )
viewMatchingTweets ( " 2014-05-1" , " issue.conservative" , id_folder )
viewMatchingTweets ( " 2014-05-01" , " issue.conservative" , id_folder )
viewMatchingTweets ( " 2014-05-02" , " issue.conservative" , id_folder )
viewMatchingTweets ( " 2014-05-10" , " issue.conservative" , id_folder )
viewMatchingTweets ( " 2014-05-10" , " issue.middleeast" , id_folder )
viewMatchingTweets ( " 2014-05-10" , " issue.iraw" , id_folder )
viewMatchingTweets ( " 2014-05-10" , " issue.iraq" , id_folder )
viewMatchingTweets ( " 2014-08-10" , " issue.iraq" , id_folder )
viewMatchingTweets ( " 2014-11-10" , " issue.iraq" , id_folder )
viewMatchingTweets ( " 2014-12-10" , " issue.iraq" , id_folder )
View ( issues )
viewMatchingTweets ( " 2014-09-19" , " issue.control" , id_folder )