2015-01-12 23:52:24 +01:00
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n")
issues [d , curissue ] <- issues [d , curissue ] + 1
write ( str_c ( curdate , " ;\"" , curid , " \"" ) , curfile , append = TRUE )
2015-01-10 13:02:40 +01:00
break
}
else {
2015-01-12 23:52:24 +01:00
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
smartPatternMatch ( " kerTips: Riker workplace tip: Flirt when no one else is looking. http" , " IS" , 2 , TRUE )
smartPatternMatch ( " kerTips: Riker workplace tip: Flirt when no one else is looking. http" , " is" , 2 , TRUE )
viewMatchingTweets ( " 2014-01-06" , " issue.iraq" , id_folder )
# MATCH TWEETS ------------------------------------------------------------
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n")
issues [d , curissue ] <- issues [d , curissue ] + 1
write ( str_c ( curdate , " ;\"" , curid , " \"" ) , curfile , append = TRUE )
2015-01-10 13:02:40 +01:00
break
}
else {
2015-01-12 23:52:24 +01:00
#cat("Nothing found\n")
2015-01-10 13:02:40 +01:00
}
2015-01-12 23:52:24 +01:00
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
source ( " issuecomp-functions.R" )
# MATCH TWEETS ------------------------------------------------------------
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n")
issues [d , curissue ] <- issues [d , curissue ] + 1
write ( str_c ( curdate , " ;\"" , curid , " \";" curtag ) , curfile , append = TRUE )
2015-01-10 13:02:40 +01:00
break
}
else {
2015-01-12 23:52:24 +01:00
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
# MATCH TWEETS ------------------------------------------------------------
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n")
issues [d , curissue ] <- issues [d , curissue ] + 1
write ( str_c ( curdate , " ;\"" , curid , " \";" , curtag ) , curfile , append = TRUE )
2015-01-10 13:02:40 +01:00
break
}
else {
2015-01-12 23:52:24 +01:00
#cat("Nothing found\n")
2015-01-10 13:02:40 +01:00
}
2015-01-12 23:52:24 +01:00
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
source ( " issuecomp-functions.R" )
viewMatchingTweets ( " 2014-01-06" , " issue.iraq" , id_folder )
viewMatchingTweets ( " 2014-01-07" , " issue.iraq" , id_folder )
viewMatchingTweets ( " 2014-01-09" , " issue.iraq" , id_folder )
curtext <- " Willkürlich Menschen an ihrer #Versammlungsfreiheit zu hindern ist eindeutig rechtswidrig. http://t.co/A7IQfISIhP #Gefahrengebiet #Hamburg"
str_replace_all ( curtext , " http://.+\\W" , " " )
str_replace_all ( curtext , " http://.+?\\W" , " " )
str_replace_all ( curtext , " http://.+?\\s" , " " )
str_replace_all ( curtext , " http://.+?\\s" , " " )
curtext <- " test http://google.de haha http://nsa.gov eqiuhe"
str_replace_all ( curtext , " http://.+?\\s" , " " )
str_replace_all ( curtext , " http://.+?\\s" , " URL" )
str_replace_all ( curtext , " http://.+?\\s" , " URL " )
viewMatchingTweets ( " 2014-01-09" , " issue.iraq" , id_folder )
# MATCH TWEETS ------------------------------------------------------------
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
curtext <- str_replace_all ( curtext , " http://.+?\\s" , " URL " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n")
issues [d , curissue ] <- issues [d , curissue ] + 1
write ( str_c ( curdate , " ;\"" , curid , " \";" , curtag ) , curfile , append = TRUE )
2015-01-10 13:02:40 +01:00
break
}
2015-01-12 23:52:24 +01:00
else {
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
viewMatchingTweets ( " 2014-01-09" , " issue.iraq" , id_folder )
viewMatchingTweets ( " 2014-01-08" , " issue.iraq" , id_folder )
viewMatchingTweets ( " 2014-01-10" , " issue.iraq" , id_folder )
curtext
str_replace_all ( curtext , " http://.+?\\>" , " URL " )
str_replace_all ( curtext , " http://.+?\\<" , " URL " )
curtext <- str_replace_all ( curtext , " http://.+?\\b" , " URL " )
str_replace_all ( curtext , " http://.+?\\b" , " URL " )
str_replace_all ( curtext , " http://.+?\\s" , " URL " )
curtext
curtext <- as.character ( tweets_curday $ text [t ] )
curtext
str_replace_all ( curtext , " http://.+?\\s" , " URL " )
str_replace_all ( curtext , " http://.+?\\b" , " URL " )
str_replace_all ( curtext , " http://.+?\\<" , " URL " )
str_replace_all ( curtext , " http://.+?\\>" , " URL " )
str_replace_all ( curtext , " http://.+?\\s" , " URL " )
str_replace_all ( curtext , " $" , " " )
curtext <- str_replace_all ( curtext , " $" , " " )
curtext
str_replace_all ( curtext , " http://.+?\\s" , " URL " )
viewMatchingTweets ( " 2014-01-10" , " issue.iraq" , id_folder )
# MATCH TWEETS ------------------------------------------------------------
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
curtext <- str_replace_all ( curtext , " $" , " " )
curtext <- str_replace_all ( curtext , " http://.+?\\s" , " URL " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n")
issues [d , curissue ] <- issues [d , curissue ] + 1
write ( str_c ( curdate , " ;\"" , curid , " \";" , curtag ) , curfile , append = TRUE )
2015-01-10 13:02:40 +01:00
break
}
2015-01-12 23:52:24 +01:00
else {
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
viewMatchingTweets ( " 2014-01-10" , " issue.iraq" , id_folder )
# MATCH TWEETS ------------------------------------------------------------
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
curtext <- str_replace_all ( curtext , " $" , " " )
curtext <- str_replace_all ( curtext , " http://.+?\\s" , " URL " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n")
issues [d , curissue ] <- issues [d , curissue ] + 1
write ( str_c ( curdate , " ;\"" , curid , " \";" , curtag ) , curfile , append = TRUE )
2015-01-10 13:02:40 +01:00
break
}
2015-01-12 23:52:24 +01:00
else {
#cat("Nothing found\n")
2015-01-10 13:02:40 +01:00
}
2015-01-12 23:52:24 +01:00
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
View ( issues )
viewMatchingTweets ( " 2014-12-18" , " issue.edathy" , id_folder )
issues_melt <- melt ( issues , id = " date" )
ggplot ( issues_melt , aes ( x = date , y = value , colour = variable , group = variable ) ) + geom_line ( size = 1 )
ggplot ( issues_melt , aes ( x = date , y = value , colour = variable , group = variable ) ) + geom_smooth ( size = 1 , method = " loess" , formula = y ~ x , se = FALSE )
ggplot ( issues_melt , aes ( x = date , y = value , colour = variable , group = variable ) ) + geom_line ( size = 1 )
ggplot ( issues_melt , aes ( x = date , y = value , colour = variable , group = variable ) ) + geom_line ( size = 1 )
ggplot ( issues_melt , aes ( x = date , y = value , colour = variable , group = variable ) ) + geom_smooth ( size = 1 , method = " loess" , formula = y ~ x , se = FALSE )
viewMatchingTweets ( " 2014-12-18" , " issue.conservative" , id_folder )
agrep ( " christ" , " Jungparlamentarier gleich Schriftführerdienst hat" , max.distance = list ( all = 2 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Jungparlamentarier gleich Schriftführerdienst hat" , max.distance = list ( all = 2 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Christ bla" , max.distance = list ( all = 2 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Christus bla" , max.distance = list ( all = 2 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Christu bla" , max.distance = list ( all = 2 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Christus bla" , max.distance = list ( all = 2 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Christus bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla christus bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla christus bla" , max.distance = list ( all = 2 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla christen bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Antichrist bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Christian bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bchrist\\b" , " Bla Christian bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE , value = TRUE )
agrep ( " \\bchrist\\b" , " Bla Christi bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE , value = TRUE )
agrep ( " \\bchrist\\b" , " Bla Christi bla" , max.distance = list ( all = 3 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bIS\\b" , " Wir sind bei ISN Network" , max.distance = list ( all = 0 ) , ignore.case = TRUE , fixed = FALSE )
agrep ( " \\bIS\\b" , " Wir sind bei ISN Network" , max.distance = list ( all = 0 ) , ignore.case = F , fixed = FALSE )
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
curtext <- str_replace_all ( curtext , " $" , " " )
curtext <- str_replace_all ( curtext , " http://.+?\\s" , " URL " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n")
issues [d , curissue ] <- issues [d , curissue ] + 1
write ( str_c ( curdate , " ;\"" , curid , " \";" , curtag ) , curfile , append = TRUE )
2015-01-10 13:02:40 +01:00
break
}
2015-01-12 23:52:24 +01:00
else {
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
issues_melt <- melt ( issues , id = " date" )
ggplot ( issues_melt , aes ( x = date , y = value , colour = variable , group = variable ) ) + geom_smooth ( size = 1 , method = " loess" , formula = y ~ x , se = FALSE )
viewMatchingTweets ( " 2014-12-18" , " issue.conservative" , id_folder )
pattern
agrep ( " \\bchrist\\b" , " RT @christophheyes: Morgen in der Presse: Oppermann - Briefkasten gestohlen! Gabriel - Poesiealbum nicht mehr auffindbar! #edathy #hartmann" , max.distance = list ( all = 1 ) , ignore.case = TRUE , fixed = FALSE )
smartPatternMatch
2015-01-12 12:48:10 +01:00
source ( " issuecomp-functions.R" )
2015-01-12 23:52:24 +01:00
smartPatternMatch
# MATCH TWEETS ------------------------------------------------------------
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
2015-01-12 12:48:10 +01:00
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
2015-01-12 23:52:24 +01:00
for ( d in 1 : nrow ( issues ) ) {
# Go through every day
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text, make it lowercase and remove hashtag indicators (#)
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
curtext <- str_replace_all ( curtext , " $" , " " )
curtext <- str_replace_all ( curtext , " http://.+?\\s" , " URL " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
#cat("Matched", curtag, "with", curtext,"\n")
issues [d , curissue ] <- issues [d , curissue ] + 1
write ( str_c ( curdate , " ;\"" , curid , " \";" , curtag ) , curfile , append = TRUE )
break
}
else {
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
issues_melt <- melt ( issues , id = " date" )
ggplot ( issues_melt , aes ( x = date , y = value , colour = variable , group = variable ) ) + geom_smooth ( size = 1 , method = " loess" , formula = y ~ x , se = FALSE )
viewMatchingTweets ( " 2014-12-18" , " issue.conservative" , id_folder )
viewMatchingTweets ( " 2014-05-18" , " issue.conservative" , id_folder )
viewMatchingTweets ( " 2014-05-1" , " issue.conservative" , id_folder )
viewMatchingTweets ( " 2014-05-01" , " issue.conservative" , id_folder )
viewMatchingTweets ( " 2014-05-02" , " issue.conservative" , id_folder )
viewMatchingTweets ( " 2014-05-10" , " issue.conservative" , id_folder )
viewMatchingTweets ( " 2014-05-10" , " issue.middleeast" , id_folder )
viewMatchingTweets ( " 2014-05-10" , " issue.iraw" , id_folder )
viewMatchingTweets ( " 2014-05-10" , " issue.iraq" , id_folder )
viewMatchingTweets ( " 2014-08-10" , " issue.iraq" , id_folder )
viewMatchingTweets ( " 2014-11-10" , " issue.iraq" , id_folder )
viewMatchingTweets ( " 2014-12-10" , " issue.iraq" , id_folder )
2015-01-12 12:48:10 +01:00
View ( issues )
2015-01-12 23:52:24 +01:00
viewMatchingTweets ( " 2014-09-19" , " issue.control" , id_folder )