2015-01-15 20:24:40 +01:00
tweets $ tags <- " "
2015-01-12 23:52:24 +01:00
# Count, per day and per issue, how many tweets contain at least one of the issue's tags.
# Reads `issues`, `tweets`, `issuelist` and `id_folder` from the session, updates `issues`
# and `tweets` in place, and appends one line per match to the issue's CSV file.
# NOTE(review): the 1:nrow()/1:length() ranges below evaluate to c(1, 0) when the
# object is empty, so the loop bodies still run in that case.
for ( d in 1 : nrow ( issues ) ) {
# Go through every day (one row of `issues` per date); print the date as progress output
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text and replace the hashtag marker via str_replace_all().
# No lowercasing happens here; case handling is left to smartPatternMatch (ignore.case).
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
# Per-issue output file inside id_folder
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Only tags of at most 4 characters are checked for being an acronym (checkAcronym is
# defined elsewhere); the result is handed to smartPatternMatch, longer tags pass FALSE
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. How many changed letters are tolerated is decided
# inside smartPatternMatch from curchars, not here
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
2015-01-15 20:24:40 +01:00
# Raise number of findings on this day for this issue by 1
2015-01-12 23:52:24 +01:00
issues [d , curissue ] <- issues [d , curissue ] + 1
2015-01-15 20:24:40 +01:00
# Append issue and matched tag to the tweet's row(s) in the tweets-DF (looked up by id_str)
oldissue <- tweets [tweets [ , " id_str" ] == curid , " issue" ]
tweets [tweets [ , " id_str" ] == curid , " issue" ] <- str_c ( oldissue , curissue , " ;" )
oldtag <- tweets [tweets [ , " id_str" ] == curid , " tags" ]
tweets [tweets [ , " id_str" ] == curid , " tags" ] <- str_c ( oldtag , curtag , " ;" )
# Append date, tweet id and matched tag to the issue's file
2015-01-12 23:52:24 +01:00
write ( str_c ( curdate , " ;\"" , curid , " \";" , curtag ) , curfile , append = TRUE )
2015-01-10 13:02:40 +01:00
# Leave the tag loop after the first hit: a tweet counts at most once per issue
break
}
2015-01-12 23:52:24 +01:00
else {
#cat("Nothing found\n")
2015-01-10 13:02:40 +01:00
}
2015-01-12 23:52:24 +01:00
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
2015-01-18 22:46:54 +01:00
View ( issues )
2015-01-15 20:24:40 +01:00
# MATCH TWEETS ------------------------------------------------------------
# Rebuilds the match output from scratch: empties the output folder, resets the
# per-day issue counters and the issue/tags columns of `tweets`, then scans every
# tweet of every day against every tag of every issue.
# Relies on `drange`, `tweets`, checkAcronym() and smartPatternMatch() from the session.
2015-01-12 23:52:24 +01:00
# Delete the output folder with everything in it and create it again empty
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
# One row per date in drange, plus one zero-filled column per issue name from the XML list
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
2015-01-15 20:24:40 +01:00
# Reset the columns that collect matched issues and tags for each tweet
tweets $ issue <- " "
tweets $ tags <- " "
2015-01-12 23:52:24 +01:00
# NOTE(review): the 1:nrow()/1:length() ranges below evaluate to c(1, 0) when the
# object is empty, so the loop bodies still run in that case.
for ( d in 1 : nrow ( issues ) ) {
# Go through every day (one row of `issues` per date); print the date as progress output
curdate <- issues $ date [d ]
cat ( as.character ( curdate ) , " \n" )
# Put all tweets from specific day in a temporary DF
tweets_curday <- tweets [tweets [ , " created_at" ] == curdate , ]
for ( t in 1 : nrow ( tweets_curday ) ) {
# Select tweet's text and replace the hashtag marker via str_replace_all().
# No lowercasing happens here; case handling is left to smartPatternMatch (ignore.case).
curtext <- as.character ( tweets_curday $ text [t ] )
curtext <- str_replace_all ( curtext , " #" , " " )
curid <- as.character ( tweets_curday $ id_str [t ] )
# Now test each single issue (not tag!)
for ( i in 1 : length ( issuelist ) ) {
curtags <- as.character ( issuelist [ [i ] ] )
curissue <- names ( issuelist ) [i ]
# Per-issue output file inside id_folder
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
# Now test all tags of a single issue
for ( s in 1 : length ( curtags ) ) {
curtag <- curtags [s ]
curchars <- nchar ( curtag , type = " chars" )
# Only tags of at most 4 characters are checked for being an acronym (checkAcronym is
# defined elsewhere); the result is handed to smartPatternMatch, longer tags pass FALSE
if ( curchars <= 4 ) {
curacro <- checkAcronym ( string = curtag , chars = curchars )
} else {
curacro <- FALSE
}
# Match current tweet with tag. How many changed letters are tolerated is decided
# inside smartPatternMatch from curchars, not here
tags_found <- smartPatternMatch ( curtext , curtag , curchars , curacro )
if ( tags_found == 1 ) {
2015-01-15 20:24:40 +01:00
# Raise number of findings on this day for this issue by 1
2015-01-12 23:52:24 +01:00
issues [d , curissue ] <- issues [d , curissue ] + 1
2015-01-15 20:24:40 +01:00
# Append issue and matched tag to the tweet's row(s) in the tweets-DF (looked up by id_str)
oldissue <- tweets [tweets [ , " id_str" ] == curid , " issue" ]
tweets [tweets [ , " id_str" ] == curid , " issue" ] <- str_c ( oldissue , curissue , " ;" )
oldtag <- tweets [tweets [ , " id_str" ] == curid , " tags" ]
tweets [tweets [ , " id_str" ] == curid , " tags" ] <- str_c ( oldtag , curtag , " ;" )
# Append date, tweet id and matched tag to the issue's file
2015-01-12 23:52:24 +01:00
write ( str_c ( curdate , " ;\"" , curid , " \";" , curtag ) , curfile , append = TRUE )
2015-01-10 13:02:40 +01:00
# Leave the tag loop after the first hit: a tweet counts at most once per issue
break
}
2015-01-12 23:52:24 +01:00
else {
#cat("Nothing found\n")
}
} # /for curtags
} # /for issuelist
} # /for tweets_curday
} # /for drange
2015-01-18 22:46:54 +01:00
#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found)
View ( issues )
# Persist the issue counts. The target path has to be passed as `file =`:
# save() collects positional arguments as objects to store, so a bare string
# there is read as the name of another object and the call fails.
save(issues, file = " issues.RData")
2015-01-15 20:24:40 +01:00
# Show `question` as a console prompt and hand back whatever was typed,
# as a character string.
readYN <- function(question) {
  answer <- as.character(readline(prompt = question))
  return(answer)
}
# TRUE when str_detect() reports a hit for at least one element of `issuelist`
# (passed as the pattern argument) in `string`.
checkIssue <- function(string, issuelist) {
  hits <- str_detect(string, issuelist)
  any(hits)
}
# Validate every entry of `string` with checkIssue().
#
# string:    character vector of issue names to check
# issuelist: passed through unchanged to checkIssue()
#
# Returns a logical vector with one element per entry of `string`
# (logical(0) for empty input). For each entry that fails the check a
# message is printed to the console.
checkAllIssues <- function(string, issuelist) {
  # Preallocate; seq_along() keeps the loop from running on empty input,
  # where 1:length(string) would count down from 1 to 0
  status <- logical(length(string))
  for (i in seq_along(string)) {
    if (checkIssue(string[i], issuelist)) {
      status[i] <- TRUE
    } else {
      cat(" Issue", string[i], " does not exist. Please try again.\n")
      status[i] <- FALSE
    }
  }
  status
}
require ( stringr )
2015-01-18 22:46:54 +01:00
require ( XML )
2015-01-18 23:35:47 +01:00
require ( stringr )
require ( XML )
# FUNCTIONS ---------------------------------------------------------------
# Ask `question` on the console and return the reply as a character string.
readYN <- function(question) {
  reply <- readline(prompt = question)
  as.character(reply)
}
# Reports whether str_detect() finds any element of `issuelist` (used as the
# pattern argument) in `string`; the per-pattern results are collapsed with any().
checkIssue <- function(string, issuelist) {
  found <- any(str_detect(string, issuelist))
  found
}
# Run checkIssue() on each entry of `string` and collect the outcomes.
#
# string:    character vector of issue names to check
# issuelist: handed to checkIssue() as-is
#
# Returns one logical per entry of `string`; empty input yields logical(0).
# Entries that fail the check are reported on the console.
checkAllIssues <- function(string, issuelist) {
  # seq_along() instead of 1:length(): the latter yields c(1, 0) for empty
  # input and would test a non-existent first element
  status <- logical(length(string))
  for (i in seq_along(string)) {
    if (checkIssue(string[i], issuelist)) {
      status[i] <- TRUE
    } else {
      cat(" Issue", string[i], " does not exist. Please try again.\n")
      status[i] <- FALSE
    }
  }
  status
}
# Build the coding-sample counterpart of the issue table: one row per date in
# `drange` and one zero-filled column per issue name.
# The column names are taken from the list loaded right here (c_issuelist), not
# from the separate `issuelist` / `issueheads` globals, so this setup does not
# depend on whatever those happen to hold in the session.
c_issues <- data.frame(date = drange)
c_issuelist <- xmlToList(" issues.xml")
c_issueheads <- names(c_issuelist)
c_issues[c_issueheads] <- 0
source ( " issuecomp-codingsample-function.R" )
c_tweets <- tweets
View ( c_tweets )
source ( " issuecomp-codingsample-function.R" )
smartPatternMatch ( " Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL " , " \\bMenschenrecht\\b" , 13 , FALSE )
smartPatternMatch ( " Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL " , " \\bMenschenrechte\\b" , 13 , FALSE )
# Approximate search for `pattern` in `string` via agrep().
# The pattern is wrapped front and back with str_c() (the added literals carry
# the regex escape \\b) and the number of allowed edits depends on `chars`:
#   chars <= 4 -> 0, chars >= 8 -> 2, anything in between -> 1.
# `acronym` switches case-insensitivity off (ignore.case = !acronym).
# The agrep() result is passed through convertLogical0() (defined elsewhere)
# and returned. The chars >= 8 branch still prints a debug marker.
smartPatternMatch <- function(string, pattern, chars, acronym) {
  wrapped <- str_c(" \\b", pattern, " \\b")
  if (chars <= 4) {
    max_edits <- 0
  } else if (chars >= 8) {
    cat(" bla")
    max_edits <- 2
  } else {
    max_edits <- 1
  }
  hits <- agrep(
    wrapped, string,
    max.distance = list(all = max_edits),
    ignore.case = !acronym,
    fixed = FALSE
  )
  result <- convertLogical0(hits)
  return(result)
}
smartPatternMatch ( " Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL " , " \\bMenschenrechte\\b" , 13 , FALSE )
# Approximate search for `pattern` in `string` via agrep().
# The pattern is wrapped front and back with str_c() (the added literals carry
# the regex escape \\b). Allowed edits by `chars`:
#   chars <= 4 -> 0, chars >= 8 -> 2, anything in between -> 1.
# `acronym` switches case-insensitivity off (ignore.case = !acronym).
# Returns the agrep() result after convertLogical0() (defined elsewhere).
smartPatternMatch <- function(string, pattern, chars, acronym) {
  wrapped <- str_c(" \\b", pattern, " \\b")
  max_edits <- if (chars <= 4) {
    0
  } else if (chars >= 8) {
    2
  } else {
    1
  }
  hits <- agrep(
    wrapped, string,
    max.distance = list(all = max_edits),
    ignore.case = !acronym,
    fixed = FALSE
  )
  result <- convertLogical0(hits)
  return(result)
}
# Approximate search for `pattern` in `string` via agrep().
# The pattern is wrapped front and back with str_c() (the added literals carry
# the regex escape \\b). Allowed edits by `chars`:
#   chars <= 4 -> 0, chars >= 8 -> 3, anything in between -> 1.
# `acronym` switches case-insensitivity off (ignore.case = !acronym).
# Returns the agrep() result after convertLogical0() (defined elsewhere).
smartPatternMatch <- function(string, pattern, chars, acronym) {
  bounded <- str_c(" \\b", pattern, " \\b")
  tolerance <- if (chars <= 4) {
    0
  } else if (chars >= 8) {
    3
  } else {
    1
  }
  matched <- agrep(
    bounded, string,
    max.distance = list(all = tolerance),
    ignore.case = !acronym,
    fixed = FALSE
  )
  outcome <- convertLogical0(matched)
  return(outcome)
}
smartPatternMatch ( " Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL " , " \\bMenschenrechte\\b" , 13 , FALSE )
smartPatternMatch ( " Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL " , " \\bMenschenrecht\\b" , 13 , FALSE )
smartPatternMatch ( " Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL " , " \\bMenschenracht\\b" , 13 , FALSE )
smartPatternMatch ( " Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL " , " \\bMenschen-recht\\b" , 13 , FALSE )
smartPatternMatch ( " Höflich, aber klares Statement zu Menschen-Rechten. Der Bundespräsident macht das gut! #China #XiJinping URL " , " \\bMenschenrecht\\b" , 13 , FALSE )
smartPatternMatch ( " Höflich, aber klares Statement zu Menschen-Rechte. Der Bundespräsident macht das gut! #China #XiJinping URL " , " \\bMenschenrecht\\b" , 13 , FALSE )
smartPatternMatch ( " Bla bla Tomate " , " \\Tomate\\b" , 6 , FALSE )
smartPatternMatch ( " Bla bla Tomaten bla bla" , " \\Tomate\\b" , 6 , FALSE )
# Approximate search for `pattern` in `string` via agrep().
# The pattern is wrapped front and back with str_c() (the added literals carry
# the regex escape \\b). Allowed edits by `chars`:
#   chars <= 4 -> 0, chars >= 8 -> 3, anything in between -> 2.
# `acronym` switches case-insensitivity off (ignore.case = !acronym).
# Returns the agrep() result after convertLogical0() (defined elsewhere).
smartPatternMatch <- function(string, pattern, chars, acronym) {
  bounded <- str_c(" \\b", pattern, " \\b")
  tolerance <- if (chars <= 4) {
    0
  } else if (chars >= 8) {
    3
  } else {
    2
  }
  matched <- agrep(
    bounded, string,
    max.distance = list(all = tolerance),
    ignore.case = !acronym,
    fixed = FALSE
  )
  outcome <- convertLogical0(matched)
  return(outcome)
}
smartPatternMatch ( " Bla bla Tomaten bla bla" , " \\Tomate\\b" , 6 , FALSE )
smartPatternMatch ( " Bla bla Menschen bla bla" , " \\Menschen\\b" , 8 , FALSE )
smartPatternMatch ( " Bla bla Menschen bla bla" , " \\Menschen\\b" , 7 , FALSE )
smartPatternMatch ( " Bla bla Menschen bla bla" , " \\Mensch\\b" , 7 , FALSE )
smartPatternMatch ( " Bla bla Menschen bla bla" , " \\Mensch\\b" , 8 , FALSE )
smartPatternMatch ( " Bla bla Nazis bla bla" , " \\Nazis\\b" , 8 , FALSE )
smartPatternMatch ( " Bla bla Nazis bla bla" , " \\Nazis\\b" , 5 , FALSE )
smartPatternMatch ( " Bla bla Nazis bla bla" , " \\Nazi\\b" , 4 , FALSE )
smartPatternMatch ( " Bla bla Nazi bla bla" , " \\Nazis\\b" , 5 , FALSE )
source ( " issuecomp-codingsample-function.R" )
smartPatternMatch ( " Der kleine Flüchtlingsjunge war" , " \\bFlüchtling\\b" , 9 , FALSE )
str_detect ( " Der kleine Flüchtlingsjunge war" , pattern = " \\bFlüchtling\\b" )
str_detect ( " Der kleine Flüchtlingsjunge war" , pattern = " Flüchtling" )
str_detect ( " Der kleine Flücht lingsjunge war" , pattern = " Flüchtling" )
# Approximate search for `pattern` in `string` via agrep().
# The pattern is wrapped front and back with str_c() (the added literals carry
# the regex escape \\b). Allowed edits by `chars`:
#   4 or less -> 0, 8 or more -> 3, 5 to 7 -> 2.
# `acronym` switches case-insensitivity off (ignore.case = !acronym).
# For chars >= 8 the raw agrep() result is also printed (debug output).
# Returns the agrep() result after convertLogical0() (defined elsewhere).
smartPatternMatch <- function(string, pattern, chars, acronym) {
  bounded <- str_c(" \\b", pattern, " \\b")
  if (chars <= 4) {
    tolerance <- 0
    show_hits <- FALSE
  } else if (chars >= 8) {
    tolerance <- 3
    show_hits <- TRUE
  } else {
    tolerance <- 2
    show_hits <- FALSE
  }
  matched <- agrep(
    bounded, string,
    max.distance = list(all = tolerance),
    ignore.case = !acronym,
    fixed = FALSE
  )
  if (show_hits) {
    cat(matched)
  }
  outcome <- convertLogical0(matched)
  return(outcome)
}
str_detect ( " Der kleine Flücht lingsjunge war" , pattern = " Flüchtling" )
smartPatternMatch ( " Der kleine Flüchtlingsjunge war" , " \\bFlüchtling\\b" , 9 , FALSE )
smartPatternMatch ( " Der kleine Flüchtlingsjunge war" , " \\bFlüchtling\\b" , 9 , FALSE )
# Approximate search for `pattern` in `string` via agrep().
# The pattern is wrapped front and back with str_c() (the added literals carry
# the regex escape \\b). Allowed edits by `chars`:
#   4 or less -> 0, 8 or more -> 3, 5 to 7 -> 2.
# `acronym` switches case-insensitivity off (ignore.case = !acronym).
# For chars >= 8 the raw agrep() result is also printed with a label (debug output).
# Returns the agrep() result after convertLogical0() (defined elsewhere).
smartPatternMatch <- function(string, pattern, chars, acronym) {
  bounded <- str_c(" \\b", pattern, " \\b")
  if (chars <= 4) {
    tolerance <- 0
    show_hits <- FALSE
  } else if (chars >= 8) {
    tolerance <- 3
    show_hits <- TRUE
  } else {
    tolerance <- 2
    show_hits <- FALSE
  }
  matched <- agrep(
    bounded, string,
    max.distance = list(all = tolerance),
    ignore.case = !acronym,
    fixed = FALSE
  )
  if (show_hits) {
    cat(" it's", matched)
  }
  outcome <- convertLogical0(matched)
  return(outcome)
}
smartPatternMatch ( " Der kleine Flüchtlingsjunge war" , " \\bFlüchtling\\b" , 9 , FALSE )
str_detect ( " Der kleine Flücht lingsjunge war" , pattern = " Flüchtling" )
str_detect ( " Der kleine Flüchtlingsjunge war" , pattern = " Flüchtling" )
smartPatternMatch ( " Der kleine Flüchtlingsjunge war" , " \\bFlüchtling\\b" , 9 , FALSE )
smartPatternMatch ( " Der kleine Flüchtlinge war" , " \\bFlüchtling\\b" , 9 , FALSE )
grep ( " Flüchtling" , " Der kleine Flüchtlingsjunge war" , ignore.case = TRUE , fixed = FALSE )
grep ( " \\bFlüchtling\\b" , " Der kleine Flüchtlingsjunge war" , ignore.case = TRUE , fixed = FALSE )
grep ( " \\bFlüchtling\\b" , " Der kleine Flüchtlingsjunge war" , ignore.case = TRUE , fixed = TRUE )
grep ( " \\bFlüchtling\\b" , " Der kleine Flüchtlingsjunge war" , ignore.case = TRUE , fixed = FALSE )
grep ( " Flüchtling" , " Der kleine Flüchtlingsjunge war" , ignore.case = TRUE , fixed = FALSE )
grep ( " Flüchtling" , " Der kleine Flücht-lingsjunge war" , ignore.case = TRUE , fixed = FALSE )
grep ( " Flüchtling" , " Der kleine Flüchtlingsjunge war" , ignore.case = TRUE , fixed = FALSE )
# Approximate search for `pattern` in `string` via agrep(), with a plain
# grep() fallback for long patterns.
# The agrep() pattern is `pattern` wrapped front and back with str_c() (the
# added literals carry the regex escape \\b). Allowed edits by `chars`:
#   4 or less -> 0, 8 or more -> 3, 5 to 7 -> 2.
# For chars >= 8 only: when convertLogical0() of the agrep() result equals 0,
# the unwrapped `pattern` is searched again with grep(), i.e. without the
# added boundary markers and without edit tolerance.
# `acronym` switches case-insensitivity off in both searches.
# Returns the final search result after convertLogical0() (defined elsewhere).
smartPatternMatch <- function(string, pattern, chars, acronym) {
  bounded <- str_c(" \\b", pattern, " \\b")
  if (chars <= 4) {
    tolerance <- 0
    long_pattern <- FALSE
  } else if (chars >= 8) {
    tolerance <- 3
    long_pattern <- TRUE
  } else {
    tolerance <- 2
    long_pattern <- FALSE
  }
  matched <- agrep(
    bounded, string,
    max.distance = list(all = tolerance),
    ignore.case = !acronym,
    fixed = FALSE
  )
  if (long_pattern) {
    if (convertLogical0(matched) == 0) {
      matched <- grep(pattern, string, ignore.case = !acronym, fixed = FALSE)
    }
  }
  outcome <- convertLogical0(matched)
  return(outcome)
}
smartPatternMatch ( " Der kleine Flüchtlingsjunge war" , " Flüchtling" , 9 , FALSE )
smartPatternMatch ( " Der kleine Flüchtlingsjunge war" , " Flüchtling" , 9 , FALSE )
smartPatternMatch ( " Der kleine Flüchtlingsjunge war" , " Flüchtling" , 7 , FALSE )
# Walk through the recorded coding errors one row at a time.
# The file is read without a header row and with every column as character;
# the column names are assigned afterwards.
c_errors <- read.csv(" issuecomp-codingsample-error.csv", header = FALSE, sep = " ,", colClasses = " character")
names(c_errors) <- c(" str_id", " code", " issue", " tags", " text")

# seq_len() keeps the loop from running when the file has no rows
# (1:nrow() would iterate over c(1, 0) in that case)
for (r in seq_len(nrow(c_errors))) {
  # The fields of the current row are kept as top-level variables;
  # presumably the script sourced below reads them - verify against that file
  c_errcode <- as.character(c_errors$code[r])
  c_errissue <- as.character(c_errors$issue[r])
  c_errtags <- as.character(c_errors$tags[r])
  c_errtext <- as.character(c_errors$text[r])
  c_errid <- as.character(c_errors$str_id[r])
  # Show the tweet text and its recorded tags, then run the follow-up script for this row
  cat(" ===============\n\n[TWEET]: ", c_errtext, " \n[ISSUES]: ", c_errtags, " \n", sep = " ")
  source(" issuecomp-codingsample-function2.R")
}
View ( c_errors )
viewMatchingTweets ( date = " 2014-05-10" , issue = " agrar.204" , id_folder )
viewMatchingTweets ( date = " 2014-05-10" , issue = " agrar.402" , id_folder )
viewMatchingTweets ( date = " 2014-01-10" , issue = " agrar.402" , id_folder )
viewMatchingTweets ( date = " 2014-01-20" , issue = " agrar.402" , id_folder )
viewMatchingTweets ( date = " 2014-01-10" , issue = " agrar.403" , id_folder )
viewMatchingTweets ( date = " 2014-04-10" , issue = " agrar.403" , id_folder )
viewMatchingTweets ( date = " 2014-05-10" , issue = " agrar.403" , id_folder )
viewMatchingTweets ( date = " 2014-02-11" , issue = " agrar.403" , id_folder )
viewMatchingTweets ( date = " 2014-08-01" , issue = " agrar.403" , id_folder )
issuelist <- xmlToList ( " issues.xml" )
issuelist
issuelist [ [1 ] ]
xmlTreeParse ( file = " issues.xml" )
View ( issues )
issuelist
issueheads
issuelist [ [1 ] ]
issuelist2 <- xmlTreeParse ( file = " issues.xml" )
issuelist2 [ [1 ] ]
issuelist2 [ [2 ] ]
issuelist2 [ [1 , 2 ] ]
issuelist2 [1
issuelist2 [1 ]
issuelist2 $ doc $ file
issuelist2 $ doc $ version
xmlParse ( " issues.xml" )
issuelist2 <- xmlParse ( " issues.xml" )
issuelist2 [1 ]
issuelist2 [2 ]
issuelist2
issuelist
issuelist $ edu.606
issuelist $ edu.606 [1 ]
issuelist $ edu.606 [2 ]
issuelist $ edu.606 [3 ]
issueheads
issuelist $ macro.100
length ( issuelist $ macro.100 )
length ( issuelist $ macro.101 )
length ( issuelist $ macro.103 )
length ( issuelist $ macro.105 )
issuelist $ macro.105
issuelist $ macro.105 [2 ]
issueheads
as.character ( issuelist [ [1 ] ] )
as.character ( issuelist [ [2 ] ] )
test <- issueheads [1 ]
test
as.character ( issuelist $ test )
as.character ( issuelist $ macro.100 )
as.character ( issuelist [test ] )
as.character ( issuelist [test , 1 ] )
as.character ( issuelist [1 , test ] )
as.character ( issuelist [test ] )
issuelist [test ]
issuelist [test ]
length ( issuelist [test ] )
length ( issuelist $ macro.100 )
issuelist $ macro.100
test
issuelist [test ]
issuelist [ , test ]
issuelist [ , as.character ( test ) ]
issuelist [ [test ] ]
issuelist [ , test ]
issuelist [test ]
issuelist [ [test ] ]
length ( issuelist [ [test ] ] )
issuelist [ [test ] ]
issuelist [ [test ] ] [1 ]
as.character ( issuelist [ [test ] ] [1 ] )
as.character ( issuelist [ [test ] ] )
issueheads
issueheads [2 ]
as.character ( issuelist [ [i ] ] )
as.character ( issuelist [ [1 ] ] )
as.character ( issuelist [ [test ] ] )
i <- 1
curissue <- issueheads [i ]
curtags <- as.character ( issuelist [ [curissue ] ] )
curfile <- str_c ( id_folder , " /" , curissue , " .csv" )
curissue
curtags
curfile
curtags [2 ]
# MATCH TWEETS ------------------------------------------------------------
# Rebuilds the match output from scratch: empties the output folder, resets the
# per-day issue counters and the issue/tags columns of `tweets`, then scans every
# tweet of every day against every tag of every issue.
# Relies on `drange`, `tweets`, checkAcronym() and smartPatternMatch() from the session.

# Delete the output folder with everything in it and create it again empty
id_folder <- " matched-ids"
unlink(id_folder, recursive = TRUE)
dir.create(id_folder)

# One row per date in drange, plus one zero-filled column per issue name
issues <- data.frame(date = drange)
issuelist <- xmlToList(" issues.xml")
issueheads <- names(issuelist)
issues[issueheads] <- 0

# Reset the columns that collect matched issues and tags for each tweet
tweets$issue <- " "
tweets$tags <- " "

# seq_len()/seq_along() are used throughout so that a day without tweets or an
# issue without tags is skipped; 1:n would iterate over c(1, 0) for n == 0
for (d in seq_len(nrow(issues))) {
  # Go through every day; print the date as progress output
  curdate <- issues$date[d]
  cat(as.character(curdate), " \n")

  # Put all tweets from specific day in a temporary DF
  tweets_curday <- tweets[tweets[, " created_at"] == curdate, ]

  for (t in seq_len(nrow(tweets_curday))) {
    # Select tweet's text and replace the hashtag marker; case handling is left
    # to smartPatternMatch (ignore.case)
    curtext <- as.character(tweets_curday$text[t])
    curtext <- str_replace_all(curtext, " #", " ")
    curid <- as.character(tweets_curday$id_str[t])

    # Now test each single issue (not tag!)
    for (i in seq_along(issueheads)) {
      curissue <- issueheads[i]
      curtags <- as.character(issuelist[[curissue]])
      curfile <- str_c(id_folder, " /", curissue, " .csv")

      # Now test all tags of a single issue
      for (s in seq_along(curtags)) {
        curtag <- curtags[s]
        curchars <- nchar(curtag, type = " chars")

        # Only tags of at most 4 characters are checked for being an acronym;
        # the result controls ignore.case inside smartPatternMatch
        if (curchars <= 4) {
          curacro <- checkAcronym(string = curtag, chars = curchars)
        } else {
          curacro <- FALSE
        }

        # Match current tweet with tag; the tolerated number of changed letters
        # is decided inside smartPatternMatch from curchars
        tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
        if (tags_found == 1) {
          # Raise number of findings on this day for this issue by 1
          issues[d, curissue] <- issues[d, curissue] + 1

          # Append issue and matched tag to the tweet's row(s) in the tweets-DF.
          # The row mask is computed once per match; id_str is not modified here
          currows <- tweets[, " id_str"] == curid
          oldissue <- tweets[currows, " issue"]
          tweets[currows, " issue"] <- str_c(oldissue, curissue, " ;")
          oldtag <- tweets[currows, " tags"]
          tweets[currows, " tags"] <- str_c(oldtag, curtag, " ;")

          # Append date, tweet id and matched tag to the issue's file
          write(str_c(curdate, " ;\"", curid, " \";", curtag), curfile, append = TRUE)

          # Leave the tag loop after the first hit: a tweet counts at most once per issue
          break
        }
      } # /for curtags
    } # /for issuelist
  } # /for tweets_curday
} # /for drange
smartPatternMatch ( string = " er ist pädophil " , pattern = " pädophilie" , chars = 10 , acronym = FALSE )
smartPatternMatch ( string = " er ist pädophiler " , pattern = " pädophilie" , chars = 10 , acronym = FALSE )
smartPatternMatch ( string = " er ist pädophiler " , pattern = " Pädophilie" , chars = 10 , acronym = FALSE )
smartPatternMatch ( string = " er ist pädophiles " , pattern = " Pädophilie" , chars = 10 , acronym = FALSE )
id_folder <- " matched-ids"
unlink ( id_folder , recursive = TRUE )
dir.create ( id_folder )
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
tweets $ issue <- " "
tweets $ tags <- " "
issueheads
issuelist <- xmlToList ( " issues.xml" )
issuelist
issueheads
View ( issues )
issuelist $ text
issuelist $ macro.100
issuelist $ macro.101
issuelist $ text
issuelist $ text <- NULL
issueheads <- names ( issuelist )
issueheads
issuelist
issuelist $ text <- " "
issuelist
issuelist $ text <- NA
issuelist
issuelist $ text
issuelist $ text [1 ]
issuelist $ text [2 ]
issuelist $ text [6 ]
issuelist $ text [10 ]
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issues <- data.frame ( date = drange )
issuelist <- xmlToList ( " issues.xml" )
issueheads <- names ( issuelist )
issues [issueheads ] <- 0
tweets $ issue <- " "
tweets $ tags <- " "
View ( tweets )