added coding sample tests

This commit is contained in:
2015-01-15 20:24:40 +01:00
parent 0ea1d11100
commit a4b966965b
12 changed files with 567 additions and 366 deletions
+22 -3
View File
@@ -6,6 +6,8 @@ require(stringr)
# Run the helper script, then restore the objects stored in
# tweets_untagged.RData (presumably the `tweets` data frame used below).
source("issuecomp-functions.R")
load("tweets_untagged.RData")

# First and last day of the date range under analysis
date_start <- as.Date("2014-01-01")
date_end <- as.Date("2014-12-31")
@@ -23,6 +25,8 @@ issues <- data.frame(date = drange)
# Parse the issue definitions; the element names become the issue identifiers
issuelist <- xmlToList("issues.xml")
issueheads <- names(issuelist)

# One counter column per issue, initialised to zero for every row of `issues`
issues[issueheads] <- 0

# Empty per-tweet fields; matches are appended to them in the loop below
tweets[["issue"]] <- ""
tweets[["tags"]] <- ""
for(d in 1:nrow(issues)) {
# Go through every day
@@ -36,8 +40,7 @@ for(d in 1:nrow(issues)) {
# Select the tweet's text and remove hashtag indicators (#)
# NOTE(review): the earlier wording of this comment also promised lowercasing,
# but none of the visible lines lowercases curtext -- confirm whether that
# happens elsewhere (e.g. inside smartPatternMatch) or was dropped.
curtext <- as.character(tweets_curday$text[t])
curtext <- str_replace_all(curtext, "#", "")
# "$" is interpreted as a regex (end-of-string anchor), not a literal dollar
# sign, so this appends a space to the text. Presumably this guarantees that a
# URL at the very end of the tweet is followed by whitespace for the next step.
curtext <- str_replace_all(curtext, "$", " ")
# Replace every "http://" link (lazily matched up to and including the next
# whitespace character) with the placeholder "URL ".
# NOTE(review): "https://" links are not matched by this pattern.
curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
# Tweet ID as character; used further down to find this tweet's row in `tweets`
curid <- as.character(tweets_curday$id_str[t])
# Now test each single issue (not tag!)
@@ -61,8 +64,16 @@ for(d in 1:nrow(issues)) {
# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
# NOTE(review): the tolerance rules above describe smartPatternMatch, which is
# defined outside this view -- confirm them against its implementation.
tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
# A return value of 1 is treated as "tag found in this tweet"
if(tags_found == 1) {
#cat("Matched", curtag, "with", curtext,"\n")
# Raise number of findings on this day for this issue by 1
issues[d,curissue] <- issues[d,curissue] + 1
# Add issue and first matched tag of tweet to tweets-DF
# Each value is appended as "<name>;", so a tweet that matches several issues
# accumulates a semicolon-terminated list in both columns.
# NOTE(review): the row selector tweets[, "id_str"] == curid is evaluated four
# times per match; it could be computed once if this becomes a bottleneck.
oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
# Add information to file for function viewPatternMatching
# Appends one line of the form  date;"id";tag  to curfile
write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
# Stop after the first matching tag: leaves the innermost enclosing loop
# (presumably the loop over this issue's tags; its header is outside this view)
break
}
@@ -75,11 +86,19 @@ for(d in 1:nrow(issues)) {
} # /for tweets_curday
} # /for drange
# Remove the temporary objects left behind by the matching loop
rm(list = c(
  "tweets_curday", "curacro", "curchars", "curdate", "curfile", "curid",
  "curissue", "curtag", "curtags", "curtext", "d", "date_end", "date_start",
  "drange", "i", "id_folder", "oldissue", "oldtag", "s", "t", "tags_found"
))

# SAVING ------------------------------------------------------------------
# Reset the row names, then store the tagged tweets as CSV and as RData
row.names(tweets) <- NULL
write.csv(tweets, file = "tweets.csv")
save(tweets, file = "tweets.RData")

# VISUALS -----------------------------------------------------------------
# Level: days
# Long format: one row per (date, issue) pair with the daily count in `value`
issues_melt <- melt(issues, id.vars = "date")
# One line per issue over time
ggplot(
  data = issues_melt,
  mapping = aes(x = date, y = value, colour = variable, group = variable)
) +
  geom_line(size = 1)