commit a4b966965b
Author: mxmehl (5 years ago)

    added coding sample tests

.RData (binary)

.Rhistory (+337 -337)

@@ -1,261 +1,178 @@
-# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
-if(curchars <= 4) {
-curacro <- checkAcronym(string = curtag, chars = curchars)
-} else {
-curacro <- FALSE
+all(test)
+test <- NULL
+View(c_errors)
+c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
+names(c_errors) <- c("str_id", "code", "tags", "text")
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+source("issuecomp-codingsample-function2.R")
 }
-# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
-tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
-if(tags_found == 1) {
-#cat("Matched", curtag, "with", curtext,"\n")
-issues[d,curissue] <- issues[d,curissue] + 1
-write(str_c(curdate,";\"",curid,"\""), curfile, append = TRUE)
+status
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+if(c_errcode == "1") {
+#cat("Which issue is incorrect?\n")
+repeat {
+c_tag <- readYN("Which issue is incorrect?: ")
+c_tag <- unlist(str_split(c_tag, ";"))
+for(i in 1:length(c_tag)) {
+if(checkIssue(c_tag[i], c_issueheads)) {status[i] <- TRUE} else {cat("Issue",c_tag[i],"does not exist. Please try again.\n")}
+}
+if(all(status)) {
 break
 }
-else {
-#cat("Nothing found\n")
 }
-} # /for curtags
-} # /for issuelist
-} # /for tweets_curday
-} # /for drange
-smartPatternMatch("kerTips: Riker workplace tip: Flirt when no one else is looking. http", "IS", 2, TRUE)
-smartPatternMatch("kerTips: Riker workplace tip: Flirt when no one else is looking. http", "is", 2, TRUE)
-viewMatchingTweets("2014-01-06", "issue.iraq", id_folder)
-# MATCH TWEETS ------------------------------------------------------------
-id_folder <- "matched-ids"
-unlink(id_folder, recursive = TRUE)
-dir.create(id_folder)
-issues <- data.frame(date = drange)
-issuelist <- xmlToList("issues.xml")
-issueheads <- names(issuelist)
-issues[issueheads] <- 0
-for(d in 1:nrow(issues)) {
-# Go through every day
-curdate <- issues$date[d]
-cat(as.character(curdate),"\n")
-# Put all tweets from specific day in a temporary DF
-tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
-for(t in 1:nrow(tweets_curday)){
-# Select tweet's text, make it lowercase and remove hashtag indicators (#)
-curtext <- as.character(tweets_curday$text[t])
-curtext <- str_replace_all(curtext, "#", "")
-curid <- as.character(tweets_curday$id_str[t])
-# Now test each single issue (not tag!)
-for(i in 1:length(issuelist)) {
-curtags <- as.character(issuelist[[i]])
-curissue <- names(issuelist)[i]
-curfile <- str_c(id_folder,"/",curissue,".csv")
-# Now test all tags of a single issue
-for(s in 1:length(curtags)) {
-curtag <- curtags[s]
-curchars <- nchar(curtag, type = "chars")
-# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
-if(curchars <= 4) {
-curacro <- checkAcronym(string = curtag, chars = curchars)
-} else {
-curacro <- FALSE
+if(c_errcode == "1") {
+#cat("Which issue is incorrect?\n")
+repeat {
+c_tag <- readYN("Which issue is incorrect?: ")
+c_tag <- unlist(str_split(c_tag, ";"))
+for(i in 1:length(c_tag)) {
+if(checkIssue(c_tag[i], c_issueheads)) {status[i] <- TRUE} else {cat("Issue",c_tag[i],"does not exist. Please try again.\n")}
 }
-# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
-tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
-if(tags_found == 1) {
-#cat("Matched", curtag, "with", curtext,"\n")
-issues[d,curissue] <- issues[d,curissue] + 1
-write(str_c(curdate,";\"",curid,"\""), curfile, append = TRUE)
+if(all(status)) {
 break
 }
+}
+}
+wdq
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+source("issuecomp-codingsample-function2.R")
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+checkAllIssues <- function(string, issuelist) {
+string <- unlist(str_split(string, ";"))
+for(i in 1:length(string)) {
+if(checkIssue(string[i], issuelist)) {
+status[i] <- TRUE
+}
 else {
-#cat("Nothing found\n")
+cat("Issue",string[i],"does not exist. Please try again.\n")
+status[i] <- FALSE
 }
-} # /for curtags
-} # /for issuelist
-} # /for tweets_curday
-} # /for drange
-source("issuecomp-functions.R")
-# MATCH TWEETS ------------------------------------------------------------
-id_folder <- "matched-ids"
-unlink(id_folder, recursive = TRUE)
-dir.create(id_folder)
-issues <- data.frame(date = drange)
-issuelist <- xmlToList("issues.xml")
-issueheads <- names(issuelist)
-issues[issueheads] <- 0
-for(d in 1:nrow(issues)) {
-# Go through every day
-curdate <- issues$date[d]
-cat(as.character(curdate),"\n")
-# Put all tweets from specific day in a temporary DF
-tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
-for(t in 1:nrow(tweets_curday)){
-# Select tweet's text, make it lowercase and remove hashtag indicators (#)
-curtext <- as.character(tweets_curday$text[t])
-curtext <- str_replace_all(curtext, "#", "")
-curid <- as.character(tweets_curday$id_str[t])
-# Now test each single issue (not tag!)
-for(i in 1:length(issuelist)) {
-curtags <- as.character(issuelist[[i]])
-curissue <- names(issuelist)[i]
-curfile <- str_c(id_folder,"/",curissue,".csv")
-# Now test all tags of a single issue
-for(s in 1:length(curtags)) {
-curtag <- curtags[s]
-curchars <- nchar(curtag, type = "chars")
-# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
-if(curchars <= 4) {
-curacro <- checkAcronym(string = curtag, chars = curchars)
-} else {
-curacro <- FALSE
 }
-# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
-tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
-if(tags_found == 1) {
-#cat("Matched", curtag, "with", curtext,"\n")
-issues[d,curissue] <- issues[d,curissue] + 1
-write(str_c(curdate,";\"",curid,"\";"curtag), curfile, append = TRUE)
-break
+}
+test
+checkAllIssues <- function(string, issuelist) {
+string <- unlist(str_split(string, ";"))
+for(i in 1:length(string)) {
+if(checkIssue(string[i], issuelist)) {
+status[i] <- TRUE
 }
 else {
-#cat("Nothing found\n")
+cat("Issue",string[i],"does not exist. Please try again.\n")
+status[i] <- FALSE
 }
-} # /for curtags
-} # /for issuelist
-} # /for tweets_curday
-} # /for drange
-# MATCH TWEETS ------------------------------------------------------------
-id_folder <- "matched-ids"
-unlink(id_folder, recursive = TRUE)
-dir.create(id_folder)
-issues <- data.frame(date = drange)
-issuelist <- xmlToList("issues.xml")
-issueheads <- names(issuelist)
-issues[issueheads] <- 0
-for(d in 1:nrow(issues)) {
-# Go through every day
-curdate <- issues$date[d]
-cat(as.character(curdate),"\n")
-# Put all tweets from specific day in a temporary DF
-tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
-for(t in 1:nrow(tweets_curday)){
-# Select tweet's text, make it lowercase and remove hashtag indicators (#)
-curtext <- as.character(tweets_curday$text[t])
-curtext <- str_replace_all(curtext, "#", "")
-curid <- as.character(tweets_curday$id_str[t])
-# Now test each single issue (not tag!)
-for(i in 1:length(issuelist)) {
-curtags <- as.character(issuelist[[i]])
-curissue <- names(issuelist)[i]
-curfile <- str_c(id_folder,"/",curissue,".csv")
-# Now test all tags of a single issue
-for(s in 1:length(curtags)) {
-curtag <- curtags[s]
-curchars <- nchar(curtag, type = "chars")
-# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
-if(curchars <= 4) {
-curacro <- checkAcronym(string = curtag, chars = curchars)
-} else {
-curacro <- FALSE
 }
-# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
-tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
-if(tags_found == 1) {
-#cat("Matched", curtag, "with", curtext,"\n")
-issues[d,curissue] <- issues[d,curissue] + 1
-write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
-break
+return(status)
+}
+test <- "issue.edathy"
+checkAllIssues(test, c_issueheads)
+test <- "issue.edathy"
+checkAllIssues(test, c_issueheads)
+rm(status)
+checkAllIssues(test, c_issueheads)
+checkAllIssues <- function(string, issuelist) {
+status <- NULL
+string <- unlist(str_split(string, ";"))
+for(i in 1:length(string)) {
+if(checkIssue(string[i], issuelist)) {
+status[i] <- TRUE
 }
 else {
-#cat("Nothing found\n")
+cat("Issue",string[i],"does not exist. Please try again.\n")
+status[i] <- FALSE
 }
-} # /for curtags
-} # /for issuelist
-} # /for tweets_curday
-} # /for drange
-source("issuecomp-functions.R")
-viewMatchingTweets("2014-01-06", "issue.iraq", id_folder)
-viewMatchingTweets("2014-01-07", "issue.iraq", id_folder)
-viewMatchingTweets("2014-01-09", "issue.iraq", id_folder)
-curtext <- "Willkürlich Menschen an ihrer #Versammlungsfreiheit zu hindern ist eindeutig rechtswidrig. http://t.co/A7IQfISIhP #Gefahrengebiet #Hamburg"
-str_replace_all(curtext, "http://.+\\W", "")
-str_replace_all(curtext, "http://.+?\\W", "")
-str_replace_all(curtext, "http://.+?\\s", "")
-str_replace_all(curtext, "http://.+?\\s", "")
-curtext <- "test http://google.de haha http://nsa.gov eqiuhe"
-str_replace_all(curtext, "http://.+?\\s", "")
-str_replace_all(curtext, "http://.+?\\s", "URL")
-str_replace_all(curtext, "http://.+?\\s", "URL ")
-viewMatchingTweets("2014-01-09", "issue.iraq", id_folder)
-# MATCH TWEETS ------------------------------------------------------------
-id_folder <- "matched-ids"
-unlink(id_folder, recursive = TRUE)
-dir.create(id_folder)
-issues <- data.frame(date = drange)
-issuelist <- xmlToList("issues.xml")
-issueheads <- names(issuelist)
-issues[issueheads] <- 0
-for(d in 1:nrow(issues)) {
-# Go through every day
-curdate <- issues$date[d]
-cat(as.character(curdate),"\n")
-# Put all tweets from specific day in a temporary DF
-tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
-for(t in 1:nrow(tweets_curday)){
-# Select tweet's text, make it lowercase and remove hashtag indicators (#)
-curtext <- as.character(tweets_curday$text[t])
-curtext <- str_replace_all(curtext, "#", "")
-curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
-curid <- as.character(tweets_curday$id_str[t])
-# Now test each single issue (not tag!)
-for(i in 1:length(issuelist)) {
-curtags <- as.character(issuelist[[i]])
-curissue <- names(issuelist)[i]
-curfile <- str_c(id_folder,"/",curissue,".csv")
-# Now test all tags of a single issue
-for(s in 1:length(curtags)) {
-curtag <- curtags[s]
-curchars <- nchar(curtag, type = "chars")
-# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
-if(curchars <= 4) {
-curacro <- checkAcronym(string = curtag, chars = curchars)
-} else {
-curacro <- FALSE
 }
-# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
-tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
-if(tags_found == 1) {
-#cat("Matched", curtag, "with", curtext,"\n")
-issues[d,curissue] <- issues[d,curissue] + 1
-write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
-break
+return(status)
+}
+checkAllIssues(test, c_issueheads)
+checkAllIssues("wdjqaowd", c_issueheads)
+test <- checkAllIssues("wdjqaowd", c_issueheads)
+test
+test <- checkAllIssues("wdjqaow;wiqud", c_issueheads)
+test
+test <- checkAllIssues("wdjqaow;issue.edathy", c_issueheads)
+test
+checkAllIssues <- function(string, issuelist) {
+status <- NULL
+string <- unlist(str_split(string, ";"))
+for(i in 1:length(string)) {
+if(checkIssue(string[i], issuelist)) {
+status[i] <- TRUE
 }
 else {
-#cat("Nothing found\n")
+cat("Issue",string[i],"does not exist. Please try again.\n")
+status[i] <- FALSE
 }
-} # /for curtags
-} # /for issuelist
-} # /for tweets_curday
-} # /for drange
-viewMatchingTweets("2014-01-09", "issue.iraq", id_folder)
-viewMatchingTweets("2014-01-08", "issue.iraq", id_folder)
-viewMatchingTweets("2014-01-10", "issue.iraq", id_folder)
-curtext
-str_replace_all(curtext, "http://.+?\\>", "URL ")
-str_replace_all(curtext, "http://.+?\\<", "URL ")
-curtext <- str_replace_all(curtext, "http://.+?\\b", "URL ")
-str_replace_all(curtext, "http://.+?\\b", "URL ")
-str_replace_all(curtext, "http://.+?\\s", "URL ")
-curtext
-curtext <- as.character(tweets_curday$text[t])
-curtext
-str_replace_all(curtext, "http://.+?\\s", "URL ")
-str_replace_all(curtext, "http://.+?\\b", "URL ")
-str_replace_all(curtext, "http://.+?\\<", "URL ")
-str_replace_all(curtext, "http://.+?\\>", "URL ")
-str_replace_all(curtext, "http://.+?\\s", "URL ")
-str_replace_all(curtext, "$", " ")
-curtext <- str_replace_all(curtext, "$", " ")
-curtext
-str_replace_all(curtext, "http://.+?\\s", "URL ")
-viewMatchingTweets("2014-01-10", "issue.iraq", id_folder)
+}
+return(status)
+}
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+checkAllIssues <- function(string, issuelist) {
+status <- NULL
+for(i in 1:length(string)) {
+if(checkIssue(string[i], issuelist)) {
+status[i] <- TRUE
+}
+else {
+cat("Issue",string[i],"does not exist. Please try again.\n")
+status[i] <- FALSE
+}
+}
+return(status)
+}
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+View(c_issues)
+View(tweets)
+tweets$tagged <- NULL
+View(c_tweets)
+View(tweets)
 # MATCH TWEETS ------------------------------------------------------------
 id_folder <- "matched-ids"
 unlink(id_folder, recursive = TRUE)
@@ -264,6 +181,8 @@ issues <- data.frame(date = drange)
 issuelist <- xmlToList("issues.xml")
 issueheads <- names(issuelist)
 issues[issueheads] <- 0
+tweets$issue <- ""
+tweets$tags <- ""
 for(d in 1:nrow(issues)) {
 # Go through every day
 curdate <- issues$date[d]
@@ -295,8 +214,14 @@ curacro <- FALSE
 # Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
 tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
 if(tags_found == 1) {
-#cat("Matched", curtag, "with", curtext,"\n")
+# Raise number of findings on this day for this issue by 1
 issues[d,curissue] <- issues[d,curissue] + 1
+# Add issue and first matched tag of tweet to tweets-DF
+oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
+tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
+oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
+tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
+# Add information to file for function viewPatternMatching
 write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
 break
 }
@@ -307,7 +232,10 @@ else {
 } # /for issuelist
 } # /for tweets_curday
 } # /for drange
-viewMatchingTweets("2014-01-10", "issue.iraq", id_folder)
+date_start <- as.Date("2014-01-01")
+date_end <- as.Date("2014-12-31")
+drange <- as.integer(date_end - date_start)
+drange <- date_start + days(0:drange)
 # MATCH TWEETS ------------------------------------------------------------
 id_folder <- "matched-ids"
 unlink(id_folder, recursive = TRUE)
@@ -316,6 +244,8 @@ issues <- data.frame(date = drange)
 issuelist <- xmlToList("issues.xml")
 issueheads <- names(issuelist)
 issues[issueheads] <- 0
+tweets$issue <- ""
+tweets$tags <- ""
 for(d in 1:nrow(issues)) {
 # Go through every day
 curdate <- issues$date[d]
@@ -347,8 +277,14 @@ curacro <- FALSE
 # Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
 tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
 if(tags_found == 1) {
-#cat("Matched", curtag, "with", curtext,"\n")
+# Raise number of findings on this day for this issue by 1
 issues[d,curissue] <- issues[d,curissue] + 1
+# Add issue and first matched tag of tweet to tweets-DF
+oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
+tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
+oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
+tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
+# Add information to file for function viewPatternMatching
 write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
 break
 }
@@ -359,32 +295,9 @@ else {
 } # /for issuelist
 } # /for tweets_curday
 } # /for drange
-View(issues)
-viewMatchingTweets("2014-12-18", "issue.edathy", id_folder)
-issues_melt <- melt(issues,id="date")
-ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line(size=1)
-ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE)
-ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line(size=1)
-ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line(size=1)
-ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE)
-viewMatchingTweets("2014-12-18", "issue.conservative", id_folder)
-agrep("christ", "Jungparlamentarier gleich Schriftführerdienst hat", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
-agrep("\\bchrist\\b", "Jungparlamentarier gleich Schriftführerdienst hat", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
-agrep("\\bchrist\\b", "Bla Christ bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
-agrep("\\bchrist\\b", "Bla Christus bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
-agrep("\\bchrist\\b", "Bla Christu bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
-agrep("\\bchrist\\b", "Bla Christus bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
-agrep("\\bchrist\\b", "Bla Christus bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE)
-agrep("\\bchrist\\b", "Bla christus bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE)
-agrep("\\bchrist\\b", "Bla christus bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
-agrep("\\bchrist\\b", "Bla christen bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE)
-agrep("\\bchrist\\b", "Bla Antichrist bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE)
-agrep("\\bchrist\\b", "Bla Christian bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE)
-agrep("\\bchrist\\b", "Bla Christian bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE, value=TRUE)
-agrep("\\bchrist\\b", "Bla Christi bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE, value=TRUE)
-agrep("\\bchrist\\b", "Bla Christi bla", max.distance = list(all = 3), ignore.case = TRUE, fixed = FALSE)
-agrep("\\bIS\\b", "Wir sind bei ISN Network", max.distance = list(all = 0), ignore.case = TRUE, fixed = FALSE)
-agrep("\\bIS\\b", "Wir sind bei ISN Network", max.distance = list(all = 0), ignore.case = F, fixed = FALSE)
+View(tweets)
+View(tweets)
+# MATCH TWEETS ------------------------------------------------------------
 id_folder <- "matched-ids"
 unlink(id_folder, recursive = TRUE)
 dir.create(id_folder)
@@ -392,6 +305,8 @@ issues <- data.frame(date = drange)
 issuelist <- xmlToList("issues.xml")
 issueheads <- names(issuelist)
 issues[issueheads] <- 0
+tweets$issue <- ""
+tweets$tags <- ""
 for(d in 1:nrow(issues)) {
 # Go through every day
 curdate <- issues$date[d]
@@ -423,8 +338,14 @@ curacro <- FALSE
 # Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
 tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
 if(tags_found == 1) {
-#cat("Matched", curtag, "with", curtext,"\n")
+# Raise number of findings on this day for this issue by 1
 issues[d,curissue] <- issues[d,curissue] + 1
+# Add issue and first matched tag of tweet to tweets-DF
+oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
+tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
+oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
+tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
+# Add information to file for function viewPatternMatching
 write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
 break
 }
@@ -435,78 +356,157 @@ else {
 } # /for issuelist
 } # /for tweets_curday
 } # /for drange
-issues_melt <- melt(issues,id="date")
-ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE)
-viewMatchingTweets("2014-12-18", "issue.conservative", id_folder)
-pattern
-agrep("\\bchrist\\b", "RT @christophheyes: Morgen in der Presse: Oppermann - Briefkasten gestohlen! Gabriel - Poesiealbum nicht mehr auffindbar! #edathy #hartmann", max.distance = list(all = 1), ignore.case = TRUE, fixed = FALSE)
-smartPatternMatch
-source("issuecomp-functions.R")
-smartPatternMatch
-# MATCH TWEETS ------------------------------------------------------------
-id_folder <- "matched-ids"
-unlink(id_folder, recursive = TRUE)
-dir.create(id_folder)
-issues <- data.frame(date = drange)
-issuelist <- xmlToList("issues.xml")
-issueheads <- names(issuelist)
-issues[issueheads] <- 0
-for(d in 1:nrow(issues)) {
-# Go through every day
-curdate <- issues$date[d]
-cat(as.character(curdate),"\n")
-# Put all tweets from specific day in a temporary DF
-tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
-for(t in 1:nrow(tweets_curday)){
-# Select tweet's text, make it lowercase and remove hashtag indicators (#)
-curtext <- as.character(tweets_curday$text[t])
-curtext <- str_replace_all(curtext, "#", "")
+View(tweets)
+View(c_errors)
+View(tweets)
+readYN <- function(question) {
+n <- readline(prompt=question)
+n <- as.character(n)
+return(n)
+}
+checkIssue <- function(string, issuelist) {
+status <- any(str_detect(string, issuelist))
+return(status)
+}
+checkAllIssues <- function(string, issuelist) {
+status <- NULL
+for(i in 1:length(string)) {
+if(checkIssue(string[i], issuelist)) {
+status[i] <- TRUE
+}
+else {
+cat("Issue",string[i],"does not exist. Please try again.\n")
+status[i] <- FALSE
+}
+}
+return(status)
+}
+View(tweets)
+View(tweets)
+write.csv(tweets, "tweets.csv")
+save(tweets, file="tweets.RData")
+write.csv(tweets, "tweets.csv")
+save(tweets, file="tweets.RData")
+c_tweets <- read.csv("tweets.csv")
+View(c_tweets)
+c_tweets$X <- NULL
+# Read all issues from XML file
+c_issues <- data.frame(date = drange)
+c_issuelist <- xmlToList("issues.xml")
+c_issueheads <- names(issuelist)
+c_issues[issueheads] <- 0
+source("issuecomp-codingsample-function.R")
+rm(c_err, c_result, c_samid, c_samno,c_samtags,c_samissue,c_samtext,c_yn)
+rm(c_samtag)
+rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,drange,i,id_folder,oldissue,oldtag,s,t,tags_found)
+c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
+View(c_errors)
+names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
+View(c_errors)
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+c_curissue
+c_curtags
+c_errissue
+c_errtags
+c_errid
+delrow <- NULL
+for(r in 1:nrow(tweets)) {
+if(format(tweets$created_at[r], "%Y") != "2014") {
+delrow <- c(delrow, r)
+}
+curtext <- as.character(tweets$text[r])
 curtext <- str_replace_all(curtext, "$", " ")
 curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
-curid <- as.character(tweets_curday$id_str[t])
-# Now test each single issue (not tag!)
-for(i in 1:length(issuelist)) {
-curtags <- as.character(issuelist[[i]])
-curissue <- names(issuelist)[i]
-curfile <- str_c(id_folder,"/",curissue,".csv")
-# Now test all tags of a single issue
-for(s in 1:length(curtags)) {
-curtag <- curtags[s]
-curchars <- nchar(curtag, type = "chars")
-# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
-if(curchars <= 4) {
-curacro <- checkAcronym(string = curtag, chars = curchars)
-} else {
-curacro <- FALSE
 }
-# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
-tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
-if(tags_found == 1) {
-#cat("Matched", curtag, "with", curtext,"\n")
-issues[d,curissue] <- issues[d,curissue] + 1
-write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
-break
+r
+require(stringr)
+View(tweets)
+df <- data.frame(x= c("zeile1","zeile2"))
+View(df)
+df$x[1] <- "blabla"
+View(df)
+df <- data.frame(x= c("zeile1","zeile2"))
+test <- "bla bla"
+df$x[1] <- test
+View(df)
+df$x[1] <- as.character(test)
+class(df$x)
+df$x[1] <- as.factor(test)
+head(tweet)
+head(tweets)
+df <- head(tweets)
+View(df)
+df$text[1] <- "test"
+View(tweets)
+View(df)
+for(r in 1:nrow(tweets)) {
+#   if(format(tweets$created_at[r], "%Y") != "2014") {
+#     delrow <- c(delrow, r)
+#   }
+curtext <- as.character(tweets$text[r])
+curtext <- str_replace_all(curtext, "$", " ")
+curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
+tweets$text[r] <- curtext
 }
-else {
-#cat("Nothing found\n")
+View(tweets)
+View(c_tweets)
+rm(delrow, r)
+save(tweets, file="tweets_untagged.RData")
+row.names(tweets) <- NULL
+write.csv(tweets, "tweets.csv")
+save(tweets, file="tweets.RData")
+c_tweets <- read.csv("tweets.csv")
+c_tweets$X <- NULL
+View(c_tweets)
+viewMatchingTweets
+c_tweets <- read.csv("tweets.csv", colClasses="character")
+c_tweets$X <- NULL
+View(c_tweets)
+View(c_issues)
+c_errtags
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+curissue
+c_curissue
+str_join(c_curissue)
+str_join(c_curissue,collapse = NULL)
+str_join(c_curissue,sep=";",collapse = NULL)
+paste(c_curissue,sep = "")
+paste(c_curissue,sep = '')
+length(paste(c_curissue,sep = ''))
+str_join(c_curissue,sep=";",collapse = "")
+str_join(c_curissue,sep=";",collapse = "w")
+str_join(c_curissue,collapse = ";")
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+source("issuecomp-codingsample-function2.R")
 }
-} # /for curtags
-} # /for issuelist
-} # /for tweets_curday
-} # /for drange
-issues_melt <- melt(issues,id="date")
-ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_smooth(size=1,method="loess",formula = y ~ x, se=FALSE)
-viewMatchingTweets("2014-12-18", "issue.conservative", id_folder)
-viewMatchingTweets("2014-05-18", "issue.conservative", id_folder)
-viewMatchingTweets("2014-05-1", "issue.conservative", id_folder)
-viewMatchingTweets("2014-05-01", "issue.conservative", id_folder)
-viewMatchingTweets("2014-05-02", "issue.conservative", id_folder)
-viewMatchingTweets("2014-05-10", "issue.conservative", id_folder)
-viewMatchingTweets("2014-05-10", "issue.middleeast", id_folder)
-viewMatchingTweets("2014-05-10", "issue.iraw", id_folder)
-viewMatchingTweets("2014-05-10", "issue.iraq", id_folder)
-viewMatchingTweets("2014-08-10", "issue.iraq", id_folder)
-viewMatchingTweets("2014-11-10", "issue.iraq", id_folder)
-viewMatchingTweets("2014-12-10", "issue.iraq", id_folder)
-View(issues)
-viewMatchingTweets("2014-09-19", "issue.control", id_folder)
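
The agrep() calls removed from the history above are how the word-boundary and Levenshtein-budget heuristics behind smartPatternMatch were tuned by hand. Two of those probes, runnable in plain base R, summarize the idea (a sketch; exact hits depend on TRE's approximate matcher):

# Fuzzy tag match: "christ" with word boundaries and an edit budget of 2,
# probing whether inflected forms such as "Christus" still hit.
agrep("\\bchrist\\b", "Bla Christus bla", max.distance = list(all = 2), ignore.case = TRUE, fixed = FALSE)
# Acronym match: zero edit budget and case kept intact, so "IS" does not
# fire inside "ISN"; agrep() returns integer(0) here.
agrep("\\bIS\\b", "Wir sind bei ISN Network", max.distance = list(all = 0), ignore.case = FALSE, fixed = FALSE)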

.gitignore (+3 -1)

@@ -1,4 +1,6 @@
 tweets_complete.csv
-current.txt
+tweets.csv
+tweets_untagged.csv
+tweets_untagged.RData
 .RData
 matched-ids

issuecomp-analysis.R (+22 -3)

@@ -6,6 +6,8 @@ require(stringr)
 
 source("issuecomp-functions.R")
 
+load(file = "tweets_untagged.RData")
+
 # Create date range
 date_start <- as.Date("2014-01-01")
 date_end <- as.Date("2014-12-31")
@@ -23,6 +25,8 @@ issues <- data.frame(date = drange)
 issuelist <- xmlToList("issues.xml")
 issueheads <- names(issuelist)
 issues[issueheads] <- 0
+tweets$issue <- ""
+tweets$tags <- ""
 
 for(d in 1:nrow(issues)) {
   # Go through every day
@@ -36,8 +40,7 @@ for(d in 1:nrow(issues)) {
     # Select tweet's text, make it lowercase and remove hashtag indicators (#)
     curtext <- as.character(tweets_curday$text[t])
     curtext <- str_replace_all(curtext, "#", "")
-    curtext <- str_replace_all(curtext, "$", " ")
-    curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
+
     curid <- as.character(tweets_curday$id_str[t])
 
     # Now test each single issue (not tag!)
@@ -61,8 +64,16 @@ for(d in 1:nrow(issues)) {
         # Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow 2 (Levenshtein distance)
         tags_found <- smartPatternMatch(curtext, curtag, curchars, curacro)
         if(tags_found == 1) {
-          #cat("Matched", curtag, "with", curtext,"\n")
+          # Raise number of findings on this day for this issue by 1
           issues[d,curissue] <- issues[d,curissue] + 1
+
+          # Add issue and first matched tag of tweet to tweets-DF
+          oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
+          tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
+          oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
+          tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
+
+          # Add information to file for function viewPatternMatching
           write(str_c(curdate,";\"",curid,"\";",curtag), curfile, append = TRUE)
           break
         }
@@ -75,11 +86,19 @@ for(d in 1:nrow(issues)) {
   } # /for tweets_curday
 } # /for drange
 
+rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,drange,i,id_folder,oldissue,oldtag,s,t,tags_found)
+
 
+# SAVING ------------------------------------------------------------------
 
 
+row.names(tweets) <- NULL
+write.csv(tweets, "tweets.csv")
+save(tweets, file="tweets.RData")
+
 # VISUALS -----------------------------------------------------------------
 
+
 # Level: days
 issues_melt <- melt(issues,id="date")
 ggplot(issues_melt,aes(x=date,y=value,colour=variable,group=variable)) + geom_line(size=1)
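
The new bookkeeping concatenates every hit onto per-tweet issue and tags strings, each entry terminated by a semicolon. A toy sketch of that append pattern (hypothetical two-row frame; stringr assumed loaded):

require(stringr)
df <- data.frame(id_str = c("1", "2"), issue = "", stringsAsFactors = FALSE)
curid <- "1"
# Read the current value, append the new issue plus separator, write it back:
oldissue <- df[df$id_str == curid, "issue"]
df[df$id_str == curid, "issue"] <- str_c(oldissue, "issue.iraq", ";")
df[df$id_str == curid, "issue"]   # "issue.iraq;" -- a second hit appends again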

issuecomp-codingsample-correct.csv (+2 -0)

@@ -0,0 +1,2 @@
+"443032724505624576",0,"","","RT @Linksfraktion: Sevim Dagdelen: Optionspflicht abschaffen – ohne „Wenn“ und „Aber“ http://t.co/PXVz60RyDa"
+"464039981766684672",0,"","","Hange spricht von einer Kutsche ohne Dach in der wir fahren und hoffen, dass es nicht regne #btADA"

issuecomp-codingsample-error.csv (+3 -0)

@@ -0,0 +1,3 @@
+"532463690013233152",1,"","","RT @cordhos: ! “@cducsubt: Paul Breitner vom @FCBayern - #Bundestag jetzt mit eigenem Fanclub @hahnflo @DoroBaer @dieAlbsteigerin http://t.…"
+"516584367448403968",1,"","","Debate und critics in the parlamentarian assembly of the European Council about the elections in #Turkey @PACE_News @GeziParkii"
+"516624274522918912",1,"","","Nach Bürgergespräch bin nun noch im Ratshof zur Ausstellungseröffnung - Wanderausstellung zum Bundestag."

issuecomp-codingsample-function.R (+39 -0)

@@ -0,0 +1,39 @@
+repeat {
+  c_samno <- sample(1:nrow(c_tweets), 1)
+  c_samtext <- as.character(c_tweets$text[c_samno])
+  c_samissue <- as.character(c_tweets$issue[c_samno])
+  c_samtags <- as.character(c_tweets$tags[c_samno])
+  c_samid <- as.character(c_tweets$id_str[c_samno])
+
+
+  repeat {
+    cat("===================\n\n[TWEET]: ",c_samtext,"\n", "[ISSUES]: ", c_samissue, sep="")
+    c_yn <- readYN("Is the categorization correct AND complete?\nEnter y or n: ")
+
+    # Check if input is correct
+    if(c_yn == "y" || c_yn == "n" || c_yn == "QUIT") {break} else {cat("Wrong input, please enter y or n (or QUIT if you want to exit safely).\n")}
+  }
+
+  # Exit codes:
+  # 0 = Correct tagging
+  # 1 = At least one tag was incorrect
+  # 2 = At least one tag was missing
+  # 3 = Both 1 and 2
+
+  if(c_yn == "y") {
+    c_result <- str_c("\"",c_samid,"\"",",0,","\"",c_samissue,"\",\"",c_samtags,"\",\"",c_samtext,"\"")
+    write(c_result, file = "issuecomp-codingsample-correct.csv", append = T)
+  }
+  else if(c_yn == "n") {
+    repeat {
+      c_err <- readYN("Enter 1 if a tag is incorrect, 2 if a tag is missing, and 3 if a tag is incorrect AND missing: ")
+      if(c_err == "1" || c_err == "2" || c_err == "3") {break} else {cat("Wrong input, please enter 1, 2 or 3.\n")}
+    }
+    c_result <- str_c("\"",c_samid,"\",",c_err,",\"",c_samissue,"\",\"",c_samtags,"\",\"",c_samtext,"\"")
+    write(c_result, file = "issuecomp-codingsample-error.csv", append = T)
+  }
+  else {
+    cat("Quitting now.")
+    break
+  }
+}
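
Each verdict is appended as a hand-assembled CSV row. For a correct sample (exit code 0) the str_c() call builds a line like the ones in issuecomp-codingsample-correct.csv; a sketch with hypothetical values:

require(stringr)
c_samid <- "443032724505624576"; c_samissue <- ""; c_samtags <- ""; c_samtext <- "RT @Linksfraktion: ..."
# Builds: "443032724505624576",0,"","","RT @Linksfraktion: ..."
str_c("\"",c_samid,"\"",",0,","\"",c_samissue,"\",\"",c_samtags,"\",\"",c_samtext,"\"")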

issuecomp-codingsample-function2.R (+58 -0)

@@ -0,0 +1,58 @@
+# A tweet was falsely categorized
+if(c_errcode == "1") {
+  repeat {
+    c_curissue <- readYN("Which issue is incorrect?: ")
+    if(c_curissue == "QUIT") {cat("Quitting this item without changes or entries.\n"); break}
+    c_curissue <- unlist(str_split(c_curissue, ";"))
+
+    status <- checkAllIssues(c_curissue, c_issueheads)
+
+    # Only continue if every given issue really exists (all "status" have to be TRUE)
+    if(all(status)) {
+      # Revert str_split
+      c_curissue <- str_join(c_curissue,collapse = ";")
+
+      # <ID>,<all issues>,<faulty issue(s),<all tags>,<tweet text>
+      c_result <- str_c("\"",c_errid,"\",\"",c_errissue,"\",\"",c_curissue,"\",\"",c_errtags,"\",\"",c_errtext,"\"")
+      write(c_result, file = "issuecomp-codingsample-error1.csv", append = T)
+      break
+    }
+  }
+
+# A tweet should be categorized with an additional issue
+} else if(c_errcode == "2") {
+  repeat {
+    c_curissue <- readYN("Which issue is missing?: ")
+    if(c_curissue == "QUIT") {cat("Quitting this item without changes or entries.\n"); break}
+    c_curissue <- unlist(str_split(c_curissue, ";"))
+
+    status <- checkAllIssues(c_curissue, c_issueheads)
+
+    # Only continue if every given issue really exists (all "status" have to be TRUE)
+    if(all(status)) {
+      # Revert str_split
+      c_curissue <- str_join(c_curissue,collapse = ";")
+
+      # <ID>,<all issues>,<faulty issue(s),<all tags>,<tweet text>
+      c_result <- str_c("\"",c_errid,"\",\"",c_errissue,"\",\"",c_curissue,"\",\"",c_errtags,"\",\"",c_errtext,"\"")
+      write(c_result, file = "issuecomp-codingsample-error2.csv", append = T)
+      break
+    }
+  }
+
+# There is an issue missing AND a issue was wrong
+} else if(c_errcode == "3") {
+  #cat("Which issue is incorrect and which one is missing?\n")
+  repeat {
+    c_tag <- readYN("Which issue is incorrect?: ")
+    c_tag <- unlist(str_split(c_tag, ";"))
+    for(i in 1:length(c_tag)) {
+      if(checkIssue(c_tag[i], c_issueheads)) {} else {cat("Issue",c_tag[i],"does not exist. Please try again.\n")}
+    }
+  }
+
+
+# If this triggers the hell freezes...
+} else {
+  cat("Neither 1, 2 or 3 as error code...")
+}
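
Note that the c_errcode == "3" branch above contains no break and writes no result, so its repeat loop cannot terminate. A sketch of a terminating variant following the same pattern as branches 1 and 2 (the stubs and the error3.csv file name are assumptions for illustration, not part of the commit):

require(stringr)
# Stand-ins so the sketch runs outside the project (assumptions):
readYN <- function(q) as.character(readline(prompt = q))
checkAllIssues <- function(s, l) s %in% l   # simplified stub
c_issueheads <- c("issue.iraq", "issue.edathy")
c_errid <- "1"; c_errissue <- ""; c_errtags <- ""; c_errtext <- "..."
repeat {
  c_curissue <- readYN("Which issue is incorrect AND which is missing?: ")
  if(c_curissue == "QUIT") {cat("Quitting this item without changes or entries.\n"); break}
  c_curissue <- unlist(str_split(c_curissue, ";"))
  status <- checkAllIssues(c_curissue, c_issueheads)
  if(all(status)) {
    c_curissue <- str_c(c_curissue, collapse = ";")
    c_result <- str_c("\"",c_errid,"\",\"",c_errissue,"\",\"",c_curissue,"\",\"",c_errtags,"\",\"",c_errtext,"\"")
    write(c_result, file = "issuecomp-codingsample-error3.csv", append = TRUE)  # assumed file name
    break
  }
}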

issuecomp-codingsample.R (+75 -0)

@@ -0,0 +1,75 @@
+require(stringr)
+require(XML)
+
+# FUNCTIONS ---------------------------------------------------------------
+
+readYN <- function(question) {
+  n <- readline(prompt=question)
+  n <- as.character(n)
+  return(n)
+}
+
+checkIssue <- function(string, issuelist) {
+  status <- any(str_detect(string, issuelist))
+  return(status)
+}
+
+checkAllIssues <- function(string, issuelist) {
+  status <- NULL
+  for(i in 1:length(string)) {
+    if(checkIssue(string[i], issuelist)) {
+      status[i] <- TRUE
+    }
+    else {
+      cat("Issue",string[i],"does not exist. Please try again.\n")
+      status[i] <- FALSE
+    }
+  }
+  return(status)
+}
+
+
+# SAMPLE OUT/INPUT --------------------------------------------------------
+
+
+# Read CSV of all tweets (with tags, if available)
+c_tweets <- read.csv("tweets.csv", colClasses="character")
+c_tweets$X <- NULL
+
+# Read all issues from XML file
+c_issues <- data.frame(date = drange)
+c_issuelist <- xmlToList("issues.xml")
+c_issueheads <- names(issuelist)
+c_issues[issueheads] <- 0
+
+
+# Run through as many tweets as wished to mark them as correct or incorrect
+source("issuecomp-codingsample-function.R")
+rm(c_err, c_result, c_samid, c_samno,c_samtags,c_samissue,c_samtext,c_yn)
+
+
+# Now go through tweets/tags marked as false
+
+# Exit codes:
+# 0 = Correct tagging
+# 1 = At least one tag was incorrect
+# 2 = At least one tag was missing
+# 3 = Both 1 and 2
+
+c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
+names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
+
+for(r in 1:nrow(c_errors)) {
+  c_errcode <- as.character(c_errors$code[r])
+  c_errissue <- as.character(c_errors$issue[r])
+  c_errtags <- as.character(c_errors$tags[r])
+  c_errtext <- as.character(c_errors$text[r])
+  c_errid <- as.character(c_errors$str_id[r])
+
+  cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+  source("issuecomp-codingsample-function2.R")
+}
+
+
+
+
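
With the file above sourced, checkAllIssues() vectorizes checkIssue() over a character vector of issue names and flags unknown ones; the deleted .Rhistory shows the intended use, e.g.:

checkAllIssues(c("wdjqaow", "issue.edathy"), c_issueheads)
# prints:  Issue wdjqaow does not exist. Please try again.
# returns: FALSE TRUE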

issuecomp-scraping.R (+6 -3)

@@ -176,19 +176,22 @@ tweets <- tweets[order(tweets$created_at), ]
 
 # Finally delete every tweet not from 2014 (2013 or 2015)
 delrow <- NULL
-pb <- txtProgressBar(min = 0, max = nrow(tweets), style = 3)
 for(r in 1:nrow(tweets)) {
   if(format(tweets$created_at[r], "%Y") != "2014") {
     delrow <- c(delrow, r)
   }
-  setTxtProgressBar(pb, r)
+  curtext <- as.character(tweets$text[r])
+  curtext <- str_replace_all(curtext, "$", " ")
+  curtext <- str_replace_all(curtext, "http://.+?\\s", "URL ")
+  tweets$text[r] <- curtext
 }
 tweets <- tweets[-delrow, ]
 rm(delrow, r)
 
+
 # Convert dates to omit (unnecessary) time
 tweets$created_at <- format(tweets$created_at, "%Y-%m-%d")
 
-save(tweets, file="tweets.RData")
+save(tweets, file="tweets_untagged.RData")
 
 
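
The URL normalization that replaces the progress bar here is the pair of str_replace_all calls moved out of issuecomp-analysis.R: padding the text with a trailing space (zero-width regex "$") ensures a URL at the very end of a tweet is still followed by \s and gets rewritten. A quick check, adapted from the throwaway string in the .Rhistory:

require(stringr)
curtext <- "test http://google.de haha http://nsa.gov"
curtext <- str_replace_all(curtext, "$", " ")      # append a trailing space
str_replace_all(curtext, "http://.+?\\s", "URL ")
# "test URL haha URL "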

issues.xml (+1 -1)

@@ -87,7 +87,7 @@
     </issue.conservative>
 
     <issue.control>
-        <tag>pillepalle</tag>
+        <tag>der</tag>
         <tag>schundluder</tag>
         <tag>whatthefuck</tag>
     </issue.control>

tweets.RData (binary)

