Current status

mxmehl committed 5 years ago
parent commit 9bbf1b4f56

.Rhistory (+506 −236)

@@ -1,242 +1,512 @@
-require(lubridate)
-require(XML)
-require(ggplot2)
-require(reshape2)
-require(stringr)
-library(foreach)
-library(doParallel)
-source("issuecomp-functions.R")
-getwd()
-setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
-getwd()
-list.files()
-list.files("matched-ids/")
-load(file = "tweets_untagged.RData")
-issues <- data.frame(date = drange)
-# Create date range
-date_start <- as.Date("2014-01-01")
-date_end <- as.Date("2014-12-31")
-drange <- as.integer(date_end - date_start)
-drange <- date_start + days(0:drange)
-issues <- data.frame(date = drange)
-issuelist <- readLines("issues.xml")
-issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
-issuelist <- xmlToList(issuelist)
-issueheads <- names(issuelist)
-issues[issueheads] <- 0
-tweets$issue <- ""
-tweets$tags <- ""
-View(issues)
-list.files("matched-ids/")
-results <- list.files("matched-ids/")
-results
-read.csv("matched-ids/i10.trans.csv")
-read.csv("matched-ids/i10.trans.csv", sep=";")
-read.csv("matched-ids/i10.trans.csv", sep=";", stringsAsFactors=F)
-read.csv("matched-ids/i10.trans.csv", sep=";", stringsAsFactors=T)
-reesult_files <- read.csv("matched-ids/i10.trans.csv", sep=";", stringsAsFactors=F)
-View(reesult_files)
-result_files <- read.csv("matched-ids/i10.trans.csv", sep=";", colClasses=c("date", "character", "character", "character"))
-result_files <- read.csv("matched-ids/i10.trans.csv", sep=";", colClasses=c("character", "character", "character", "character"))
-rm(reesult_files)
-View(result_files)
-nrow(result_files)
-result_files <- result_files(!duplicated(result_files))
-result_files <- result_files[!duplicated(result_files)]
-result_files <- result_files[!duplicated(result_files), ]
-nrow(result_files)
-result_files <- read.csv("matched-ids/i10.trans.csv", sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-View(result_files)
-read.results
-results
-setwd("matched-ids/")
-list.files("")
-getwd()
-list.files()
-results <- list.files()
-results
-results_cat <- read.csv(results, sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-results_cat <- read.csv(results[1], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-results_cat
-View(results_cat)
-source("issuecomp-functions.R")
-setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
-source("issuecomp-functions.R")
-insertRow
-results_temp <- read.csv(results[2], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-setwd("matched-ids/")
-results_temp <- read.csv(results[2], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-rm(result_files)
-insertRow(existingDF = results_cat, results_temp)
-rm(results_cat)
-for(r in 1:length(results)) {
-if(r == 1) {
-results_cat <- read.csv(results[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-} else {
-results_temp <- read.csv(results[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-insertRow(results_cat, results_temp)
-}
-}
-for(r in 1:length(results)) {
-if(r == 1) {
-results_cat <- read.csv(results[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-} else {
-results_temp <- read.csv(results[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-results_cat insertRow(results_cat, results_temp)
-}
-}
-for(r in 1:length(results)) {
-if(r == 1) {
-results_cat <- read.csv(results[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-} else {
-results_temp <- read.csv(results[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-results_cat <- insertRow(results_cat, results_temp)
-}
-}
-View(results_cat)
-results_cat[20000]
-results_cat[20000, ]
-rm(r, results_temp)
-results_cat <- results_cat[!duplicated(results_cat), ]
-View(results_cat)
-rm(results, results_cat)
-results_files <- list.files()
-for(r in 1:length(results)) {
-if(r == 1) {
-results <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-} else {
-results_temp <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-results <- insertRow(results_cat, results_temp)
-}
-}
-rm(r, results_temp)
-results <- results[!duplicated(results), ]
-results_files <- list.files()
-for(r in 1:length(results_files)) {
-if(r == 1) {
-results <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-} else {
-results_temp <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-results <- insertRow(results, results_temp)
-}
-}
-rm(r, results_temp)
-results <- results[!duplicated(results), ]
-View(results)
-View(issues)
-row.names(results) <- NULL
-View(results)
-rownames(results)
-row.names(results)
-names(results)
-View(tweets)
-View(tweets)
-names(results) <- c("date", "id_str", "issue", "tags")
-View(results)
-results_test <- results[order(results$id_str)]
-results_test <- results[order(results$id_str), ]
-View(results_test)
-results_files <- list.files()
-for(r in 1:length(results_files)) {
-if(r == 1) {
-results <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-} else {
-results_temp <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
-results <- insertRow(results, results_temp)
-}
-}
-rm(r, results_temp)
-rm(r, results_temp, results_files)
-results <- results[!duplicated(results), ]
-names(results)
-names(results) <- c("date", "id_str", "issue", "tags")
-View(results)
-results_test <- results[order(results$id_str), ]
-row.names(results) <- NULL
-results <- results[order(results$id_str), ]
-row.names(results) <- NULL
-View(results)
-rm(results_test)
-View(issues)
-as.character(results$date[2])
-class(results$date)
-class(issues$date)
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+curacro <- checkAcronym(string = curtag)
+# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
+if(str_detect(curtag, "^#")) {
+curacro <- FALSE
+curhash <- TRUE
+curtag <- str_replace(curtag, "#", "")
+curchars <- curchars - 1
+} else {
+curhash <- FALSE
+}
+# Now expand the current tag by possible suffixes that may be plural forms
+# Only do if it isn't an acronym or specific hastag
+if(!curacro && !curhash) {
+for(e in 1:length(tagexpand)) {
+curtag[e] <- str_c(curtag[1], tagexpand[e])
+}
+}
+##############
+if(curchars <= 4 || curacro || curhash) {
+cat("distance 0\n")
+} else {
+cat("distance 1\n")
+}
+curtag <- "EURATOM"
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+curacro <- checkAcronym(string = curtag)
+# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
+if(str_detect(curtag, "^#")) {
+curacro <- FALSE
+curhash <- TRUE
+curtag <- str_replace(curtag, "#", "")
+curchars <- curchars - 1
+} else {
+curhash <- FALSE
+}
+# Now expand the current tag by possible suffixes that may be plural forms
+# Only do if it isn't an acronym or specific hastag
+if(!curacro && !curhash) {
+for(e in 1:length(tagexpand)) {
+curtag[e] <- str_c(curtag[1], tagexpand[e])
+}
+}
+##############
+if(curchars <= 4 || curacro || curhash) {
+cat("distance 0\n")
+} else {
+cat("distance 1\n")
+}
+curtag <- "Energiewende"
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+curacro <- checkAcronym(string = curtag)
+# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
+if(str_detect(curtag, "^#")) {
+curacro <- FALSE
+curhash <- TRUE
+curtag <- str_replace(curtag, "#", "")
+curchars <- curchars - 1
+} else {
+curhash <- FALSE
+}
+# Now expand the current tag by possible suffixes that may be plural forms
+# Only do if it isn't an acronym or specific hastag
+if(!curacro && !curhash) {
+for(e in 1:length(tagexpand)) {
+curtag[e] <- str_c(curtag[1], tagexpand[e])
+}
+}
+##############
+if(curchars <= 4 || curacro || curhash) {
+cat("distance 0\n")
+} else {
+cat("distance 1\n")
+}
+curtag <- "bnd"
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+curacro <- checkAcronym(string = curtag)
+# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
+if(str_detect(curtag, "^#")) {
+curacro <- FALSE
+curhash <- TRUE
+curtag <- str_replace(curtag, "#", "")
+curchars <- curchars - 1
+} else {
+curhash <- FALSE
+}
+# Now expand the current tag by possible suffixes that may be plural forms
+# Only do if it isn't an acronym or specific hastag
+if(!curacro && !curhash) {
+for(e in 1:length(tagexpand)) {
+curtag[e] <- str_c(curtag[1], tagexpand[e])
+}
+}
+##############
+if(curchars <= 4 || curacro || curhash) {
+cat("distance 0\n")
+} else {
+cat("distance 1\n")
+}
+curtag <- "#WM"
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+curacro <- checkAcronym(string = curtag)
+# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
+if(str_detect(curtag, "^#")) {
+curacro <- FALSE
+curhash <- TRUE
+curtag <- str_replace(curtag, "#", "")
+curchars <- curchars - 1
+} else {
+curhash <- FALSE
+}
+# Now expand the current tag by possible suffixes that may be plural forms
+# Only do if it isn't an acronym or specific hastag
+if(!curacro && !curhash) {
+for(e in 1:length(tagexpand)) {
+curtag[e] <- str_c(curtag[1], tagexpand[e])
+}
+}
+##############
+if(curchars <= 4 || curacro || curhash) {
+cat("distance 0\n")
+} else {
+cat("distance 1\n")
+}
+curtag
+curtag <- "Energiewende"
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+curacro <- checkAcronym(string = curtag)
+# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
+if(str_detect(curtag, "^#")) {
+curacro <- FALSE
+curhash <- TRUE
+curtag <- str_replace(curtag, "#", "")
+curchars <- curchars - 1
+} else {
+curhash <- FALSE
+}
+# Now expand the current tag by possible suffixes that may be plural forms
+# Only do if it isn't an acronym or specific hastag
+if(!curacro && !curhash) {
+for(e in 1:length(tagexpand)) {
+curtag[e] <- str_c(curtag[1], tagexpand[e])
+}
+}
+##############
+if(curchars <= 4 || curacro || curhash) {
+cat("distance 0\n")
+} else {
+cat("distance 1\n")
+}
+curtag <- "Energiewende"
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+curacro <- checkAcronym(string = curtag)
+# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
+if(str_detect(curtag, "^#")) {
+curacro <- FALSE
+curhash <- TRUE
+curtag <- str_replace(curtag, "#", "")
+curchars <- curchars - 1
+} else {
+curhash <- FALSE
+}
+# Now expand the current tag by possible suffixes that may be plural forms
+# Only do if it isn't an acronym or specific hastag
+if(!curacro && !curhash) {
+for(e in 1:length(tagexpand)) {
+curtag[e] <- str_c(curtag[1], tagexpand[e])
+}
+}
+# Set Levenshtein distance depending on char length, acronym and hashtag status
+if(curchars <= 4 || curacro || curhash) {
+curdistance <- 0
+} else {
+curdistance <- 1
+}
+curtag
+smartPatternMatch("Die Energiewende ist toll!", curtag, curdistance, curacro)
+smartPatternMatch("Die Energiewende ist toll!", curtag[1], curdistance, curacro)
+smartPatternMatch("Die Energiewende ist toll!", curtag[2], curdistance, curacro)
+smartPatternMatch("Die Energiewende ist toll!", sprintf("%s", curtag), curdistance, curacro)
+tags_found <- NULL
+# Match the tweet with each variation of tagexpand
+for(e in 1:length(curtag)) {
+tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
+}
+curtext <- "Die Energiewende ist toll!"
+tags_found <- NULL
+# Match the tweet with each variation of tagexpand
+for(e in 1:length(curtag)) {
+tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
+}
+tags_found
+curtag
+curtag <- "#WM2014"
+curtext <- "Ich freu mich auf wm2014 sehr"
+curchars <- nchar(curtag, type = "chars")
+# Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+curacro <- checkAcronym(string = curtag)
+# Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
+if(str_detect(curtag, "^#")) {
+curacro <- FALSE
+curhash <- TRUE
+curtag <- str_replace(curtag, "#", "")
+curchars <- curchars - 1
+} else {
+curhash <- FALSE
+}
+# Now expand the current tag by possible suffixes that may be plural forms
+# Only do if it isn't an acronym or specific hastag
+if(!curacro && !curhash) {
+for(e in 1:length(tagexpand)) {
+curtag[e] <- str_c(curtag[1], tagexpand[e])
+}
+}
+# Set Levenshtein distance depending on char length, acronym and hashtag status
+if(curchars <= 4 || curacro || curhash) {
+curdistance <- 0
+} else {
+curdistance <- 1
+}
+# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance)
+tags_found <- NULL
+# Match the tweet with each variation of tagexpand
+for(e in 1:length(curtag)) {
+tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
+}
+tags_found <- any(tags_found)
+tags_found
+curtag
+curtext
+curdistance
+test <- VAR(issues[,2:32], p=3, type=c("const", "trend", "both", "none"), season=NULL, exogen = NULL, lag.max = NULL, ic = c("AIC", "HQ", "SC", "FPE"))
+test
+test <- VAR(issues[,2:32], p=1, type="none")
+capture.output(print(summary(test), prmsd=TRUE, digits=1), file="out.txt")
 View(issues)
-as.character(issues$date[2])
-issues$date[2]
-issuelist <- readLines("issues.xml")
-issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
-issuelist <- xmlToList(issuelist)
-issueheads <- names(issuelist)
-require(lubridate)
-require(XML)
-require(ggplot2)
-require(reshape2)
+test <- VAR(issues_i[,2:22], p=1, type="none", exogen = issues_s[,2])
+test <- VAR(issues_i[,2:22], p=1, type="none", exogen = issues_s[,2:3])
+capture.output(print(summary(test), prmsd=TRUE, digits=1), file="out.txt")
+irf(test)
+test <- VAR(issues_s[,2:11], p=1, type="none")
+irf(test)
+plot(irf(test))
+test <- VAR(issues[,2:32], p=1, type="none")
+plot(irf(test, impulse = names(issues_s[2:11]), response = names(issues_i[2:22])))
+plot(irf(test, impulse = names(issues_s[2:11]), response = names(issues_i[2:22]), n.ahead = 5))
 require(stringr)
-library(foreach)
-library(doParallel)
-issuelist <- readLines("issues.xml")
-issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
-issuelist <- xmlToList(issuelist)
-issueheads <- names(issuelist)
-setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
-issuelist <- readLines("issues.xml")
-issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
-issuelist <- xmlToList(issuelist)
-issueheads <- names(issuelist)
-issues[issueheads] <- 0
-curdate <- as.character(results$date[3])
-curissue <- as.character(results$issue[3])
-curdate
-curissue
-issues[curdate, curissue] <- issues[curdate, curissue] + 1
-View(issues)
-issues <- data.frame(date = drange)
-issues[issueheads] <- 0
-View(issues)
-issues[issues[, "date"] == curdate, curissue] <- issues[issues[, "date"] == curdate, curissue] + 1
-View(issues)
-for(r in 1:nrow(results)) {
-curdate <- as.character(results$date[r])
-curissue <- as.character(results$issue[r])
-issues[issues[, "date"] == curdate, curissue] <- issues[issues[, "date"] == curdate, curissue] + 1
+require(XML)
+readYN <- function(question) {
+n <- readline(prompt=question)
+n <- as.character(n)
+return(n)
 }
-View(issues)
-issues[issueheads] <- 0
-View(issues)
-for(r in 1:nrow(results)) {
-curdate <- as.character(results$date[r])
-curid <- as.character(results$id_str[r])
-curissue <- as.character(results$issue[r])
-curtag <- as.character(results$tags[r])
-# Update issue counter (date and issue)
-issues[issues[, "date"] == curdate, curissue] <- issues[issues[, "date"] == curdate, curissue] + 1
-# Update tweet dataframe (id, issue and tags)
-oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
-tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ",")
-oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
-tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ",")
+checkIssue <- function(string, issuelist) {
+status <- any(str_detect(string, issuelist))
+return(status)
 }
-View(tweets)
-tweets$issue <- ""
-tweets$tags <- ""
-View(tweets)
-issues[issueheads] <- 0
-for(r in 1:nrow(results)) {
-curdate <- as.character(results$date[r])
-curid <- as.character(results$id_str[r])
-curissue <- as.character(results$issue[r])
-curtag <- as.character(results$tags[r])
-cat("Sorting match", r, "from", nrow(results), "\n")
-# Update issue counter (date and issue)
-issues[issues[, "date"] == curdate, curissue] <- issues[issues[, "date"] == curdate, curissue] + 1
-# Update tweet dataframe (id, issue and tags)
-oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
-tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ",")
-oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
-tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ",")
+checkAllIssues <- function(string, issuelist) {
+status <- NULL
+for(i in 1:length(string)) {
+if(checkIssue(string[i], issuelist)) {
+status[i] <- TRUE
+}
+else {
+cat("Issue",string[i],"does not exist. Please try again.\n")
+status[i] <- FALSE
+}
+}
+return(status)
 }
-View(issues)
-View(tweets)
 View(tweets)
-save(tweets, file="tweets_tagged.RData")
+write.csv(tweets, file="tweets.csv")
+c_tweets <- read.csv("tweets.csv", colClasses="character")
+View(c_tweets)
+c_tweets$X <- NULL
+c_issues <- data.frame(date = drange)
+c_issuelist <- xmlToList("issues.xml")
+c_issueheads <- names(issuelist)
+c_issues[issueheads] <- 0
+source("issuecomp-codingsample-function.R")
+rm(c_err, c_result, c_samid, c_samno,c_samtags,c_samissue,c_samtext,c_yn)
+c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
+View(c_errors)
+names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
+View(c_errors)
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
+names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+tagexpand
+source("issuecomp-codingsample-function.R")
+source("issuecomp-codingsample-function.R")
+source("issuecomp-codingsample-function.R")
+c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
+names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+c_tmp <- read.csv("issuecomp-codingsample-error1.csv", header = F, colClasses="character")
+View(c_tmp)
+View(c_errors)
+View(c_tmp)
+names(c_tmp) <- c("str_id", "all", "wrong", "tags", "text")
+View(c_tmp)
+c_tmp[, c("wrong", "tagged", "all", "text")]
+View(c_tmp)
+names(c_tmp) <- c("str_id", "all", "wrong", "tagged", "text")
+c_tmp[, c("wrong", "tagged", "all", "text")]
+c_error1 <- c_tmp[, c("wrong", "tagged", "all", "text")]
+View(c_error1)
+c_tmp <- read.csv("issuecomp-codingsample-error2.csv", header = F, colClasses="character")
+View(c_tmp)
+c_tmp <- read.csv("issuecomp-codingsample-error2.csv", header = F, colClasses="character")
+names(c_tmp) <- c("str_id", "all", "missing", "tagged", "text")
+c_error1 <- c_tmp[, c("missing", "tagged", "all", "text")]
+c_error2 <- c_tmp[, c("missing", "tagged", "all", "text")]
+View(c_error2)
+c_error2 <- c_tmp[, c("missing", "text", "tagged", "all")]
+View(c_error2)
+View(c_error1)
+View(c_error2)
+c_tmp <- read.csv("issuecomp-codingsample-correct.csv", header = F, colClasses="character")
+View(c_tmp)
+names(c_tmp) <- c("str_id", "status", "issue", "tags", "text")
+View(c_tmp)
+c_currect <- c_tmp
+c_correct <- c_tmp
+rm(c_currect)
+View(c_correct)
+source("issuecomp-codingsample-function.R")
+rm(c_err, c_result, c_samid, c_samno,c_samtags,c_samissue,c_samtext,c_yn)
+c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
+c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
+c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
+c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
+c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character", quote = "")
+View(c_errors)
+c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
+test <- "Zitat "total dämlich!""
+tweets$id_str == "523512815425175552"
+tweets[tweets$id_str == "523512815425175552"]
+tweets[tweets$id_str == "523512815425175552", ]
+tweets[tweets$id_str == "523512815425175552", "text"]
+test <- tweets[tweets$id_str == "523512815425175552", "text"]
+test
+test <- c_tweets[ctweets$id_str == "523512815425175552", "text"]
+test <- c_tweets[c_tweets$id_str == "523512815425175552", "text"]
+test
+str_replace(test, "\\"", ")
+str_replace(test, "\\"", "")
+str_replace(test, "\"", "")
+str_detect(test, "\"")
+test <- as.character(c_tweets[c_tweets$id_str == "523512815425175552", "text"])
+test
+c_tweets <- read.csv("tweets.csv", colClasses="character")
+for(r in 1:nrow(c_tweets)) {
+curtext <- as.character(c_tweets$text[r])
+if(str_detect(curtext, "\"") {
+c_tweets$text[r] <- str_replace(curtext, "\"", "")
+}
+}
+for(r in 1:nrow(c_tweets)) {
+curtext <- as.character(c_tweets$text[r])
+if(str_detect(curtext, "\"") {
+c_tweets$text[r] <- str_replace(curtext, "\"", "")
+} else {}
+}
+for(r in 1:nrow(c_tweets)) {
+curtext <- as.character(c_tweets$text[r])
+if(str_detect(curtext, "\"") {
+c_tweets$text[r] <- str_replace(curtext, "\"", "")
+} else {
+}
+}
+for(r in 1:nrow(c_tweets)) {
+curtext <- as.character(c_tweets$text[r])
+if(str_detect(curtext, "\"")) {
+c_tweets$text[r] <- str_replace(curtext, "\"", "")
+}
+}
+test <- as.character(c_tweets[c_tweets$id_str == "523512815425175552", "text"])
+test
+View(c_tweets)
+c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
+View(c_errors)
+names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
+View(c_errors)
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+issueheads
+for(r in 1:nrow(c_errors)) {
+c_errcode <- as.character(c_errors$code[r])
+c_errissue <- as.character(c_errors$issue[r])
+c_errtags <- as.character(c_errors$tags[r])
+c_errtext <- as.character(c_errors$text[r])
+c_errid <- as.character(c_errors$str_id[r])
+cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
+source("issuecomp-codingsample-function2.R")
+}
+# All tweets with WRONG ISSUES
+c_tmp <- read.csv("issuecomp-codingsample-error1.csv", header = F, colClasses="character")
+names(c_tmp) <- c("str_id", "all", "wrong", "tagged", "text")
+c_error1 <- c_tmp[, c("wrong", "tagged", "all", "text")]
+# All tweets with MISSING ISSUES
+c_tmp <- read.csv("issuecomp-codingsample-error2.csv", header = F, colClasses="character")
+names(c_tmp) <- c("str_id", "all", "missing", "tagged", "text")
+c_error2 <- c_tmp[, c("missing", "text", "tagged", "all")]
+# All CORRECT tweets
+c_tmp <- read.csv("issuecomp-codingsample-correct.csv", header = F, colClasses="character")
+names(c_tmp) <- c("str_id", "status", "issue", "tags", "text")
+c_correct <- c_tmp
+View(c_error1)
+View(c_error2)
+View(c_error1)
+View(c_correct)
+test <- VAR(issues_i[,2:22], p=1, type="none", exogen = issues_s[,2:3])
+plot(irf(test, impulse = names(issues_s[2:11]), response = names(issues_i[2:22])))
+test <- VAR(issues[,2:32], p=1, type="none")
+plot(irf(test, impulse = names(issues_s[2:11]), response = names(issues_i[2:22])))
+VARselect(issues[,2:32], lag.max=8, type="none")
+VARselect(issues[,2:32], lag.max=8, type="both")
+VARselect(issues[,2:32], lag.max=30, type="both")
+VARselect(issues[,2:32], lag.max=15, type="both")
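The history above rehearses one and the same tag-preparation routine by hand for a series of test tags ("EURATOM", "Energiewende", "bnd", "#WM", "#WM2014"). Collected into a single helper, the logic being exercised looks like this (a sketch only: prepareTag is a hypothetical name, and checkAcronym/smartPatternMatch come from issuecomp-functions.R, which this commit does not show):

require(stringr)

tagexpand <- c("", "s", "n", "en", "er", "e")

prepareTag <- function(curtag) {
  curchars <- nchar(curtag, type = "chars")
  curacro <- checkAcronym(string = curtag)   # acronyms are matched case-sensitively
  if(str_detect(curtag, "^#")) {
    curacro <- FALSE    # specific hashtags stay case-insensitive ...
    curhash <- TRUE     # ... but are never suffix-expanded
    curtag <- str_replace(curtag, "#", "")
    curchars <- curchars - 1
  } else {
    curhash <- FALSE
  }
  if(!curacro && !curhash) {
    # Expand the base tag by possible plural/genitive suffixes; curtag becomes a vector
    for(e in 1:length(tagexpand)) {
      curtag[e] <- str_c(curtag[1], tagexpand[e])
    }
  }
  # Short tags, acronyms and hashtags must match exactly; note the threshold is
  # still 4 in the history above, while issuecomp-2-analysis.R raises it to 6
  curdistance <- if(curchars <= 4 || curacro || curhash) 0 else 1
  list(variants = curtag, distance = curdistance, acro = curacro)
}

p <- prepareTag("#WM2014")
any(sapply(p$variants, function(v) smartPatternMatch("Ich freu mich auf wm2014 sehr", v, p$distance, p$acro)))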

.gitignore (+3 −2)

@@ -1,7 +1,8 @@
 tweets_complete.csv
 tweets.csv
-tweets_untagged.csv
-tweets_untagged.RData
 .RData
 matched-ids
 issuecomp-analysis.log
+issuecomp-codingsample-correct.csv
+issuecomp-codingsample-error.csv
+issuecomp-codingsample-error2.csv

issuecomp-2-analysis-EXT.R (+138 −0)

@@ -0,0 +1,138 @@
+require(lubridate)
+require(XML)
+require(stringr)
+require(foreach)
+require(doParallel)
+
+source("issuecomp-functions.R")
+setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
+
+
+load(file = "tweets_untagged.RData")
+
+# Create date range
+date_start <- as.Date("2014-01-01")
+date_end <- as.Date("2014-12-31")
+drange <- as.integer(date_end - date_start)
+drange <- date_start + days(0:drange)
+
+# Import issues and prepare everything
+# Will only be filled after the large categorisation loop
+issues <- data.frame(date = drange)
+issuelist <- readLines("issues-v2.xml")
+issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
+issuelist <- xmlToList(issuelist)
+issueheads <- names(issuelist)
+issues[issueheads] <- 0
+tweets$issue <- ""
+tweets$tags <- ""
+
+
+# MATCH TWEETS ------------------------------------------------------------
+
+# Create folder where all results will be saved (saver for backup and import)
+id_folder <- "matched-ids"
+unlink(id_folder, recursive = TRUE)
+dir.create(id_folder)
+
+# Tag expansion for plural, genetiv etc
+tagexpand <- c("", "s", "n", "en", "er", "e")
+
+# Parameters for parallelisation
+writeLines(c(""), "issuecomp-analysis.log")
+cl<-makeCluster(7)
+registerDoParallel(cl)
+
+# START CAT LOOP
+foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
+  # Go through every day
+  curdate <- issues$date[d]
+  cat(paste(as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
+  
+  # Put all tweets from specific day in a temporary DF
+  tweets_curday <- tweets[tweets[, "created_at"] == curdate, ]
+  
+  for(t in 1:nrow(tweets_curday)){
+    #     cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
+    # Select tweet's text, make it lowercase and remove hashtag indicators (#)
+    curtext <- as.character(tweets_curday$text[t])
+    curtext <- str_replace_all(curtext, "#", "")
+    
+    curid <- as.character(tweets_curday$id_str[t])
+    
+    # Now test each single issue (not tag!)
+    for(i in 1:length(issueheads)) {
+      curissue <- issueheads[i]
+      curtags <- as.character(issuelist[[curissue]])
+      curfile <- str_c(id_folder,"/",curissue,".csv")
+      
+      # Now test all tags of a single issue
+      for(s in 1:length(curtags)) {
+        curtag <- curtags[s]
+        curchars <- nchar(curtag, type = "chars")
+        
+        # Check if tag is an acronym. If so, ignore.case will be deactivated in smartPatternMatch
+        curacro <- checkAcronym(string = curtag)
+        # Check if tag is some kind of specific hashtag. If so, do not handle as acronym, but don't expand it either
+        if(str_detect(curtag, "^#")) {
+          curacro <- FALSE   # hashtags like #WM2014 are also written as #wm2014, so we need case-insensitivity
+          curhash <- TRUE    # But we need to mark it as hashtag, so it doesn't get extended or Levenshtein distance > 0
+          curtag <- str_replace(curtag, "#", "")
+          curchars <- curchars - 1
+        } else {
+          curhash <- FALSE
+        }
+        
+        # Now expand the current tag by possible suffixes that may be plural forms
+        # Only do if it isn't an acronym or specific hastag
+        if(!curacro && !curhash) {
+          for(e in 1:length(tagexpand)) {
+            curtag[e] <- str_c(curtag[1], tagexpand[e])
+          }
+        }
+        
+        # Set Levenshtein distance depending on char length, acronym and hashtag status
+        if(curchars <= 6 || curacro || curhash) { # Distance = 1 if 7 chars or longer
+          curdistance <- 0
+        } else {
+          curdistance <- 1
+        }
+        
+        # Match current tweet with tag. 
+          # Allow 1 Levenshtein distance if tag is >= 5 letters and no hashtag or acronym
+          # Make is case-sensitiv if tag is an acronym
+        
+        tags_found <- NULL
+        # Match the tweet with each variation of tagexpand
+        for(e in 1:length(curtag)) {
+          tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
+        }
+        tags_found <- any(tags_found)
+        curtag <- curtag[1]
+        
+        if(tags_found == TRUE) {
+          #           # Raise number of findings on this day for this issue by 1
+          #           issues[d,curissue] <- issues[d,curissue] + 1
+          #           
+          #           # Add issue and first matched tag of tweet to tweets-DF
+          #           oldissue <- tweets[tweets[, "id_str"] == curid, "issue"]
+          #           tweets[tweets[, "id_str"] == curid, "issue"] <- str_c(oldissue, curissue, ";")
+          #           oldtag <- tweets[tweets[, "id_str"] == curid, "tags"]
+          #           tweets[tweets[, "id_str"] == curid, "tags"] <- str_c(oldtag, curtag, ";")
+          
+          # Add information to file for function viewPatternMatching
+          write(str_c(curdate,";\"",curid,"\";",curissue,";",curtag), curfile, append = TRUE)
+          #           cat(paste("Match!\n"), file="issuecomp-analysis.log", append=TRUE)
+          #           data.frame(date=curdate, issue=curissue)
+          break   # next issue, no more tags from same issue
+        }
+        else {
+          #cat("Nothing found\n")
+        }
+      } # /for curtags
+    } # /for issuelist
+  } # /for tweets_curday
+} # /for drange
+
+#rm(tweets_curday,curacro, curchars, curdate,curfile,curid,curissue,curtag,curtags,curtext,d,date_end,date_start,i,id_folder,oldissue,oldtag,s,t,tags_found)
+stopCluster(cl)
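issuecomp-functions.R itself is not touched by this commit, so checkAcronym and smartPatternMatch are only visible through their call sites above. A minimal stand-in consistent with those call sites might look like the following (an assumption for illustration, not the repository's actual implementation):

require(stringr)

# Assumption: a tag counts as an acronym when it is written in capitals only
checkAcronym <- function(string) {
  str_detect(string, "^[A-Z]+$")
}

# Assumption: fuzzy containment test via base R's approximate matching;
# acronyms match case-sensitively, everything else case-insensitively
smartPatternMatch <- function(text, pattern, distance, acronym) {
  agrepl(pattern, text, max.distance = distance, ignore.case = !acronym)
}

Note also why the loop writes per-issue CSV files instead of updating tweets and issues directly: workers in a %dopar% loop cannot mutate data frames in the master session, so each match is appended to matched-ids/<issue>.csv via write(), and the .Rhistory above shows those files being read back and aggregated afterwards.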

issuecomp-2-analysis.R (+4 −3)

@@ -3,8 +3,8 @@ require(XML)
 require(ggplot2)
 require(reshape2)
 require(stringr)
-library(foreach)
-library(doParallel)
+require(foreach)
+require(doParallel)
 
 source("issuecomp-functions.R")
 setwd("~/Dokumente/Uni/Aktuell/BA-Arbeit/uni-ba-issuecomp")
@@ -94,7 +94,7 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
         }
         
         # Set Levenshtein distance depending on char length, acronym and hashtag status
-        if(curchars <= 4 || curacro || curhash) {
+        if(curchars <= 6 || curacro || curhash) { # Distance = 1 if 7 chars or longer
          curdistance <- 0
         } else {
           curdistance <- 1
@@ -191,6 +191,7 @@ for(r in 1:nrow(results)) {
 # SAVING ------------------------------------------------------------------
 
 save(tweets, file="tweets_tagged.RData")
+write.csv(tweets, file="tweets.csv")
 save(issues, file="issues.RData")
 
 
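The behavioural change in this hunk is the fuzziness threshold: tags of up to 6 characters (previously 4) now require an exact match, and only longer tags tolerate a Levenshtein distance of 1. For example:

nchar("Maut")         # 4 chars: exact match required before and after this commit
nchar("Steuer")       # 6 chars: distance 1 before this commit, exact match after
nchar("Rentenpaket")  # 11 chars: distance 1 in both versions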

issuecomp-3-calc.R (+7 −2)

@@ -72,15 +72,20 @@ stats_entropy <- melt(stats_entropy, id="date")
 g1 <- ggplot(data = stats_entropy, aes(x=date,y=value,colour=variable, group=variable)) + 
   geom_line() + 
   geom_smooth(size=1,formula = y ~ x, method="loess", se=FALSE, color=1)
-g1
+# g1
 
 
 
 # VAR ---------------------------------------------------------------------
 
-test <- VAR(issues[,2:32], p=3, type=c("const", "trend", "both", "none"), season=NULL, exogen = NULL, lag.max = NULL, ic = c("AIC", "HQ", "SC", "FPE"))
+test <- VAR(issues[,2:32], p=1, type=c("const", "trend", "both", "none"), season=NULL, exogen = NULL, lag.max = NULL, ic = c("AIC", "HQ", "SC", "FPE"))
+test <- VAR(issues_i[,2:22], p=1, type="none", exogen = issues_s[,2:3])
+test <- VAR(issues_s[,2:11], p=1, type="none")
+test <- VAR(issues[,2:32], p=1, type="none")
 VAR(issues_s[,2:11], p=1, type=c("const", "trend", "both", "none"), season=NULL, exogen = issues_i[2:22])
 
+plot(irf(test, impulse = names(issues_s[2:11]), response = names(issues_i[2:22])))
+
 capture.output(print(summary(test), prmsd=TRUE, digits=1), file="out.txt")
 
 
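VAR(), irf() and VARselect() come from the vars package, which this hunk assumes is attached elsewhere. One pitfall in the first call: passing the whole vector type=c("const", "trend", "both", "none") does not fit all four variants; VAR() resolves the argument with match.arg() and silently uses the first entry, "const". A self-contained sketch of the call pattern, with toy data standing in for the issues series:

require(vars)

set.seed(1)
toy <- data.frame(a = rnorm(100), b = rnorm(100))     # stand-in for issues[, 2:32]

fit <- VAR(toy, p = 1, type = "none")                 # same shape as the calls above
VARselect(toy, lag.max = 8, type = "both")$selection  # lag order by AIC/HQ/SC/FPE
plot(irf(fit, impulse = "a", response = "b", n.ahead = 5))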

issuecomp-codingsample-correct.csv (+0 −31)

@@ -1,31 +0,0 @@
-"443032724505624576",0,"","","RT @Linksfraktion: Sevim Dagdelen: Optionspflicht abschaffen – ohne „Wenn“ und „Aber“ http://t.co/PXVz60RyDa"
-"464039981766684672",0,"","","Hange spricht von einer Kutsche ohne Dach in der wir fahren und hoffen, dass es nicht regne #btADA"
-"528513118440525824",0,"","","Dora zu Erfolgen der #Hamburger Linken: kostenfreies Mittag + Initiative, dass Maklergebühr nicht von Vermieter*innen.bezahlt werden müssen "
-"511144502590181376",0,"","","RT @2kdei: TY! RİP @SajadJiyad: post about David Haines? Use this photo. He would want us to remember him. URL #ISISmedi… "
-"515791668969504768",0,"","","Anschauen: Beitrag Heute Show zu #TTIP  URL "
-"533717720689549312",0,"","","RT @JUKoMo: "Wie groß muss die Angst der SPD vor Julia Klöckner sein?" Starker @LSaktuellRP-Kommentar zum SPD-Parteitag: URL "
-"472393996879527936",0,"","","RT @johannisbear: Bitte RT! Bitte helft @ulf_der_freak  #Aurela darf nicht sterben!  URL "
-"499494814778662912",0,"","","Das wievielte Mal verspricht #Merkel die Angleichung der #Ostrenten an Westniveau? Es gibt gute Gründe zu misstrauen URL "
-"530750832443400192",0,"","","ist - gezwungenermaßen - mit dem #Fernbus unterwegs #gdlstreik "
-"532310093904089088",0,"","","SPD will Waldschluchtpfad als Dauerstandort - Nachrichten Gatow | SPANDAUER VOLKSBLATT URL "
-"465407100408320000",0,"","","Die neue Modernität in den Kleingarten Kneipen URL "
-"428481075732811776",0,"civil.208;","Datenschutz;",""Datenschutz soll nicht unverhältnismäßig geschwächt werden." Was heißt da unverhältnismäßig? #Regierungserklärung #Merkel #Bundestag "
-"421711548189786114",0,"","","...und inzwischen gute Freunde. URL "
-"532608281009979392",0,"","","RT @initiatived21: “@anipenny: ”Graswurzelbewegg medienpägogisch interessierter Lehrer vernetzt sich, braucht aber auch Unterstützg“ @Esken… "
-"537931613464965121",0,"","","RT @StefanKaufmann: Es läuft Plenardebatte zum Haushalt des Bundesmin. für Bild. und Forschung. Trotz Schwarzer Null steigt Etat deutlich -… "
-"499843193069129728",0,"","","#CETA - der komplette Text wurde heute geleakt: https://t.co/YeWUsSAHoB "
-"477758769703944194",0,"","","Beim Tag der Offenen Tür des THW beeindruckt von der Vielfalt der Einsätze der Organisation.  10 Mill. im Haushalt sind hier gut eingesetzt "
-"539467838520832000",0,"","","RT @spdbt: KoA-Vertrag muss gelten! @ThomasOppermann: „Bei #Maut darf es keine Mehrbelastung für deutsche Autofahrer geben." URL "
-"449534756259381248",0,"","","Höflich, aber klares Statement zu Menschenrechten. Der Bundespräsident macht das gut! #China #XiJinping URL "
-"514314498510168065",0,"","","RT @zeitonline_wir: Ban Ki Moon lädt zum #Klimagipfel. 120 Staats- und Regierungschefs kommen. Nur nicht Angela Merkel. (ae) #climate2014 h… "
-"434413898054524928",0,"","","RT @KonstantinNotz: Zum Appell der SchriftstellerInnen URL und Videobericht zum Empfang der #grünen Bundestagsfraktion h… "
-"509707990929534976",0,"","","RT @weltnetzTV: URL in Kooperation mit theREALnews! URL "
-"542066419274637312",0,"","","RT @tagesthemen: "Ein bescheuerter Satz" - Sigmund Gottlieb kommentiert #YallaCSU. Jetzt in den Tagesthemen. URL "
-"508921157131984896",0,"","","RT @drthomasfeist: #gain2014 @MartinRabanus @KaiGehring @PLengsfeld @KarambaDiaby @the_dfg @DAADnewyork @AvHStiftung @UniLeipzig URL "
-"540745511024992257",0,"","","RT @maybritillner: .@MikeMohring Der Freistaat #Thueringen hat es nicht verdient von einer Regierung geführt zu werden, die sich als Experi… "
-"507492334347763712",0,"","","Bitte RT: Wer kann helfen? Brauchen ganz dringend mobile Duschcontainer für die Flüchtlingszelte in #Nürnberg #followerpower "
-"485509000268902400",0,"","","#WM2014 #Deutschland #GER - #Thalheim #Erzgebirge URL "
-"542710860528234497",0,"","","Illegaler #Kunsthandel blüht! BReg muss nachbessern. Kein Umschlagplatz BRD f. Raubkunst. Dazu @GreenClaudia + ich: https://t.co/tr16CjQl42 "
-"535525766919106560",0,"","","RT @EU_Salon: .@DJanecek: #TTIP Zivilgesellsxhaft hat Funktion zu treiben... ohne kritische Masse hätte es Diskussion so nicht gegeben. #ES3 "
-"472481188641124352",0,"","","Vor dem Grand Serail in Beirut,- im Schatten @nouripour :-) URL "
-"532818411374788608",0,"","","Laut einer Studie treibt unsere Debatte um Mietpreisbremse Mieten in die Höhe - tolle Wurst! "

issuecomp-codingsample-error.csv (+0 −6)

@@ -1,6 +0,0 @@
-"532463690013233152",1,"","","RT @cordhos: ! “@cducsubt: Paul Breitner vom @FCBayern - #Bundestag jetzt mit eigenem Fanclub @hahnflo @DoroBaer @dieAlbsteigerin http://t.…"
-"516584367448403968",1,"","","Debate und critics in the parlamentarian assembly of the European Council about the elections in #Turkey @PACE_News @GeziParkii"
-"516624274522918912",1,"","","Nach Bürgergespräch bin nun noch im Ratshof zur Ausstellungseröffnung - Wanderausstellung zum Bundestag."
-"530357749188923392",2,"","","Streiks müssen Auswirkungen haben - und die #bahn verletzt täglich Verbraucherinteressen: URL #gdlstreik #gdl #db "
-"465846218858708992",2,"","","RT @bioland_de: Wieso sollen Biobauern dafür bestraft werden, dass sie KEINE Pestizide einsetzen? Genau das hat die EU vor:  URL "
-"543111899794407426",2,"","","RT @DanielLuecking: #NSAUA @Peter_Schaar Verhältnismäßigkeit muss hinterfragt werden - Grundrechtsverletzungen durch Überwachung gehören au… "

issuecomp-codingsample-error2.csv (+0 −1)

@@ -1 +0,0 @@
-"530357749188923392","","labor.504","","Streiks müssen Auswirkungen haben - und die #bahn verletzt täglich Verbraucherinteressen: URL #gdlstreik #gdl #db "

issuecomp-codingsample-function.R (+1 −1)

@@ -7,7 +7,7 @@ repeat {
   
   
   repeat {
-    cat("===================\n\n[TWEET]: ",c_samtext,"\n", "[ISSUES]: ", c_samissue, sep="")
+    cat("===================\n\n[TWEET]: ",c_samtext,"\n", "[ISSUES]: ", c_samissue, " (", c_samtags, ")", sep="")
     c_yn <- readYN("Is the categorization correct AND complete?\nEnter y or n: ")
     
     # Check if input is correct

issuecomp-codingsample-function2.R (+46 −9)

@@ -3,14 +3,14 @@ if(c_errcode == "1") {
   repeat {
     c_curissue <- readYN("Which issue is incorrect?: ")
     if(c_curissue == "QUIT") {cat("Quitting this item without changes or entries.\n"); break}
-    c_curissue <- unlist(str_split(c_curissue, ";"))
+    c_curissue <- unlist(str_split(c_curissue, ","))
     
     status <- checkAllIssues(c_curissue, c_issueheads)
     
     # Only continue if every given issue really exists (all "status" have to be TRUE)
     if(all(status)) {
       # Revert str_split
-      c_curissue <- str_join(c_curissue,collapse = ";")
+      c_curissue <- str_join(c_curissue,collapse = ",")
       
       # <ID>,<all issues>,<faulty issue(s),<all tags>,<tweet text>
       c_result <- str_c("\"",c_errid,"\",\"",c_errissue,"\",\"",c_curissue,"\",\"",c_errtags,"\",\"",c_errtext,"\"")
@@ -24,14 +24,14 @@ if(c_errcode == "1") {
   repeat {
     c_curissue <- readYN("Which issue is missing?: ")
     if(c_curissue == "QUIT") {cat("Quitting this item without changes or entries.\n"); break}
-    c_curissue <- unlist(str_split(c_curissue, ";"))
+    c_curissue <- unlist(str_split(c_curissue, ","))
     
     status <- checkAllIssues(c_curissue, c_issueheads)
     
     # Only continue if every given issue really exists (all "status" have to be TRUE)
     if(all(status)) {
       # Revert str_split
-      c_curissue <- str_join(c_curissue,collapse = ";")
+      c_curissue <- str_join(c_curissue,collapse = ",")
       
       # <ID>,<all issues>,<faulty issue(s),<all tags>,<tweet text>
       c_result <- str_c("\"",c_errid,"\",\"",c_errissue,"\",\"",c_curissue,"\",\"",c_errtags,"\",\"",c_errtext,"\"")
@@ -42,14 +42,51 @@ if(c_errcode == "1") {
 
 # There is an issue missing AND a issue was wrong
 } else if(c_errcode == "3") {
-  #cat("Which issue is incorrect and which one is missing?\n")
+#   #cat("Which issue is incorrect and which one is missing?\n")
+#   repeat {
+#     c_tag <- readYN("Which issue is incorrect?: ")
+#     c_tag <- unlist(str_split(c_tag, ","))
+#     for(i in 1:length(c_tag)) {
+#       if(checkIssue(c_tag[i], c_issueheads)) {} else {cat("Issue",c_tag[i],"does not exist. Please try again.\n")}
+#     }
+  
   repeat {
-    c_tag <- readYN("Which issue is incorrect?: ")
-    c_tag <- unlist(str_split(c_tag, ";"))
-    for(i in 1:length(c_tag)) {
-      if(checkIssue(c_tag[i], c_issueheads)) {} else {cat("Issue",c_tag[i],"does not exist. Please try again.\n")}
+    c_curissue <- readYN("Which issue is incorrect?: ")
+    if(c_curissue == "QUIT") {cat("Quitting this item without changes or entries.\n"); break}
+    c_curissue <- unlist(str_split(c_curissue, ","))
+    
+    status <- checkAllIssues(c_curissue, c_issueheads)
+    
+    # Only continue if every given issue really exists (all "status" have to be TRUE)
+    if(all(status)) {
+      # Revert str_split
+      c_curissue <- str_join(c_curissue,collapse = ",")
+      
+      # <ID>,<all issues>,<faulty issue(s),<all tags>,<tweet text>
+      c_result <- str_c("\"",c_errid,"\",\"",c_errissue,"\",\"",c_curissue,"\",\"",c_errtags,"\",\"",c_errtext,"\"")
+      write(c_result, file = "issuecomp-codingsample-error1.csv", append = T)
+      break
     }
   }
+    
+    repeat {
+      c_curissue <- readYN("Which issue is missing?: ")
+      if(c_curissue == "QUIT") {cat("Quitting this item without changes or entries.\n"); break}
+      c_curissue <- unlist(str_split(c_curissue, ","))
+      
+      status <- checkAllIssues(c_curissue, c_issueheads)
+      
+      # Only continue if every given issue really exists (all "status" have to be TRUE)
+      if(all(status)) {
+        # Revert str_split
+        c_curissue <- str_join(c_curissue,collapse = ",")
+        
+        # <ID>,<all issues>,<faulty issue(s),<all tags>,<tweet text>
+        c_result <- str_c("\"",c_errid,"\",\"",c_errissue,"\",\"",c_curissue,"\",\"",c_errtags,"\",\"",c_errtext,"\"")
+        write(c_result, file = "issuecomp-codingsample-error2.csv", append = T)
+        break
+      }
+    }
   
 
 # If this triggers the hell freezes...
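After this change all three error-code branches share one pattern: split the comma-separated input, validate every issue name against c_issueheads, rejoin (str_join is the old stringr spelling of str_c(..., collapse = )), and append a quoted CSV row. The repetition could be collapsed into a single helper along these lines (recordIssue is hypothetical, not part of this commit):

recordIssue <- function(question, outfile) {
  repeat {
    c_curissue <- readYN(question)
    if(c_curissue == "QUIT") { cat("Quitting this item without changes or entries.\n"); break }
    c_curissue <- unlist(str_split(c_curissue, ","))
    status <- checkAllIssues(c_curissue, c_issueheads)
    if(all(status)) {   # only write when every entered issue exists
      c_curissue <- str_join(c_curissue, collapse = ",")
      # <ID>,<all issues>,<faulty/missing issue(s)>,<all tags>,<tweet text>
      c_result <- str_c("\"", c_errid, "\",\"", c_errissue, "\",\"", c_curissue,
                        "\",\"", c_errtags, "\",\"", c_errtext, "\"")
      write(c_result, file = outfile, append = TRUE)
      break
    }
  }
}

# Error code 3 would then become:
recordIssue("Which issue is incorrect?: ", "issuecomp-codingsample-error1.csv")
recordIssue("Which issue is missing?: ", "issuecomp-codingsample-error2.csv")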

issuecomp-codingsample.R (+22 −1)

@@ -34,6 +34,13 @@ checkAllIssues <- function(string, issuelist) {
 
 # Read CSV of all tweets (with tags, if available)
 c_tweets <- read.csv("tweets.csv", colClasses="character")
+# Replace quotes because it may cause problems when saving and reading as CSV files
+for(r in 1:nrow(c_tweets)) {
+  curtext <- as.character(c_tweets$text[r])
+  if(str_detect(curtext, "\"")) {
+    c_tweets$text[r] <- str_replace(curtext, "\"", "")
+  }
+}
 c_tweets$X <- NULL
 
 # Read all issues from XML file
@@ -66,10 +73,24 @@ for(r in 1:nrow(c_errors)) {
   c_errtext <- as.character(c_errors$text[r])
   c_errid <- as.character(c_errors$str_id[r])
   
-  cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
+  cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
   source("issuecomp-codingsample-function2.R")
 }
 
 
+# Now import the error files in a human readable data frame to improve the issue database
+
+# All tweets with WRONG ISSUES
+c_tmp <- read.csv("issuecomp-codingsample-error1.csv", header = F, colClasses="character")
+names(c_tmp) <- c("str_id", "all", "wrong", "tagged", "text")
+c_error1 <- c_tmp[, c("wrong", "tagged", "all", "text")]
 
+# All tweets with MISSING ISSUES
+c_tmp <- read.csv("issuecomp-codingsample-error2.csv", header = F, colClasses="character")
+names(c_tmp) <- c("str_id", "all", "missing", "tagged", "text")
+c_error2 <- c_tmp[, c("missing", "text", "tagged", "all")]
 
+# All CORRECT tweets
+c_tmp <- read.csv("issuecomp-codingsample-correct.csv", header = F, colClasses="character")
+names(c_tmp) <- c("str_id", "status", "issue", "tags", "text")
+c_correct <- c_tmp
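One caveat about the new quote-stripping loop: str_replace removes only the first double quote in each tweet, so a text containing a quoted phrase keeps its closing quote. If the intent is to drop every quote, stringr's vectorised str_replace_all does it without a loop:

c_tweets$text <- str_replace_all(c_tweets$text, "\"", "")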

issues-expand.xml (+0 −202)

@@ -1,203 +1 @@
-<i7.env>
-  <tag>Energiewende</tag>
-  <tag>Klimaschutz</tag>
-  <tag>COP20</tag>
-  <tag>cop20</tag>
-</i7.env>
-
-<i3.health>
-  <tag>Gesundheitsbranche</tag>
-</i3.health>
-
-<i10.trans>
-  <tag>LKWs</tag> 
-  <tag>PKWs</tag>
-</i10.trans>
-
-<i1.macro>
-  <tag>Arbeitslose</tag>
-</i1.macro>
-
-<i12.law>
-  <tag>Vorratsdatenspeicherung</tag>
-  <tag>VDS</tag>
-  <tag>Cybercrime</tag>
-  <tag>Vorratsdatenspeicherung</tag>
-  <tag>VDS</tag>
-</i12.law>
-
-
-<!-- Sensational issues -->
-
-<!-- Political/conflicts -->
-<s.nsa>
-  <tag>NSA</tag>
-  <tag>Snowden</tag>
-  <tag>GCHQ</tag>
-</s.nsa>
-
-<s.is>
-  <tag>ISIS</tag>
-  <tag>IS</tag>
-  <tag>al-Baghdadi</tag>
-  <tag>Kurde</tag>
-  <tag>Jeside</tag>
-  <tag>#Mosul</tag>
-  <tag>#Mossul</tag>
-  <tag>#Fallujah</tag>
-  <tag>#Falludscha</tag>
-  <tag>#Kobanê</tag>
-  <tag>#Kobane</tag>
-  <tag>Syrien</tag>
-  <tag>Irak</tag>
-  <tag>#Aleppo</tag>
-</s.is>
-
-<s.ebola>
-  <tag>Ebola</tag>
-</s.ebola>
-
-<s.edathy>
-  <tag>Edathy</tag>
-  <tag>Edathy-Affäre</tag>
-</s.edathy>
-
-<s.ukraine>
-  <tag>Ukraine</tag>
-  <tag>Krim</tag>
-  <tag>Prorussisch</tag>
-  <tag>Donetsk</tag>
-  <tag>Donezk</tag>
-  <tag>Euromaidan</tag>
-</s.ukraine>
-
-<s.hk>
-  <tag>Hong Kong</tag>
-  <tag>Hong-Kong</tag>
-  <tag>Studentenprotest</tag>
-  <tag>Protest der Studenten</tag>
-  <tag>Hongkong</tag>
-</s.hk>
-
-<s.mh17>
-  <tag>#MH17</tag>
-  <tag>#KL4103</tag>
-</s.mh17>
-
-<s.mh370>
-  <tag>#MH370</tag>
-  <tag>#CZ748</tag>
-</s.mh370>
-
-<s.gaza>
-  <tag>Gaza</tag>
-  <tag>Hamas</tag>
-</s.gaza>
-
-<s.ferguson>
-  <tag>Ferguson</tag>
-  <tag>Michael Brown</tag>
-</s.ferguson>
-
-<s.boko>
-  <tag>Boko Haram</tag>
-</s.boko>
-
-<s.pegida>
-  <tag>Pegida</tag>
-  <tag>#nopegida</tag>
-</s.pegida>
-
-<!-- Yellow pages -->
-<s.schumi>
-  <tag>Schumacher</tag>
-  <tag>Schumi</tag>
-</s.schumi>
-
-<s.esc>
-  <tag>ESC</tag>
-  <tag>Conchita Wurst</tag>
-  <tag>#ConchitaWurst</tag>
-  <tag>Eurovision Song Contest</tag>
-</s.esc>
-
-<s.wulff>
-  <tag>Wulff</tag>
-</s.wulff>
-
-<s.tebartz>
-  <tag>Tebartz-van Elst</tag>
-  <tag>Tebartz</tag>
-  <tag>Limburg</tag>
-</s.tebartz>
-
-<s.gurlitt>
-  <tag>Gurlitt</tag>
-</s.gurlitt>
-
-<s.hoen>
-  <tag>Hoeneß</tag>
-  <tag>Hoeness</tag>
-</s.hoen>
-
-<s.pistorius>
-  <tag>Pistorius</tag>
-  <tag>#OscarPistorius</tag>
-</s.pistorius>
-
-<!-- Science -->
-<s.philae>
-  <tag>Philae</tag>
-  <tag>#Tschuri</tag>
-  <tag>#Rosetta</tag>
-  <tag>#CometLanding</tag>
-</s.philae>
-
-<!-- Sports -->
-<s.wm>
-  <tag>Fußball</tag>
-  <tag>Fussball</tag>
-  <tag>Stadion</tag>
-  <tag>Weltmeisterschaft</tag>
-  <tag>#WM</tag>
-  <tag>#BRAGER</tag>
-  <tag>#GERBRA</tag>
-  <tag>Fußballmeisterschaft</tag>
-  <tag>Fussballmeisterschaft</tag>
-  <tag>Fußballweltmeisterschaft</tag>
-  <tag>Fußssallweltmeisterschaft</tag>
-  <tag>Nationalmannschaft</tag>
-  <tag>Weltmeister</tag>
-  <tag>Brasilien</tag>
-  <tag>#WorldCup</tag>
-  <tag>#WM2014</tag>
-</s.wm>
-
-<s.sotschi>
-  <tag>Sotschi</tag>
-  <tag>#sochi2014</tag>
-  <tag>#sotschi2014</tag>
-  <tag>#Sochi</tag>
-  <tag>#WirfuerD</tag>
-</s.sotschi>
-
-<!-- Tests -->
-<s.de>
-  <tag>Deutschland</tag>
-  <tag>Deutsche</tag>
-</s.de>
-
-
-
-
-
-
-
-
-
-
-
-
-
 
issues.xml → issues-v1.xml (renamed)


issues-v2.xml (+1986 −0)
File diff suppressed because it is too large


matched-ids.tar → matched-ids-v1.tar (renamed)


tweets_untagged.RData (binary file)

