
some small changes

mxmehl committed 5 years ago (commit cbbb664e04)
6 changed files with 2164 additions and 153 deletions

  1. .Rhistory  (+148, -148)
  2. issuecomp-2-analysis.R  (+5, -3)
  3. issuecomp-3-calc.R  (+17, -1)
  4. issuecomp-codingsample.R  (+1, -1)
  5. issues-expand.xml  (+10, -0)
  6. issues-v3.xml  (+1983, -0)

.Rhistory  (+148, -148)

@@ -1,151 +1,3 @@
-curdistance <- 1
-}
-# Match current tweet with tag. If >= 5 letters allow 1 changed letter, if >=8 letters allow also 1 (Levenshtein distance)
-tags_found <- NULL
-# Match the tweet with each variation of tagexpand
-for(e in 1:length(curtag)) {
-tags_found[e] <- smartPatternMatch(curtext, curtag[e], curdistance, curacro)
-}
-tags_found <- any(tags_found)
-tags_found
-curtag
-curtext
-curdistance
-test <- VAR(issues[,2:32], p=3, type=c("const", "trend", "both", "none"), season=NULL, exogen = NULL, lag.max = NULL, ic = c("AIC", "HQ", "SC", "FPE"))
-test
-test <- VAR(issues[,2:32], p=1, type="none")
-capture.output(print(summary(test), prmsd=TRUE, digits=1), file="out.txt")
-View(issues)
-test <- VAR(issues_i[,2:22], p=1, type="none", exogen = issues_s[,2])
-test <- VAR(issues_i[,2:22], p=1, type="none", exogen = issues_s[,2:3])
-capture.output(print(summary(test), prmsd=TRUE, digits=1), file="out.txt")
-irf(test)
-test <- VAR(issues_s[,2:11], p=1, type="none")
-irf(test)
-plot(irf(test))
-test <- VAR(issues[,2:32], p=1, type="none")
-plot(irf(test, impulse = names(issues_s[2:11]), response = names(issues_i[2:22])))
-plot(irf(test, impulse = names(issues_s[2:11]), response = names(issues_i[2:22]), n.ahead = 5))
-require(stringr)
-require(XML)
-readYN <- function(question) {
-n <- readline(prompt=question)
-n <- as.character(n)
-return(n)
-}
-checkIssue <- function(string, issuelist) {
-status <- any(str_detect(string, issuelist))
-return(status)
-}
-checkAllIssues <- function(string, issuelist) {
-status <- NULL
-for(i in 1:length(string)) {
-if(checkIssue(string[i], issuelist)) {
-status[i] <- TRUE
-}
-else {
-cat("Issue",string[i],"does not exist. Please try again.\n")
-status[i] <- FALSE
-}
-}
-return(status)
-}
-View(tweets)
-write.csv(tweets, file="tweets.csv")
-c_tweets <- read.csv("tweets.csv", colClasses="character")
-View(c_tweets)
-c_tweets$X <- NULL
-c_issues <- data.frame(date = drange)
-c_issuelist <- xmlToList("issues.xml")
-c_issueheads <- names(issuelist)
-c_issues[issueheads] <- 0
-source("issuecomp-codingsample-function.R")
-rm(c_err, c_result, c_samid, c_samno,c_samtags,c_samissue,c_samtext,c_yn)
-c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
-View(c_errors)
-names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
-View(c_errors)
-for(r in 1:nrow(c_errors)) {
-c_errcode <- as.character(c_errors$code[r])
-c_errissue <- as.character(c_errors$issue[r])
-c_errtags <- as.character(c_errors$tags[r])
-c_errtext <- as.character(c_errors$text[r])
-c_errid <- as.character(c_errors$str_id[r])
-cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
-source("issuecomp-codingsample-function2.R")
-}
-c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
-names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
-for(r in 1:nrow(c_errors)) {
-c_errcode <- as.character(c_errors$code[r])
-c_errissue <- as.character(c_errors$issue[r])
-c_errtags <- as.character(c_errors$tags[r])
-c_errtext <- as.character(c_errors$text[r])
-c_errid <- as.character(c_errors$str_id[r])
-cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errtags, "\n", sep="")
-source("issuecomp-codingsample-function2.R")
-}
-for(r in 1:nrow(c_errors)) {
-c_errcode <- as.character(c_errors$code[r])
-c_errissue <- as.character(c_errors$issue[r])
-c_errtags <- as.character(c_errors$tags[r])
-c_errtext <- as.character(c_errors$text[r])
-c_errid <- as.character(c_errors$str_id[r])
-cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
-source("issuecomp-codingsample-function2.R")
-}
-for(r in 1:nrow(c_errors)) {
-c_errcode <- as.character(c_errors$code[r])
-c_errissue <- as.character(c_errors$issue[r])
-c_errtags <- as.character(c_errors$tags[r])
-c_errtext <- as.character(c_errors$text[r])
-c_errid <- as.character(c_errors$str_id[r])
-cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
-source("issuecomp-codingsample-function2.R")
-}
-tagexpand
-source("issuecomp-codingsample-function.R")
-source("issuecomp-codingsample-function.R")
-source("issuecomp-codingsample-function.R")
-c_errors <- read.csv("issuecomp-codingsample-error.csv", header = F, sep=",", colClasses="character")
-names(c_errors) <- c("str_id", "code", "issue", "tags", "text")
-for(r in 1:nrow(c_errors)) {
-c_errcode <- as.character(c_errors$code[r])
-c_errissue <- as.character(c_errors$issue[r])
-c_errtags <- as.character(c_errors$tags[r])
-c_errtext <- as.character(c_errors$text[r])
-c_errid <- as.character(c_errors$str_id[r])
-cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
-source("issuecomp-codingsample-function2.R")
-}
-for(r in 1:nrow(c_errors)) {
-c_errcode <- as.character(c_errors$code[r])
-c_errissue <- as.character(c_errors$issue[r])
-c_errtags <- as.character(c_errors$tags[r])
-c_errtext <- as.character(c_errors$text[r])
-c_errid <- as.character(c_errors$str_id[r])
-cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
-source("issuecomp-codingsample-function2.R")
-}
-for(r in 1:nrow(c_errors)) {
-c_errcode <- as.character(c_errors$code[r])
-c_errissue <- as.character(c_errors$issue[r])
-c_errtags <- as.character(c_errors$tags[r])
-c_errtext <- as.character(c_errors$text[r])
-c_errid <- as.character(c_errors$str_id[r])
-cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
-source("issuecomp-codingsample-function2.R")
-}
-for(r in 1:nrow(c_errors)) {
-c_errcode <- as.character(c_errors$code[r])
-c_errissue <- as.character(c_errors$issue[r])
-c_errtags <- as.character(c_errors$tags[r])
-c_errtext <- as.character(c_errors$text[r])
-c_errid <- as.character(c_errors$str_id[r])
-cat("===============\n\n[TWEET]: ",c_errtext,"\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep="")
-source("issuecomp-codingsample-function2.R")
-}
-for(r in 1:nrow(c_errors)) {
 c_errcode <- as.character(c_errors$code[r])
 c_errissue <- as.character(c_errors$issue[r])
 c_errtags <- as.character(c_errors$tags[r])
@@ -510,3 +362,151 @@ View(issues)
 test <- VAR(issues[,2:44], p=1, type="none")
 VAR(issues_s[,2:23], p=1, type=c("const", "trend", "both", "none"), season=NULL, exogen = issues_i[2:22])
 plot(irf(test, impulse = names(issues_s[2:23]), response = names(issues_i[2:22])))
+rm(c_correct, c_curissue, c_errcode, c_errid, c_errissue, c_error1, c_error2, c_errors)
+rm(c_issues, c_issuelist, c_issueheads)
+rm(c_errtags, c_errtext, c_result, c_tag, c_tmp, c_tweets)
+require(stringr)
+require(XML)
+readYN <- function(question) {
+n <- readline(prompt=question)
+n <- as.character(n)
+return(n)
+}
+checkIssue <- function(string, issuelist) {
+status <- any(str_detect(string, issuelist))
+return(status)
+}
+checkAllIssues <- function(string, issuelist) {
+status <- NULL
+for(i in 1:length(string)) {
+if(checkIssue(string[i], issuelist)) {
+status[i] <- TRUE
+}
+else {
+cat("Issue",string[i],"does not exist. Please try again.\n")
+status[i] <- FALSE
+}
+}
+return(status)
+}
+View(tweets)
+c_tweets <- read.csv("tweets.csv", colClasses="character")
+for(r in 1:nrow(c_tweets)) {
+curtext <- as.character(c_tweets$text[r])
+if(str_detect(curtext, "\"")) {
+c_tweets$text[r] <- str_replace(curtext, "\"", "")
+}
+}
+c_tweets$X <- NULL
+c_issues <- data.frame(date = drange)
+c_issuelist <- xmlToList("issues-v2.xml")
+c_issueheads <- names(issuelist)
+c_issues[issueheads] <- 0
+source("issuecomp-codingsample-function.R")
+require(stringr)
+curtext
+curtext <- str_replace_all(curtext, "#", "")
+curtext <- str_replace_all(curtext, "-", " ")
+curtext
+curtext
+str_replace_all(curtext, "[^[:alnum:]]", "")
+str_replace_all(curtext, "[^[:alnum:]\s]", "")
+str_replace_all(curtext, "[^[:alnum:]\\s]", "")
+str_replace_all(curtext, "[^[:alnum:]^\\s]", "")
+str_replace_all(curtext, "[^[:alnum:]^\\S]", "")
+str_replace_all(curtext, "[^[:alnum:]][^\\s]", "")
+str_replace_all(curtext, "[^[:alnum:]][^\\S]", "")
+str_replace_all(curtext, "[^[:alnum:]][^[:blank]]", "")
+str_replace_all(curtext, "[^[:alnum:]][^[:blank:]]", "")
+str_replace_all(curtext, "[^[:alnum:]]", "")
+str_replace_all(curtext, "\\W", "")
+str_replace_all(curtext, "[\\W|\\S]", "")
+str_replace_all(curtext, "(\\W|\\S)", "")
+str_replace_all(curtext, "\\W|\\S", "")
+str_replace_all(curtext, "\\W", "")
+str_replace_all(curtext, "[\\W\\S]", "")
+str_replace_all(curtext, "[\\S\\W]", "")
+str_replace_all(curtext, "[\\s\\W]", "")
+str_replace_all(curtext, "[\\W\\s]", "")
+str_replace_all(curtext, "[\\W\s]", "")
+str_replace_all(curtext, "[\\Ws]", "")
+str_replace_all(curtext, "[\\W]", "")
+str_replace_all(curtext, "\\W", "")
+str_replace_all(curtext, "\\W|\\S", "")
+str_replace_all(curtext, "\\W|\\s", "")
+str_replace_all(curtext, "[^[:alnum:]]", "")
+str_replace_all(curtext, "[^[:alnum:] ]", "")
+str_replace_all(curtext, "[^[:alnum:]\\s]", "")
+str_replace_all(curtext, "[^[:alnum:] ]", "")
+curtext
+curtext <- "liebe @cdu, wir finden #Steuer gut, aber die KFZ-Steuer nicht!"
+curtext <- str_replace_all(curtext, "-", " ")
+curtext <- str_replace_all(curtext, "[^[:alnum:] ]", "")
+curtext
+curtext <- "liebe @cdu, wir finden #Steuer gut, aber die KFZ--Steuer nicht!"
+curtext <- str_replace_all(curtext, "-", " ")
+curtext <- str_replace_all(curtext, "[^[:alnum:] ]", "")
+curtext
+str_replace_all(curtext, "  ", " ")
+smartPatternMatch
+require(vars)
+require(stringr)
+adf1 <- summary(ur.df(issues))
+issues
+summary(issues)
+summary(issues[2:44])
+summary(issues[2:44], digits = 2)
+adf1 <- summary(ur.df(issues[, 2:44]), type ="trend", lags=1)
+data("Canda")
+data("Canada")
+class(Canada)
+class(issues)
+view(Canada)
+View(Canada)
+as.ts(issues)
+issues_ts <- as.ts(issues)
+class(issues_ts)
+View(issues_ts)
+View(issues)
+adf1 <- summary(ur.df(issues_ts[, 2:44]), type ="trend", lags=1)
+adf1 <- summary(ur.df(issues_ts[, 2]), type ="trend", lags=1)
+adf1 <- summary(ur.df(issues_ts[, 2], type ="trend", lags=1))
+adf1 <- summary(ur.df(issues_ts[, 2:44], type ="trend", lags=1))
+adf1 <- summary(ur.df(issues_ts[, 2], type ="trend", lags=1))
+adf1
+adf1 <- summary(ur.df(issues_ts[, 3], type ="trend", lags=1))
+adf1
+adf1 <- summary(ur.df(issues_ts[, 2], type ="none", lags=1))
+adf1
+adf1 <- summary(ur.df(issues_ts[, 2], type ="trend", lags=1))
+adf1
+summary(ur.df(issues_ts[, 2], type ="none", lags=1))
+VARselect(issues_ts[2:44], lag.max = 8, type = "both")
+VARselect(issues_ts[1:44], lag.max = 8, type = "both")
+VARselect(issues[1:44], lag.max = 8, type = "both")
+VARselect(issues[2:44], lag.max = 8, type = "both")
+VARselect(issues_ts[2:44], lag.max = 8, type = "both")
+VARselect(issues[2:44], lag.max = 8, type = "none")
+VARselect(issues[2:44], lag.max = 8, type = "trend")
+VARselect(issues[2:44], lag.max = 8, type = "const")
+VARselect(issues[2:44], lag.max = 8, type = "both")
+test <- VAR(issues[,2:44], p=1, type="both")
+# VAR(issues_s[,2:23], p=1, type=c("const", "trend", "both", "none"), season=NULL, exogen = issues_i[2:22])
+plot(irf(test, impulse = names(issues_s[2:23]), response = names(issues_i[2:22])))
+summary(ur.df(issues_ts[, 2], type ="both", lags=1))
+summary(ur.df(issues_ts[, 2], type ="none", lags=1))
+test <- VAR(issues_ts[,2:44], p=1, type="both")
+plot(irf(test, impulse = names(issues_s[2:23]), response = names(issues_i[2:22])))
+acc_df <- read.csv("MdB-twitter.csv")
+delrow <- NULL
+for(r in 1:nrow(acc_df)) {
+acc <- as.character(acc_df$twitter_acc[r])
+if(!nzchar(acc)) {
+delrow <- c(delrow, r)
+}
+}
+acc_df <- acc_df[-delrow, ]
+rm(delrow, r, acc)
+acc_df$row.names <- NULL
+row.names(acc_df) <- NULL
+View(acc_df)
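Read as a whole, the new history lines converge on one analysis workflow: turn the issue counts into a time series, check individual series for unit roots, pick a lag order, estimate a VAR and plot impulse responses. Below is a minimal sketch of that workflow, assuming `issues` is a data frame whose first column is the date and whose remaining columns hold the daily counts (the 2:44 indexing above suggests this layout); the name `fit` and the use of all non-date columns are illustrative, not taken from the repository. Note that ur.df() only accepts type = "none", "drift" or "trend", so the history line calling it with type = "both" likely errored.

library(urca)   # ur.df(): augmented Dickey-Fuller unit-root test
library(vars)   # VARselect(), VAR(), irf()

issues_ts <- as.ts(issues[, -1])                            # drop the date column, keep the count series

summary(ur.df(issues_ts[, 1], type = "trend", lags = 1))    # unit-root check for one series
VARselect(issues_ts, lag.max = 8, type = "both")            # information criteria for the lag order p

fit <- VAR(issues_ts, p = 1, type = "both")                 # estimate the VAR
plot(irf(fit, n.ahead = 5))                                 # impulse-response functions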

issuecomp-2-analysis.R  (+5, -3)

@@ -56,9 +56,11 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
 
   for(t in 1:nrow(tweets_curday)){
     #     cat(paste("Starting tweet", t, "of",as.character(curdate),"\n"), file="issuecomp-analysis.log", append=TRUE)
-    # Select tweet's text, make it lowercase and remove hashtag indicators (#)
+    # Select tweet's text, make it lowercase and remove hashtags, mentions and replace hyphens by spaces
     curtext <- as.character(tweets_curday$text[t])
-    curtext <- str_replace_all(curtext, "#", "")
+    curtext <- str_replace_all(curtext, "-", " ")
+    curtext <- str_replace_all(curtext, "[^[:alnum:] ]", "")
+    curtext <- str_replace_all(curtext, "  ", " ")  # remove double spaces
 
     curid <- as.character(tweets_curday$id_str[t])
 
@@ -95,7 +97,7 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
         }
 
         # Set Levenshtein distance depending on char length, acronym and hashtag status
-        if(curchars <= 6 || curacro || curhash) { # Distance = 1 if 7 chars or longer
+        if(curchars <= 7 || curacro || curhash) { # Distance = 1 if 8 chars or longer
          curdistance <- 0
        } else {
          curdistance <- 1
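Taken in isolation, the new cleaning steps and the relaxed length threshold behave roughly as follows. This is a minimal sketch: the sample tweet comes from the .Rhistory above, while `curtag`, `curacro` and `curhash` are placeholder values standing in for what the surrounding loop computes, and smartPatternMatch itself is not reproduced here.

library(stringr)

curtext <- "liebe @cdu, wir finden #Steuer gut, aber die KFZ-Steuer nicht!"
curtext <- str_replace_all(curtext, "-", " ")              # split hyphenated words ("KFZ-Steuer" -> "KFZ Steuer")
curtext <- str_replace_all(curtext, "[^[:alnum:] ]", "")   # drop #, @ and punctuation
curtext <- str_replace_all(curtext, "  ", " ")             # collapse double spaces
curtext
# "liebe cdu wir finden Steuer gut aber die KFZ Steuer nicht"

curtag  <- "Steuer"                                        # placeholder tag
curacro <- FALSE; curhash <- FALSE                         # placeholder flags
curchars <- nchar(curtag)
curdistance <- if (curchars <= 7 || curacro || curhash) 0 else 1   # fuzzy matching only for tags of 8+ characters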

issuecomp-3-calc.R  (+17, -1)

@@ -81,13 +81,19 @@ g1
 # test <- VAR(issues[,2:32], p=1, type=c("const", "trend", "both", "none"), season=NULL, exogen = NULL, lag.max = NULL, ic = c("AIC", "HQ", "SC", "FPE"))
 # test <- VAR(issues_i[,2:22], p=1, type="none", exogen = issues_s[,2:3])
 # test <- VAR(issues_s[,2:11], p=1, type="none")
-test <- VAR(issues[,2:44], p=1, type="none")
 # VAR(issues_s[,2:23], p=1, type=c("const", "trend", "both", "none"), season=NULL, exogen = issues_i[2:22])
 
+issues_ts <- as.ts(issues)
+vIssues <- VAR(issues_ts[,2:44], p=1, type="both")
+
 plot(irf(test, impulse = names(issues_s[2:23]), response = names(issues_i[2:22])))
 
 capture.output(print(summary(test), prmsd=TRUE, digits=1), file="out.txt")
 
+# Tests
+issues_ts <- as.ts(issues)
+VARselect(issues[2:44], lag.max = 8, type = "both")
+summary(ur.df(issues_ts[, 2], type ="none", lags=1))
 
 # SOME TESTS --------------------------------------------------------------
 
@@ -123,6 +129,16 @@ pie(acc_parties$twitter, col=c("black", "red", "purple", "green"), labels = c("C
 rm(acc_parties, p)
 
 
+# Count all tags
+num <- 0
+for(i in 1:length(issuelist)) {
+  j <- length(issuelist[[i]])
+  num <- num + j
+  rm(j)
+}
+num
+
+
 # VISUALS -----------------------------------------------------------------
 
 
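The tag-counting loop added above can also be written as a single vectorised call. The one-liner below is equivalent under the assumption the loop itself makes, namely that issuelist is a plain list with one element per issue category, each holding that category's tags:

num <- sum(lengths(issuelist))   # total number of tags across all issue categories
num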

issuecomp-codingsample.R  (+1, -1)

@@ -45,7 +45,7 @@ c_tweets$X <- NULL
 
 # Read all issues from XML file
 c_issues <- data.frame(date = drange)
-c_issuelist <- xmlToList("issues.xml")
+c_issuelist <- xmlToList("issues-v2.xml")
 c_issueheads <- names(issuelist)
 c_issues[issueheads] <- 0
 

issues-expand.xml  (+10, -0)

@@ -1 +1,11 @@
+<s.ukraine>
+  <tag>#Janukowitsch</tag>
+</s.ukraine>
 
+<i2.civil>
+  <tag>Foltermethode</tag>
+</i2.civil>
+
+<i19.ib>
+  --<tag>Afghanistan</tag>
+</i19.ib>
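For reference, a category block like the ones added here becomes a small named list once read with the XML package, which is how issuecomp-codingsample.R consumes the full catalogue via xmlToList("issues-v2.xml"). A minimal sketch, parsing the first block above as a standalone fragment (issues-expand.xml itself appears to be a collection of fragments rather than a single well-formed document):

library(XML)

frag <- xmlParse("<s.ukraine><tag>#Janukowitsch</tag></s.ukraine>")
xmlToList(frag)   # a list with one element named "tag" containing "#Janukowitsch"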

issues-v3.xml  (+1983, -0)
File diff suppressed because it is too large

