library(stringr)
library(XML)


# FUNCTIONS ---------------------------------------------------------------

# Prompt the user with `question` on the console and return the typed answer
# as a character string (readline() returns "" in non-interactive sessions).
readYN <- function(question) {
  answer <- readline(prompt = question)
  as.character(answer)
}

# Return TRUE if `string` is matched by at least one entry of `issuelist`.
# NOTE(review): entries of `issuelist` are used as regular-expression
# patterns, so a partial match counts as a hit -- confirm whether exact
# matching (`%in%`) was intended.
checkIssue <- function(string, issuelist) {
  any(str_detect(string, issuelist))
}

# Check every element of `string` against `issuelist` with checkIssue().
# Returns a logical vector of the same length as `string`; for every element
# that is not found a hint is printed to the console.
checkAllIssues <- function(string, issuelist) {
  status <- logical(length(string))
  for (i in seq_along(string)) {
    # isTRUE() maps an NA result (e.g. NA input) to "does not exist"
    # instead of failing in the if() below.
    status[i] <- isTRUE(checkIssue(string[i], issuelist))
    if (!status[i]) {
      cat("Issue", string[i], "does not exist. Please try again.\n")
    }
  }
  status
}


# SAMPLE OUT/INPUT --------------------------------------------------------

# Read CSV of all tweets (with tags, if available)
c_tweets <- read.csv("tweets.csv", colClasses = "character")

# Remove ALL double quotes from the tweet texts because they may cause
# problems when saving and reading CSV files. Vectorised, so NA texts and
# an empty data frame are handled without a loop.
c_tweets$text <- str_replace_all(c_tweets$text, fixed("\""), "")
c_tweets$X <- NULL

# Read all issues from XML file and add one zero-initialised column per issue.
# NOTE(review): `drange` is not defined in this file; it is expected to
# already exist in the calling environment -- confirm.
c_issues <- data.frame(date = drange)
c_issuelist <- xmlToList("issues-v2.xml")
c_issueheads <- names(c_issuelist)
c_issues[c_issueheads] <- 0


# Run through as many tweets as wished to mark them as correct or incorrect
source("issuecomp-codingsample-function.R")
# Remove the temporary objects listed here (left behind by the step above)
rm(c_err, c_result, c_samid, c_samno, c_samtags, c_samissue, c_samtext, c_yn)


# Now go through tweets/tags marked as false
# Exit codes:
# 0 = Correct tagging
# 1 = At least one tag was incorrect
# 2 = At least one tag was missing
# 3 = Both 1 and 2
c_errors <- read.csv("issuecomp-codingsample-error.csv", header = FALSE,
                     sep = ",", colClasses = "character")
names(c_errors) <- c("str_id", "code", "issue", "tags", "text")

# seq_len() skips the loop entirely when the error file has no rows.
for (r in seq_len(nrow(c_errors))) {
  # The c_err* variables are set as globals before the helper script is
  # sourced (presumably it reads them -- confirm).
  c_errcode <- c_errors$code[r]
  c_errissue <- c_errors$issue[r]
  c_errtags <- c_errors$tags[r]
  c_errtext <- c_errors$text[r]
  c_errid <- c_errors$str_id[r]
  # The first literal contains an embedded line break, kept exactly as in
  # the original file.
  cat("===============\n\n[TWEET]: 
", c_errtext, "\n[ISSUES]: ", c_errissue, " (", c_errtags, ")\n", sep = "")
  source("issuecomp-codingsample-function2.R")
}


# Now import the error files in a human readable data frame to improve the
# issue database

# All tweets with WRONG ISSUES
c_tmp <- read.csv("issuecomp-codingsample-error1.csv", header = FALSE,
                  colClasses = "character")
names(c_tmp) <- c("str_id", "all", "wrong", "tagged", "text")
c_error1 <- c_tmp[, c("wrong", "tagged", "all", "text")]

# All tweets with MISSING ISSUES
c_tmp <- read.csv("issuecomp-codingsample-error2.csv", header = FALSE,
                  colClasses = "character")
names(c_tmp) <- c("str_id", "all", "missing", "tagged", "text")
c_error2 <- c_tmp[, c("missing", "text", "tagged", "all")]

# All CORRECT tweets
c_tmp <- read.csv("issuecomp-codingsample-correct.csv", header = FALSE,
                  colClasses = "character")
names(c_tmp) <- c("str_id", "status", "issue", "tags", "text")
c_correct <- c_tmp