Second run; improve behaviour in several places

This commit is contained in:
2015-03-06 13:40:22 +02:00
parent e3ff17df48
commit 42bfe4c773
5 changed files with 233 additions and 231 deletions
+9 -7
View File
@@ -21,7 +21,7 @@ drange <- date_start + days(0:drange)
# Import issues and prepare everything
# Will only be filled after the large categorisation loop
issues <- data.frame(date = drange)
issuelist <- readLines("issues.xml")
issuelist <- readLines("issues-v2.xml")
issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
issuelist <- xmlToList(issuelist)
issueheads <- names(issuelist)
@@ -66,7 +66,8 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
for(i in 1:length(issueheads)) {
curissue <- issueheads[i]
curtags <- as.character(issuelist[[curissue]])
curfile <- str_c(id_folder,"/",curissue,".csv")
# curfile <- str_c(id_folder,"/",curissue,".csv")
curfile <- str_c(id_folder,"/",curdate,".csv") # Name the output file by date instead of by issue; possible fix for the buggy files produced when using many processes
# Now test all tags of a single issue
for(s in 1:length(curtags)) {
@@ -144,8 +145,9 @@ stopCluster(cl)
# IMPORT RESULTS ----------------------------------------------------------
# Import all files that were generated during the categorisation run above.
setwd("matched-ids/")
results_files <- list.files()
#setwd("matched-ids/")
#results_files <- list.files()
results_files <- "matched-ids/all.csv"
for(r in 1:length(results_files)) {
if(r == 1) {
results <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
@@ -166,15 +168,15 @@ row.names(results) <- NULL
# (which wasn't possible in the categorisation process because of parallelisation)
# Reset issues counter
# issues[issueheads] <- 0
#issues[issueheads] <- 0
for(r in 1:nrow(results)) {
for(r in 33170:nrow(results)) {
curdate <- as.character(results$date[r])
curid <- as.character(results$id_str[r])
curissue <- as.character(results$issue[r])
curtag <- as.character(results$tags[r])
cat("Sorting match", r, "of 62827 \n")
cat("Sorting match", r, "of 53383 \n")
# Update issue counter (date and issue)
issues[issues[, "date"] == curdate, curissue] <- issues[issues[, "date"] == curdate, curissue] + 1