second run; improving behaviour at different places
This commit is contained in:
@@ -21,7 +21,7 @@ drange <- date_start + days(0:drange)
|
||||
# Import issues and prepare everything
|
||||
# Will only be filled after the large categorisation loop
|
||||
issues <- data.frame(date = drange)
|
||||
issuelist <- readLines("issues.xml")
|
||||
issuelist <- readLines("issues-v2.xml")
|
||||
issuelist <- str_replace_all(string = issuelist, pattern = ".*<!-- .+ -->", "")
|
||||
issuelist <- xmlToList(issuelist)
|
||||
issueheads <- names(issuelist)
|
||||
@@ -66,7 +66,8 @@ foreach(d = 1:nrow(issues), .packages = c("stringr"), .combine=rbind) %dopar% {
|
||||
for(i in 1:length(issueheads)) {
|
||||
curissue <- issueheads[i]
|
||||
curtags <- as.character(issuelist[[curissue]])
|
||||
curfile <- str_c(id_folder,"/",curissue,".csv")
|
||||
# curfile <- str_c(id_folder,"/",curissue,".csv")
|
||||
curfile <- str_c(id_folder,"/",curdate,".csv") # Possible solution to avoid buggy files when using many processes
|
||||
|
||||
# Now test all tags of a single issue
|
||||
for(s in 1:length(curtags)) {
|
||||
@@ -144,8 +145,9 @@ stopCluster(cl)
|
||||
# IMPORT RESULTS ----------------------------------------------------------
|
||||
|
||||
# Import all files that were generated during the categorisation run above.
|
||||
setwd("matched-ids/")
|
||||
results_files <- list.files()
|
||||
#setwd("matched-ids/")
|
||||
#results_files <- list.files()
|
||||
results_files <- "matched-ids/all.csv"
|
||||
for(r in 1:length(results_files)) {
|
||||
if(r == 1) {
|
||||
results <- read.csv(results_files[r], sep=";", colClasses=c("character", "character", "character", "character"), header=F)
|
||||
@@ -166,15 +168,15 @@ row.names(results) <- NULL
|
||||
# (which wasn't possible in the categorisation process because of parallelisation)
|
||||
|
||||
# Reset issues counter
|
||||
# issues[issueheads] <- 0
|
||||
#issues[issueheads] <- 0
|
||||
|
||||
for(r in 1:nrow(results)) {
|
||||
for(r in 33170:nrow(results)) {
|
||||
curdate <- as.character(results$date[r])
|
||||
curid <- as.character(results$id_str[r])
|
||||
curissue <- as.character(results$issue[r])
|
||||
curtag <- as.character(results$tags[r])
|
||||
|
||||
cat("Sorting match", r, "of 62827 \n")
|
||||
cat("Sorting match", r, "of 53383 \n")
|
||||
|
||||
# Update issue counter (date and issue)
|
||||
issues[issues[, "date"] == curdate, curissue] <- issues[issues[, "date"] == curdate, curissue] + 1
|
||||
|
||||
Reference in New Issue
Block a user