uni-surprising-newsfocus/R-Code/newsfokus-3-analysis.R

require(plyr)
require(dplyr)
require(stringr)


# PREPARATIONS ------------------------------------------------------------


no <- 0
for (r in 1:nrow(cl)) {
  for (c in 3:ncol(cl)) {
    no <- no + as.numeric(cl[r,c])
  }
}
cat("Sum of country news entries:",no)
rm(r,c,no)

# Save old cl-dataframe for next steps
cl_bak <- cl

# Merge appearances for same country code (but different name)
cl <- ddply(cl,"code",numcolwise(sum))
target <- which(names(cl) == 'code')[1]
cl <- cbind(cl[,1:target,drop=F], data.frame(name="PLACEHOLDER"), cl[,(target+1):length(cl),drop=F])
rm(target)
cl$name <- "bla"

# Now choose names for the according codes (first fits)
for (r in 1:nrow(cl)) {
  code <- as.character(cl$code[r])
  code <- namibiaBug(code)
  r2 <- 1

  repeat {
    code2 <- as.character(cl_bak$code[r2])
    code2 <- namibiaBug(code2)

    if (code2 == code) {
      name <- as.character(cl_bak$name[r2])
      cl$name[r] <- name
      break
    }
    else {
      r2 <- r2 + 1
    }
  }
  rm(r, r2, code, code2, name)
}


# Summarize all counts for each country
cl_stats <- cl[,1:2]
cl_stats["overall"] <- 0
overall <- NULL
for (r in 1:nrow(cl)) {
  overall[r] <- 0
  for (c in 3:ncol(cl)) {
    overall[r] <- overall[r] + as.numeric(cl[r,c])
  }
  cl_stats$overall[r] <- overall[r]
}
rm(overall, r, c)


# # If we would want the overall-counter in cl
# cl["overall"] <- 0
# cl$overall <- cl_stats$overall


# # Get all names for one country code (if there are multiple)
# for (r in 1:nrow(cl)) {
#   name <- NULL
#   no <- 0
#   code <- as.character(cl$code[r])
#   code <- namibiaBug(code)
#   r2 <- 1
#   repeat {
#     if (r2 > nrow(cl_bak)) {
#       break  # only end if no next row
#     }
#     code2 <- as.character(cl_bak$code[r2])
#     code2 <- namibiaBug(code2)
#     if (code2 == code) {
#       no <- no + 1
#       name[no] <- as.character(cl_bak$name[r2])
#       r2 <- r2 + 1
#     }
#     else {
#       r2 <- r2 + 1
#     }
#   }
#   if (length(name) > 1) {
#     cat("For",code,"there are", no, "names:", name, "\n")
#   }
#   rm(r, name, no, code, r2, code2)
# }

# Old dataframe not needed anymore
rm(cl_bak)


# Calculate the total and average news entries for each year
years <- 2000:2014
# Search string for str_detect for every year
year_str <- sprintf("^\\d{1,2}\\.%s", years)

for (r in 1:nrow(cl)) {
  total <- 0
  average <- 0
  for (y in 1:length(years)) {
    months <- 0
    for (c in 1:ncol(cl)) {
      if (str_detect(names(cl)[c], year_str[y])) {
        total <- total + as.numeric(cl[r,c])
        months <- months + 1
      }
    }
    colnametotal <- str_c(years[y],"-total")
    colnameaverg <- str_c(years[y],"-averg")
    average <- round(total / months, 4)
    cl_stats[r, colnametotal] <- total
    cl_stats[r, colnameaverg] <- average
    total <- 0
    average <- 0
  }
  rm(r, total, average, y, months, c, colnametotal, colnameaverg)
}
rm(years, year_str)


# IDENTIFY SURPRISING NEWSFOCUS -------------------------------------------


# Land war 3x öfter genannt als im Monat davor, aber mehr als 50x
no <- 0
for (c in 4:ncol(cl)) { # starting 1 month later
  for (r in 1:nrow(cl)) {
    month <- names(cl)[c]

    # Conditions to fulfill
    status1 <- cl[r,c] > 3 * cl[r,c-1]
    status2 <- cl[r,c] > 50

    if (status1 && status2) {
      no <- no + 1
      cat("[",no,"] ",as.character(cl$code[r]),": 3x m-1 && >50 in: ", month,"\n", sep = "")
    }
  }
  rm(r,c,month,status1,status2)
}
rm(no)


# Land wurde in einem Monat 3x öfter als im Jahresdurchschnitt genannt
no <- 0
for (c in 3:ncol(cl)) {
  for (r in 1:nrow(cl)) {
    month <- names(cl)[c]
    year <- str_extract(month, "\\d{4}")
    averg <- str_c(year,"-averg")
    averg <- cl_stats[r,averg]

    # Conditions to fulfill
    status1 <- cl[r,c] > 3 * averg
    status2 <- cl[r,c] > 50

    if (status1 && status2) {
      no <- no + 1
      cat("[",no,"] ",as.character(cl$code[r]),": 3x year average && >50 ", month,"\n", sep = "")    }
  }
  rm(r,c,month,year,averg,status1,status2)
}
rm(no)

# Final method: Land in einem Monat öfter genannt als alle 3 Monate davor zusammen
cl_supfoc_mon <- data.frame(code=NA,name=NA)
cl_supfoc_mon[sprintf("%s", tspans)] <- 0
no <- 0
for (c in 6:ncol(cl)) {
  for (r in 1:nrow(cl)) {
    month <- names(cl)[c]
    code  <- as.character(cl$code[r])
    name  <- as.character(cl$name[r])

    # Conditions to fulfill
    status1 <- cl[r,c] > cl[r,c-1]+cl[r,c-2]+cl[r,c-3]
    status2 <- cl[r,c] > 50

    if (status1 && status2) {
      no <- no + 1
      cl_supfoc_mon[no, "code"] <- code
      cl_supfoc_mon$name[no] <- name
      #if (is.null(cl_supfoc_mon[no, month])) { cl_supfoc_mon[no, month] <- 0}
      #cl_supfoc_mon[no, month] <- cl_supfoc_mon[no, month] + 1
      cl_supfoc_mon[no, month] <- 1
      cat("[",no,"] ",as.character(cl$code[r]),": >m-(1:3) && >50 ", month,"\n", sep = "")
    }
  }
  rm(r,c,month,status1,status2)
}
rm(no, code, name)

# Clean cl_supfoc_mon: Replace NAs by 0, and sum up multiple appeared countries
cl_supfoc_mon[is.na(cl_supfoc_mon)] <- 0
cl_supfoc_mon <- ddply(cl_supfoc_mon,c("code", "name"),numcolwise(sum))

# Delete all month-columns with 0 surprising events
cl_supfoc_only_mon <- removeZeroMonths(cl_supfoc_mon, 3, ncol(cl_supfoc_mon))


# Get total surprising newsfocuses for each country
cl_supfoc_total <- data.frame(code=NA, name=NA, total=NA)
for (r in 1:nrow(cl_supfoc_mon)) {
  total <- 0
  cl_supfoc_total[r,"code"] <- as.character(cl_supfoc_mon$code[r])
  cl_supfoc_total[r,"name"] <- as.character(cl_supfoc_mon$name[r])
  for (c in 3:ncol(cl_supfoc_mon)) {
    total <- total + as.numeric(cl_supfoc_mon[r,c])
  }
  cl_supfoc_total[r,"total"] <- total
}
rm(r, total, c)


# Total highlights per month + turn around table for graphs
cl_supfoc_mon["highlight"] <- 1
cl_supfoc_turn_mon <- ddply(cl_supfoc_mon,"highlight", numcolwise(sum))
cl_supfoc_mon$highlight <- NULL
cl_supfoc_turn_mon$highlight <- NULL
cl_supfoc_turn_mon <- data.frame(month = names(cl_supfoc_turn_mon), highs = as.numeric(cl_supfoc_turn_mon[1,]))
# Convert %d.%y to valid date class
months <- NULL
for (m in 1:length(tspans)) {
  dates <- str_c("15.",tspans[m])
  months[m] <- dates
}
rm(m, dates)
cl_supfoc_turn_mon$month <- as.Date(months, format = "%d.%m.%Y")
rm(months)

# Delete all month-rows with 0 surprising events
cl_supfoc_turn_only_mon <- cl_supfoc_turn_mon[!cl_supfoc_turn_mon$highs == 0,]
rownames(cl_supfoc_turn_only_mon) <- NULL


# # Replace 0s by NAs
# for (r in 1:180) {
#   if (! is.na(cl_total2$highs[r])) {
#     if (cl_total2$highs[r] == 0) {
#       cl_total2$highs[r] <- NA
#     }
#   }
# }