initial commit v1.0
This commit is contained in:
230
R-Code/imprintanalyse.R
Normal file
230
R-Code/imprintanalyse.R
Normal file
@@ -0,0 +1,230 @@
# Dependencies: HTML parsing (XML), string handling (stringr), HTTP requests
# (RCurl), plotting (ggplot2), and HTML/LaTeX table export (xtable).
library(XML)
library(stringr)
library(RCurl)
library(ggplot2)
library(xtable)

# NOTE(review): hard-coded, user-specific working directory. Every relative
# path below ("links.txt", downloaded pages, output figures/tables) resolves
# against it, so the script only runs on this machine as-is.
setwd(dir="~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 1 - Impressum")
# COLLECTION OF LINKS -----------------------------------------------------

# "links.txt" holds one "url;category" record per line.
links <- readLines("links.txt")

# Split each record at the ";" separator: everything before it is the link,
# everything after it is the dmoz category label.
# str_extract()/str_replace() are vectorised, so the original element-wise
# loop (which also grew both result vectors one element at a time) is
# unnecessary; these two lines apply the exact same regexes per element.
dmozlinks <- str_replace(str_extract(links, "(.+)?;"), ";", "")
dmozcat <- str_replace(str_extract(links, ";(.+)?"), ";", "")

dmozDF <- data.frame(cat = dmozcat, link = dmozlinks)
# FUNCTIONS ---------------------------------------------------------------

# Shared curl handle: identify ourselves to the crawled servers via the
# user-agent (the running R version string) and a From: contact header.
# (The original wrapped the version string in str_c(), which is a no-op on
# a single string and has been dropped.)
handle <- getCurlHandle(useragent = R.version$version.string, httpheader = c(from = "mail@mehl.mx"))

# Load helper functions (urlCheck, urlToPath, downloadFile, downloadContact,
# relToAbsUrl, ...) from an external file in the working directory.
source("imprintanalyse-functions.R")
# PREPARATIONS ------------------------------------------------------------

# Get all links and titles from the dmoz category pages and collect them in
# one data frame `df`. Rows are accumulated in a preallocated list and bound
# once at the end: calling rbind() inside the loop (as the original did)
# copies the whole frame on every iteration, and the empty-frame-then-delete
# -row construction becomes unnecessary.
pages <- vector("list", nrow(dmozDF))
for (i in seq_len(nrow(dmozDF))) {
  dmozpage <- htmlParse(dmozDF$link[i])
  cat <- dmozDF$cat[i]
  # Kind of website = leading word of the category label (e.g. "blogs").
  kind <- str_extract(cat, "[[:alpha:]]+")
  # Site titles and hrefs from the dmoz directory listing markup.
  title <- xpathSApply(dmozpage, path="//ul[@class='directory-url']/li/a", xmlValue)
  link <- xpathSApply(dmozpage, path="//ul[@class='directory-url']/li/a", xmlGetAttr, "href")
  pages[[i]] <- data.frame(kind=kind, cat=cat, title=title, link=link)
}
df <- do.call(rbind, pages)

# Clean up variables/data frames used above
rm(links, dmozcat, dmozlinks, dmozDF, pages, i, dmozpage, kind, cat, title, link)
# DOWNLOAD INDEXES --------------------------------------------------------

# Check all links, then transform all links to valid filenames and generate
# their local destination. All files will be downloaded to a folder named
# by their kind of website/category.

# Check all links and write the status into the data frame. This is
# important to filter out unreachable websites in the first step.
# seq_len(nrow(df)) replaces 1:nrow(df) so a zero-row frame iterates zero
# times instead of over c(1, 0); the per-iteration rm() calls were pointless
# (the variables are reassigned each pass) and happen once after the loop.
for (i in seq_len(nrow(df))) {
  status <- urlCheck(as.character(df$link[i]), i)
  cat(as.character(df$title[i]), "is online:", status, "\n")
  df$ison[i] <- status
}
rm(i, status)
# Convert links to file paths (and create the target directories).
# urlToPath() comes from imprintanalyse-functions.R; its vectorisation
# behaviour is unknown, so the per-row loop is kept, but with seq_len()
# instead of the zero-row-unsafe 1:nrow(df), and cleanup after the loop.
for (i in seq_len(nrow(df))) {
  df$filepath[i] <- urlToPath(df$link[i], df$kind[i], df$cat[i])
}
rm(i)
# Download all links and put them in a category folder
for (i in 1:nrow(df)) {
  cat("Downloading entry", i, "\n")
  fileurl <- as.character(df$link[i])
  destfile <- df$filepath[i]
  # A caught download error prints its message; since cat() returns NULL,
  # the tryCatch() then evaluates to NULL. On success, var holds whatever
  # downloadFile() returns (defined in imprintanalyse-functions.R).
  var <- tryCatch({downloadFile(fileurl, destfile, i)}, error=function(e){cat("ERROR :", conditionMessage(e), "\n")})
  # NOTE(review): this marks an entry offline when var is NON-NULL, i.e.
  # when downloadFile() itself returned a value -- while a caught error
  # yields NULL and is NOT marked. That only matches the message below if
  # downloadFile() returns NULL on success and a non-NULL error indicator
  # otherwise; verify against imprintanalyse-functions.R, else the
  # condition is inverted.
  if (!is.null(var)) {
    cat("Entry", i, "throws out an download error. Marking as offline\n")
    df$ison[i] <- FALSE
  }
  rm(destfile, fileurl, i, var) # clean up after each loop
}
# CHECK FOR IMPRESS -------------------------------------------------------

# Check all downloaded files for an imprint and record the result in df.

# Create the (still empty) result columns. A single vectorised assignment
# fills each whole column with NA; the original per-row loop was redundant
# and additionally assigned conhasimpress without the [i] index, rewriting
# the entire column on every iteration. The end state is identical.
df$hasimpress <- NA
df$conhasimpress <- NA
df$hascontact <- NA
df$conlink <- NA
df$conpath <- NA

# Case variants of the link texts that identify imprint / contact pages
# (XPath 1.0 contains() is case-sensitive, hence the explicit variants).
impressstring <- c("Impressum", "impressum", "IMPRESSUM", "Impress", "impress", "IMPRESS", "Imprint", "imprint", "IMPRINT")
contactstring <- c("Kontakt", "kontakt", "KONTAKT", "Contact", "contact", "CONTACT")
# Scan every downloaded index page for an imprint link; when none is found,
# fall back to looking for a contact-page link instead. Results go into
# df$hasimpress / df$hascontact.
for (i in 1:nrow(df)) {
  if (file.exists(df$filepath[i])) { # only check for imprint, if page has been downloaded
    file_parsed <- htmlParse(df$filepath[i])
    # NOTE(review): impressstring has 9 elements, so sprintf() returns a
    # vector of 9 XPath queries here. Verify that XML's xpathSApply()
    # really evaluates all of them (it may use only the first), otherwise
    # the case variants beyond "Impressum" are never matched.
    impress <- unlist(xpathSApply(file_parsed, sprintf("//a[contains(text(), '%s')]", impressstring)))
    # Second pattern: link text inside a child element of <a>,
    # e.g. <a><span>Impressum</span></a>.
    impress2 <- unlist(xpathSApply(file_parsed, sprintf("//a/*[contains(text(), '%s')]", impressstring)))
    # Two ways to find an imprint link. Only one has to be valid
    if (!is.null(impress) || !is.null(impress2)) {impress <- "not null"} else {impress <- NULL}
    if (!is.null(impress)) { # index page has imprint
      df$hasimpress[i] <- TRUE
      # Once an imprint is found, the contact page is not searched for.
      df$hascontact[i] <- FALSE
      cat(as.character(df$title[i]), "has an imprint\n")
    }
    else { # there's no imprint-link. Now look for contact page
      cat(as.character(df$title[i]), "has no imprint, checking for contact page\n")
      contact <- unlist(xpathSApply(file_parsed, sprintf("//a[contains(text(), '%s')]", contactstring)))
      if (is.null(contact)) { # there's no imprint and no contact page
        cat(as.character(df$title[i]), "has no contact page either\n")
        df$hasimpress[i] <- FALSE
        df$hascontact[i] <- FALSE
      }
      else { # there's no imprint but a contact page
        # hasimpress stays NA here until the contact page itself is
        # analysed in the next loop below.
        df$hascontact[i] <- TRUE
        cat(as.character(df$title[i]), "has an contact page\n")
      }
      rm(contact)
    }
    rm(file_parsed, i, impress, impress2) # clean up after each loop
  }
}
# For every site whose index page had a contact link but no imprint:
# resolve the contact URL, download the contact page, and check IT for an
# imprint. Fills df$conlink, df$conpath, df$conhasimpress, df$hasimpress.
for (i in 1:nrow(df)) {
  if (file.exists(df$filepath[i])) {
    if (df$hascontact[i]) {
      cat("Processing item", i, "\n")
      file_parsed <- htmlParse(df$filepath[i])
      # NOTE(review): if more than one contact link matches, the RHS has
      # length > 1 and this single-element assignment errors; this assumes
      # exactly one match per page. Also, as in the loop above, sprintf()
      # over the 6-element contactstring yields a vector of XPath queries
      # whose handling by xpathSApply() should be verified.
      df$conlink[i] <- unlist(xpathSApply(file_parsed, sprintf("//a[contains(text(), '%s')]", contactstring), xmlGetAttr, "href"))
      # Make a relative contact href absolute against the site's base URL
      # (relToAbsUrl from imprintanalyse-functions.R).
      df$conlink[i] <- relToAbsUrl(abslink=df$link[i], rellink=df$conlink[i])
      df$conpath[i] <- urlToPath(df$conlink[i], df$kind[i], df$cat[i])
      if (urlCheck(df$conlink[i])) {
        downloadContact(df$conlink[i], df$conpath[i], i)
        # Now analyse the (already) downloaded contact page.
        # NOTE(review): assumes downloadContact() succeeded and wrote
        # df$conpath[i]; htmlParse() will fail otherwise.
        contact_parsed <- htmlParse(df$conpath[i])
        contactimpress <- unlist(xpathSApply(contact_parsed, sprintf("//*[contains(text(), '%s')]", impressstring)))
        if (!is.null(contactimpress)) { # there's a imprint on the contact page
          cat(as.character(df$title[i]), "has an contact page with imprint\n")
          df$hasimpress[i] <- TRUE
          df$conhasimpress[i] <- TRUE
        }
        else { # there's no imprint on the contact page
          cat(as.character(df$title[i]), "has an contact page WITHOUT imprint\n")
          df$hasimpress[i] <- FALSE
          df$conhasimpress[i] <- FALSE
        }
        rm(contact_parsed, contactimpress)
      }
      else {
        # Contact page unreachable: count the site as having no imprint.
        # (conhasimpress is left NA in this branch.)
        cat(as.character(df$title[i]), "has an contact page, which is offline\n")
        df$hasimpress[i] <- FALSE
      }
      rm(file_parsed)
    }
  }
  rm(i)
}
# ANALYSIS ----------------------------------------------------------------

# Convert df to a tibble for nicer printing. The original called tbl_df()
# unconditionally although dplyr is never loaded above, which would abort
# the script at this point; only convert when dplyr is actually installed
# (and use as_tibble(), since tbl_df() is deprecated). Everything below
# works on a plain data.frame as well, so this step is optional.
if (requireNamespace("dplyr", quietly = TRUE)) {
  df <- dplyr::as_tibble(df)
}

# First let's export the data frame as a backup, before rows are dropped.
write.table(df, "df_backup.txt", sep=";")
# Cleaning offline (and therefore not analysed) websites to remove the NAs
# from the analysis.
# BUG FIX: the original deleted rows with df[-i,] inside
# "for (i in 1:nrow(df))". Each deletion shifts the remaining rows down, so
# the row following a deleted one is skipped and i eventually indexes past
# the shrunken frame. A single vectorised filter removes every NA row
# correctly in one pass.
offline <- is.na(df$hasimpress)
cat("Deleting", sum(offline), "offline (never analysed) entries\n")
df <- df[!offline, ]
rm(offline)
# Removing websites with .ch or .at domain, as they most likely are not run
# by a German individual or company (so no German imprint obligation).
# BUG FIX: like the NA cleanup above, the original removed rows while
# iterating over 1:nrow(df), skipping the row after each deletion -- hence
# its advice to "run multiple times until no status message returns". A
# vectorised filter handles all rows deterministically in one pass.
is_ch_at <- str_detect(as.character(df$link), "(\\.ch)|(\\.at)")
if (any(is_ch_at)) {
  cat("The following entries are from CH or AT. Therefore deleting them:\n")
  cat(as.character(df$link)[is_ch_at], sep = "\n")
}
df <- df[!is_ch_at, ]
rm(is_ch_at)
## HISTOGRAM: share of pages with an imprint, per kind of website
histo <- ggplot(df,aes(kind))
# Alternative x-axis: break the shares down per dmoz category instead.
#histo <- ggplot(df,aes(cat))

# Normalised stacked bars (position = "fill" scales each bar to 100 %);
# axis/legend labels are in German for the accompanying article.
# NOTE(review): geom_histogram() on a discrete x works in older ggplot2,
# but newer versions expect geom_bar() here -- verify against the installed
# ggplot2 version.
histo + geom_histogram(aes(fill = hasimpress), position = "fill") + xlab("Art der Webseite") + ylab("Prozentualer Anteil von Impressen") + ggtitle("Anteil von Impressen auf\n verschiedenen Webseiten-Typen") + guides(fill=guide_legend(title=NULL)) + scale_fill_discrete(breaks=c(TRUE, FALSE), labels=c("Impressum vorhanden", "Kein Impressum"))

# Save the last plot in vector (PDF) and raster (PNG) form.
ggsave(file="abbildung1mehl.pdf")
ggsave(file="abbildung1mehl.png")
## TABLE1: per-kind counts and share of sites that carry an imprint
site_types <- c("blogs", "foren", "shops", "news")
table1 <- data.frame(kind = site_types)

# Fill one summary row per site type: total sites, sites with / without an
# imprint, and the percentage that have one.
for (row in seq_len(nrow(table1))) {
  this_kind <- table1$kind[row]
  table1$total[row] <- nrow(subset(df, kind == this_kind))
  table1$hasimpress[row] <- nrow(subset(df, kind == this_kind & hasimpress == TRUE))
  table1$noimpress[row] <- nrow(subset(df, kind == this_kind & hasimpress == FALSE))
  share <- round(table1$hasimpress[row] / table1$total[row] * 100, 2)
  table1$pimpress[row] <- str_c(share, " %")
}
rm(row, this_kind, share, site_types)

# German column headers and display names for the exported HTML table.
names(table1) <- c("Seitentyp", "Anzahl", "Mit Impressum", "Ohne Impressum", "Anteil der Seiten mit Impressum")
table1[,1] <- c("Blogs", "Foren", "Shops", "News-Seiten")
print(xtable(table1), type="html", file="tabelle1mehl.html")
Reference in New Issue
Block a user