# Imprint ("Impressum") analysis of DMOZ-listed German websites.
# Pipeline: collect links per DMOZ category -> download each site's index
# page -> check it for an imprint (or, failing that, a contact page with an
# imprint) -> filter the sample -> plot and tabulate the results.

library(XML)
library(stringr)
library(RCurl)
library(ggplot2)
library(xtable)
library(dplyr)   # required for tbl_df() below; was missing in the original

# NOTE(review): setwd() in a script is fragile (breaks on other machines);
# kept because the relative paths below depend on it — consider here::here().
setwd(dir = "~/Dokumente/Uni/Aktuell/Datenerhebung WWW/Blogartikel 1 - Impressum")

# COLLECTION OF LINKS -----------------------------------------------------

# Each line of links.txt is "<dmoz category url>;<category label>".
# str_extract/str_replace are vectorized, so no per-element loop is needed.
links <- readLines("links.txt")
dmozlinks <- str_replace(str_extract(links, "(.+)?;"), ";", "")
dmozcat <- str_replace(str_extract(links, ";(.+)?"), ";", "")
dmozDF <- data.frame(cat = dmozcat, link = dmozlinks)

# FUNCTIONS ---------------------------------------------------------------

# Identify ourselves politely to the servers we scrape.
handle <- getCurlHandle(useragent = str_c(R.version$version.string),
                        httpheader = c(from = "mail@mehl.mx"))

# Load helper functions (urlCheck, urlToPath, downloadFile, relToAbsUrl,
# downloadContact) from external file.
source("imprintanalyse-functions.R")

# Run an XPath query whose predicate matches ANY of the given strings.
# The original code did sprintf(template, <character vector>), which produces
# a *vector* of XPath expressions — xpathSApply() expects a single path, so
# only the first variant ("Impressum"/"Kontakt") could ever take effect.
# Joining the alternatives with "or" into one predicate fixes that.
#   doc          parsed HTML document
#   pathTemplate XPath with a %s placeholder for the predicate, e.g. "//a[%s]"
#   strings      character vector of texts to match via contains(text(), ...)
#   ...          forwarded to xpathSApply (e.g. xmlGetAttr, "href")
xpathAny <- function(doc, pathTemplate, strings, ...) {
  predicate <- str_c(sprintf("contains(text(), '%s')", strings),
                     collapse = " or ")
  xpathSApply(doc, sprintf(pathTemplate, predicate), ...)
}

# PREPARATIONS ------------------------------------------------------------

# Get all links and titles from dmoz category pages.  Per-category results
# are collected in a preallocated list and bound once at the end — growing a
# data frame with rbind() inside the loop is O(n^2).
rows <- vector("list", nrow(dmozDF))
for (i in seq_len(nrow(dmozDF))) {
  dmozpage <- htmlParse(dmozDF$link[i])
  cat <- dmozDF$cat[i]
  kind <- str_extract(cat, "[[:alpha:]]+")  # e.g. "blogs2" -> kind "blogs"
  title <- xpathSApply(dmozpage,
                       path = "//ul[@class='directory-url']/li/a", xmlValue)
  link <- xpathSApply(dmozpage,
                      path = "//ul[@class='directory-url']/li/a",
                      xmlGetAttr, "href")
  rows[[i]] <- data.frame(kind = kind, cat = cat, title = title, link = link)
}
df <- do.call(rbind, rows)

# Clean up variables/data frames used above
rm(links, dmozcat, dmozlinks, dmozDF, rows, i, dmozpage, kind, cat, title, link)

# DOWNLOAD INDEXES --------------------------------------------------------
# Check all links, then transform all links to valid filenames and generate
# their local destination.  All files are downloaded into a folder named by
# their kind of website/category.

# Check all links and record reachability — unreachable sites are filtered
# out of the analysis later.
for (i in seq_len(nrow(df))) {
  status <- urlCheck(as.character(df$link[i]), i)
  cat(as.character(df$title[i]), "is online:", status, "\n")
  df$ison[i] <- status
}

# Convert links to file paths and create directories.
for (i in seq_len(nrow(df))) {
  df$filepath[i] <- urlToPath(df$link[i], df$kind[i], df$cat[i])
}

# Download all links into their category folder.  BUG FIX: the original
# error handler returned cat(...), which is invisible(NULL), so
# "!is.null(var)" could never flag a failed download.  The handler now
# returns the condition message (non-NULL) and success returns an explicit
# NULL, so the offline marking actually works.
for (i in seq_len(nrow(df))) {
  cat("Downloading entry", i, "\n")
  fileurl <- as.character(df$link[i])
  destfile <- df$filepath[i]
  var <- tryCatch({
    downloadFile(fileurl, destfile, i)
    NULL                                  # non-NULL only signals failure
  }, error = function(e) {
    cat("ERROR :", conditionMessage(e), "\n")
    conditionMessage(e)
  })
  if (!is.null(var)) {
    cat("Entry", i, "throws out an download error. Marking as offline\n")
    df$ison[i] <- FALSE
  }
}

# CHECK FOR IMPRESS -------------------------------------------------------
# Check all downloaded files for an imprint and record the result in df.

# Create empty result columns.  Vectorized — the original per-row loop was
# unnecessary and contained a typo (df$conhasimpress without [i]) that
# rewrote the whole column on every iteration.
df$hasimpress <- NA
df$conhasimpress <- NA
df$hascontact <- NA
df$conlink <- NA
df$conpath <- NA

# Case variants to match; combined into one XPath predicate by xpathAny().
impressstring <- c("Impressum", "impressum", "IMPRESSUM",
                   "Impress", "impress", "IMPRESS",
                   "Imprint", "imprint", "IMPRINT")
contactstring <- c("Kontakt", "kontakt", "KONTAKT",
                   "Contact", "contact", "CONTACT")

for (i in seq_len(nrow(df))) {
  if (!file.exists(df$filepath[i])) next  # only check downloaded pages
  file_parsed <- htmlParse(df$filepath[i])
  # Two ways to find an imprint link (text directly inside <a>, or inside a
  # child element of <a>).  Only one has to match.
  impress <- unlist(xpathAny(file_parsed, "//a[%s]", impressstring))
  impress2 <- unlist(xpathAny(file_parsed, "//a/*[%s]", impressstring))
  if (!is.null(impress) || !is.null(impress2)) {
    # Index page has an imprint.
    df$hasimpress[i] <- TRUE
    df$hascontact[i] <- FALSE
    cat(as.character(df$title[i]), "has an imprint\n")
  } else {
    # No imprint link — look for a contact page instead.
    cat(as.character(df$title[i]), "has no imprint, checking for contact page\n")
    contact <- unlist(xpathAny(file_parsed, "//a[%s]", contactstring))
    if (is.null(contact)) {
      # Neither imprint nor contact page.
      cat(as.character(df$title[i]), "has no contact page either\n")
      df$hasimpress[i] <- FALSE
      df$hascontact[i] <- FALSE
    } else {
      # No imprint but a contact page — inspected in the next pass.
      df$hascontact[i] <- TRUE
      cat(as.character(df$title[i]), "has an contact page\n")
    }
  }
}

# For sites with a contact page but no imprint on the index page, download
# the contact page and look for an imprint there.
for (i in seq_len(nrow(df))) {
  if (file.exists(df$filepath[i]) && isTRUE(df$hascontact[i])) {
    cat("Processing item", i, "\n")
    file_parsed <- htmlParse(df$filepath[i])
    # [1]: keep only the first matching href — assigning a multi-element
    # vector to a single cell would error (the original had this latent bug).
    df$conlink[i] <- unlist(xpathAny(file_parsed, "//a[%s]", contactstring,
                                     xmlGetAttr, "href"))[1]
    df$conlink[i] <- relToAbsUrl(abslink = df$link[i], rellink = df$conlink[i])
    df$conpath[i] <- urlToPath(df$conlink[i], df$kind[i], df$cat[i])
    # NOTE(review): urlCheck() is called elsewhere with a second argument i;
    # kept single-argument here as in the original — confirm the signature.
    if (urlCheck(df$conlink[i])) {
      downloadContact(df$conlink[i], df$conpath[i], i)
      # Analyse the just-downloaded contact page for any imprint mention.
      contact_parsed <- htmlParse(df$conpath[i])
      contactimpress <- unlist(xpathAny(contact_parsed, "//*[%s]",
                                        impressstring))
      if (!is.null(contactimpress)) {
        # Imprint found on the contact page.
        cat(as.character(df$title[i]), "has an contact page with imprint\n")
        df$hasimpress[i] <- TRUE
        df$conhasimpress[i] <- TRUE
      } else {
        # Contact page exists but carries no imprint.
        cat(as.character(df$title[i]), "has an contact page WITHOUT imprint\n")
        df$hasimpress[i] <- FALSE
        df$conhasimpress[i] <- FALSE
      }
    } else {
      cat(as.character(df$title[i]), "has an contact page, which is offline\n")
      df$hasimpress[i] <- FALSE
    }
  }
}

# ANALYSIS ----------------------------------------------------------------

df <- tbl_df(df)

# Export the full data frame before filtering, as a backup.
write.table(df, "df_backup.txt", sep = ";")

# Remove offline (and therefore not analysed) websites.  BUG FIX: the
# original deleted rows inside a 1:nrow(df) loop, which skips the element
# after every deletion (hence the "run multiple times" workaround comment);
# a vectorized filter removes all matching rows in a single pass.
df <- df[!is.na(df$hasimpress), ]

# Remove .ch / .at domains: most likely not run by a German individual or
# company.  Same vectorized fix as above.
df <- df[!str_detect(as.character(df$link), "(\\.ch)|(\\.at)"), ]

## HISTOGRAM
# Share of sites with an imprint, by kind of website.
# NOTE(review): geom_histogram() on a discrete axis errors in current
# ggplot2 — geom_bar(position = "fill") is the modern equivalent; kept
# as-is to preserve the original's rendering environment.
histo <- ggplot(df, aes(kind))
#histo <- ggplot(df, aes(cat))
histo +
  geom_histogram(aes(fill = hasimpress), position = "fill") +
  xlab("Art der Webseite") +
  ylab("Prozentualer Anteil von Impressen") +
  ggtitle("Anteil von Impressen auf\n verschiedenen Webseiten-Typen") +
  guides(fill = guide_legend(title = NULL)) +
  scale_fill_discrete(breaks = c(TRUE, FALSE),
                      labels = c("Impressum vorhanden", "Kein Impressum"))
ggsave(file = "abbildung1mehl.pdf")
ggsave(file = "abbildung1mehl.png")

## TABLE1
# Per-kind counts and share of sites with an imprint, exported as HTML.
table1 <- data.frame(kind = c("blogs", "foren", "shops", "news"))
for (i in seq_len(nrow(table1))) {
  table1$total[i] <- nrow(subset(df, kind == table1$kind[i]))
  table1$hasimpress[i] <- nrow(subset(df, kind == table1$kind[i] &
                                        hasimpress == TRUE))
  table1$noimpress[i] <- nrow(subset(df, kind == table1$kind[i] &
                                       hasimpress == FALSE))
  p <- round(table1$hasimpress[i] / table1$total[i] * 100, 2)
  table1$pimpress[i] <- str_c(p, " %")
}
names(table1) <- c("Seitentyp", "Anzahl", "Mit Impressum", "Ohne Impressum",
                   "Anteil der Seiten mit Impressum")
table1[, 1] <- c("Blogs", "Foren", "Shops", "News-Seiten")
print(xtable(table1), type = "html", file = "tabelle1mehl.html")