Last active
August 29, 2015 14:00
-
-
Save DASpringate/11253206 to your computer and use it in GitHub Desktop.
Web mining example: Scraping web data to look for a new job.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(XML) | |
require(RCurl) | |
require(stringr) | |
require(rentrez) | |
require(rjson) | |
require(reshape2) | |
require(ggmap) | |
require(mapproj) | |
require(devtools) | |
install_github("rOpenHealth/rpubmed") | |
require(rpubmed) | |
## Get IDs of papers containing "electronic medical records" | |
paper_ids <- entrez_search("pubmed", "electronic medical records", retmax = 5000)$ids | |
records <- fetch_in_chunks(paper_ids) | |
## Find article locations in the nested records | |
addresses <- as.character(sapply(records, function(x) x$MedlineCitation$Article$AuthorList$Author$Affiliation)) | |
# clean up with regex: | |
addresses <- str_replace_all(addresses, "(Electronic [Aa]ddress:)|[[:alnum:][:punct:]]+@+[[:alnum:][:punct:]]+", "") | |
journals <- as.character(sapply(records, function(x) x$MedlineCitation$MedlineJournalInfo$ISSNLinking)) | |
job_data <- data.frame(address = addresses, ISSN = journals) | |
# Get data on impact factors of major scientific journals | |
impact_url <- "http://www.citefactor.org/impact-factor-list-2012.html" | |
# The data we want is in an HTML table on the webpage | |
impacts <- readHTMLTable(impact_url)[[1]] | |
impacts <- impacts[, c(1,2,4)] # keep only these columns | |
## article data and impact factor share the ISSN column so merge: | |
job_data <- merge(job_data, impacts, all.x = TRUE) | |
job_data <- job_data[complete.cases(job_data),] | |
job_data <- job_data[job_data$address != "NULL",] | |
save(job_data, file = "data/job_data.RData") | |
## This website provides ranking data for the top 100 European universities in a table: | |
rankings_url <- "http://www.researchranking.org/?action=ranking" | |
# No header in the table so we will make our own: | |
rankings <- readHTMLTable(rankings_url, header = FALSE, stringsAsFactors = FALSE)[[1]] | |
names(rankings) <- c("rank", "institution", "type", "country", "score") | |
rankings$institution <- str_replace(rankings$institution, "^THE ", "") # cleanup | |
## Iterate over the top 100 institutions, check if they are in the article address and include ranking data | |
job_data$institution_score <- NA | |
job_data$institutions <- NA | |
job_data$countries <- NA | |
for(institution in 1:nrow(rankings)){ | |
new_score <- str_detect(job_data$address, ignore.case(rankings[institution, "institution"])) | |
if(sum(new_score)){ | |
job_data$institution_score[new_score] <- rankings[institution, "score"] | |
job_data$institutions[new_score] <- rankings[institution, "institution"] | |
job_data$countries[new_score] <- rankings[institution, "country"] | |
} | |
} | |
save(job_data, file = "data/job_data.RData") | |
## Subset by universities with rankings: | |
selected <- job_data[!is.na(job_data$institution_score),] | |
selected$impact_factor <- as.numeric(as.character(selected[["Impact Factor"]])) | |
selected[["Impact Factor"]] <- NULL | |
selected$institution_score <- as.numeric(selected$institution_score) | |
## Use ggmap to get longitudes and latitudes: | |
coords <- geocode(paste(selected$institution, selected$country)) | |
# Bind on to the side of the article data | |
selected <- cbind(selected, coords) | |
save(selected, file = "selected.RData") | |
## Use reshape2 to aggragate our data to 1 line for each institution | |
molten <- melt(selected) | |
# Get mean impact factor for articles from a university: | |
my_data <- dcast(molten, country + institution ~ variable, mean) | |
# Get publication counts for each university: | |
publications <- dcast(molten, country + institution ~ variable, length) | |
my_data$publications <- publications$institution_score | |
# composite ranking | |
my_data$ranks <- rank(my_data$institution_score) + rank(my_data$impact_factor) + rank(my_data$publications) | |
my_data <- my_data[order(my_data$ranks, decreasing = TRUE),] | |
# Capitalise the university names for display | |
capitalise <- function(x) { | |
s <- tolower(strsplit(x, " ")[[1]]) | |
paste(toupper(substring(s, 1,1)), substring(s, 2), | |
sep="", collapse=" ") | |
} | |
my_data$institution <- sapply(my_data$institution, capitalise) | |
my_data$labels <- my_data$institution | |
my_data$labels[my_data$ranks < 35] <- "" | |
# Plot the map: | |
p <- qmap(c(-5, 60, 19, 48), zoom = 4) | |
p + geom_point(aes(lon, lat, colour = ranks, size = publications), data = my_data) + | |
scale_color_continuous( low="red", high="blue") + | |
geom_text(aes(x = lon, y = lat, label = labels, size = 6, | |
hjust = c(1,rep(0, 5), 1, 0,0,1,rep(0,11)), vjust = 0), data = my_data) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment