DASpringate · August 29, 2015 14:00
diff --git a/web_mining_example.R b/web_mining_example.R
 require(XML)
 require(RCurl)
 require(stringr)
 require(rentrez)
 require(rjson)
 require(reshape2)
 require(ggmap)
 require(mapproj)
 require(devtools)

 install_github("rOpenHealth/rpubmed")
 require(rpubmed)

 ## Get IDs of papers containing "electronic medical records"
 paper_ids <- entrez_search("pubmed", "electronic medical records", retmax = 5000)$ids

 records <- fetch_in_chunks(paper_ids)

 ## Find article locations in the nested records

 addresses <- as.character(sapply(records, function(x) x$MedlineCitation$Article$AuthorList$Author$Affiliation))
 # clean up with regex:
 addresses <- str_replace_all(addresses, "(Electronic [Aa]ddress:)|[[:alnum:][:punct:]]+@+[[:alnum:][:punct:]]+", "")
 journals <- as.character(sapply(records, function(x) x$MedlineCitation$MedlineJournalInfo$ISSNLinking))


 job_data <- data.frame(address = addresses, ISSN = journals)


 # Get data on impact factors of major scientific journals
 impact_url <- "http://www.citefactor.org/impact-factor-list-2012.html"

 # The data we want is in an HTML table on the webpage
 impacts <- readHTMLTable(impact_url)[[1]]
 impacts <- impacts[, c(1,2,4)] # keep only these columns

 ## article data and impact factor share the ISSN column so merge:
 job_data <- merge(job_data, impacts, all.x = TRUE)

 job_data <- job_data[complete.cases(job_data),]
 job_data <- job_data[job_data$address != "NULL",]

 save(job_data, file = "data/job_data.RData")

 ## This website provides ranking data for the top 100 European universities in a table:
 rankings_url <- "http://www.researchranking.org/?action=ranking"

 # No header in the table so we will make our own:
 rankings <- readHTMLTable(rankings_url, header = FALSE, stringsAsFactors = FALSE)[[1]]
 names(rankings)  <-  c("rank", "institution", "type", "country", "score")
 rankings$institution <- str_replace(rankings$institution, "^THE ", "") # cleanup

 ## Iterate over the top 100 institutions, check if they are in the article address and include ranking data
 job_data$institution_score <- NA
 job_data$institutions <- NA
 job_data$countries <- NA
 for(institution in 1:nrow(rankings)){
    new_score <- str_detect(job_data$address, ignore.case(rankings[institution, "institution"]))
    if(sum(new_score)){
        job_data$institution_score[new_score] <- rankings[institution, "score"]
        job_data$institutions[new_score] <- rankings[institution, "institution"]
        job_data$countries[new_score] <- rankings[institution, "country"]
    }
 }


 save(job_data, file = "data/job_data.RData")

 ## Subset by universities with rankings:
 selected <- job_data[!is.na(job_data$institution_score),]
 selected$impact_factor <- as.numeric(as.character(selected[["Impact Factor"]]))
 selected[["Impact Factor"]] <- NULL
 selected$institution_score <- as.numeric(selected$institution_score)

 ## Use ggmap to get longitudes and latitudes:
 coords <- geocode(paste(selected$institution, selected$country))

 # Bind on to the side of the article data
 selected <- cbind(selected, coords)
 save(selected, file = "selected.RData")

 ## Use reshape2 to aggragate our data to 1 line for each institution
 molten <- melt(selected)

 # Get mean impact factor for articles from a university:
 my_data <- dcast(molten, country + institution ~ variable, mean)
 # Get publication counts for each university:
 publications <- dcast(molten, country + institution ~ variable, length)
 my_data$publications <- publications$institution_score
 # composite ranking
 my_data$ranks <- rank(my_data$institution_score) + rank(my_data$impact_factor) + rank(my_data$publications)
 my_data <- my_data[order(my_data$ranks, decreasing = TRUE),]

 # Capitalise the university names for display
 capitalise <- function(x) {
    s <- tolower(strsplit(x, " ")[[1]])
    paste(toupper(substring(s, 1,1)), substring(s, 2),
          sep="", collapse=" ")
 }

 my_data$institution <- sapply(my_data$institution, capitalise)
 my_data$labels <- my_data$institution
 my_data$labels[my_data$ranks < 35] <- ""

 # Plot the map:
 p <- qmap(c(-5, 60, 19, 48), zoom = 4)
 p + geom_point(aes(lon, lat, colour = ranks, size = publications), data = my_data) +
    scale_color_continuous( low="red", high="blue") + 
    geom_text(aes(x = lon, y = lat, label = labels, size = 6, 
                  hjust = c(1,rep(0, 5), 1, 0,0,1,rep(0,11)), vjust = 0), data = my_data)
	require(XML)
	require(RCurl)
	require(stringr)
	require(rentrez)
	require(rjson)
	require(reshape2)
	require(ggmap)
	require(mapproj)
	require(devtools)

	install_github("rOpenHealth/rpubmed")
	require(rpubmed)

	## Get IDs of papers containing "electronic medical records"
	paper_ids <- entrez_search("pubmed", "electronic medical records", retmax = 5000)$ids

	records <- fetch_in_chunks(paper_ids)

	## Find article locations in the nested records

	addresses <- as.character(sapply(records, function(x) x$MedlineCitation$Article$AuthorList$Author$Affiliation))
	# clean up with regex:
	addresses <- str_replace_all(addresses, "(Electronic [Aa]ddress:)\|[[:alnum:][:punct:]]+@+[[:alnum:][:punct:]]+", "")
	journals <- as.character(sapply(records, function(x) x$MedlineCitation$MedlineJournalInfo$ISSNLinking))


	job_data <- data.frame(address = addresses, ISSN = journals)


	# Get data on impact factors of major scientific journals
	impact_url <- "http://www.citefactor.org/impact-factor-list-2012.html"

	# The data we want is in an HTML table on the webpage
	impacts <- readHTMLTable(impact_url)[[1]]
	impacts <- impacts[, c(1,2,4)] # keep only these columns

	## article data and impact factor share the ISSN column so merge:
	job_data <- merge(job_data, impacts, all.x = TRUE)

	job_data <- job_data[complete.cases(job_data),]
	job_data <- job_data[job_data$address != "NULL",]

	save(job_data, file = "data/job_data.RData")

	## This website provides ranking data for the top 100 European universities in a table:
	rankings_url <- "http://www.researchranking.org/?action=ranking"

	# No header in the table so we will make our own:
	rankings <- readHTMLTable(rankings_url, header = FALSE, stringsAsFactors = FALSE)[[1]]
	names(rankings) <- c("rank", "institution", "type", "country", "score")
	rankings$institution <- str_replace(rankings$institution, "^THE ", "") # cleanup

	## Iterate over the top 100 institutions, check if they are in the article address and include ranking data
	job_data$institution_score <- NA
	job_data$institutions <- NA
	job_data$countries <- NA
	for(institution in 1:nrow(rankings)){
	new_score <- str_detect(job_data$address, ignore.case(rankings[institution, "institution"]))
	if(sum(new_score)){
	job_data$institution_score[new_score] <- rankings[institution, "score"]
	job_data$institutions[new_score] <- rankings[institution, "institution"]
	job_data$countries[new_score] <- rankings[institution, "country"]
	}
	}


	save(job_data, file = "data/job_data.RData")

	## Subset by universities with rankings:
	selected <- job_data[!is.na(job_data$institution_score),]
	selected$impact_factor <- as.numeric(as.character(selected[["Impact Factor"]]))
	selected[["Impact Factor"]] <- NULL
	selected$institution_score <- as.numeric(selected$institution_score)

	## Use ggmap to get longitudes and latitudes:
	coords <- geocode(paste(selected$institution, selected$country))

	# Bind on to the side of the article data
	selected <- cbind(selected, coords)
	save(selected, file = "selected.RData")

	## Use reshape2 to aggragate our data to 1 line for each institution
	molten <- melt(selected)

	# Get mean impact factor for articles from a university:
	my_data <- dcast(molten, country + institution ~ variable, mean)
	# Get publication counts for each university:
	publications <- dcast(molten, country + institution ~ variable, length)
	my_data$publications <- publications$institution_score
	# composite ranking
	my_data$ranks <- rank(my_data$institution_score) + rank(my_data$impact_factor) + rank(my_data$publications)
	my_data <- my_data[order(my_data$ranks, decreasing = TRUE),]

	# Capitalise the university names for display
	capitalise <- function(x) {
	s <- tolower(strsplit(x, " ")[[1]])
	paste(toupper(substring(s, 1,1)), substring(s, 2),
	sep="", collapse=" ")
	}

	my_data$institution <- sapply(my_data$institution, capitalise)
	my_data$labels <- my_data$institution
	my_data$labels[my_data$ranks < 35] <- ""

	# Plot the map:
	p <- qmap(c(-5, 60, 19, 48), zoom = 4)
	p + geom_point(aes(lon, lat, colour = ranks, size = publications), data = my_data) +
	scale_color_continuous( low="red", high="blue") +
	geom_text(aes(x = lon, y = lat, label = labels, size = 6,
	hjust = c(1,rep(0, 5), 1, 0,0,1,rep(0,11)), vjust = 0), data = my_data)