R code for generating a network of referring articles from PubMed. More info here: http://mikebarnkob.dk/2015/generating-an-article-network-using-rentrez-and-igraph-in-r/
#Article Network ver 2
#============================
# This program will scrape all citing articles surrounding one main article and
# create: 1) a list of most cited articles, 2) lists of main authors, 3) network of articles
# Mike Barnkob, December 28th 2015.
# www.mikebarnkob.dk

#Background and references
#1. https://cran.r-project.org/web/packages/rentrez/vignettes/rentrez_tutorial.html#advanced-counting
#2. https://ropensci.org/tutorials/rentrez_tutorial.html
#3. http://www.kateto.net/polnet2015
#4. http://www.shizukalab.com/toolkits/sna
#5. http://stackoverflow.com/questions/16390221/how-to-make-grouped-layout-in-igraph/31976350#31976350
#6. http://michael.hahsler.net/SMU/LearnROnYourOwn/code/igraph.html

#SETUP
#Load libraries
if (!require('XML')) install.packages('XML'); library('XML')
if (!require('rentrez')) install.packages('rentrez'); library('rentrez')
if (!require('igraph')) install.packages('igraph'); library('igraph')
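
#Optional (a sketch, not part of the original workflow): recent versions of
#rentrez let you register an NCBI API key, which raises the E-utilities rate
#limit and helps when crawling many articles. The key below is a placeholder.
#set_entrez_key("YOUR_NCBI_API_KEY")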

#Set working directory
setwd("~/Dropbox/Projekter/2015 - Article Network")

#RETRIEVE DATA - this will take some time, depending on the number of levels to iterate through
#Pubmed ID to start from - retrieve data and set up data frames
startnode <- 11953749
num_of_levels <- 4 #number of article "levels" from the start node

#Get initial data from startnode - this will form the center node of the network
startnode_links <- entrez_link(dbfrom='pubmed', id=startnode, db='all')
startnode_citations <- startnode_links$links$pubmed_pubmed_citedin
#startnode_links$links #shows all the link-out info available
startnode_data <- entrez_summary(db="pubmed", id=startnode)
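#A quick way to inspect which fields the esummary record provides:
#names(startnode_data)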

#Create node list - will contain data on all nodes
nodes <- data.frame(PubmedID = startnode_data$uid,
                    Title = startnode_data$title,
                    Last_Author = startnode_data$lastauthor,
                    Year = substr(startnode_data$pubdate, 1, 4),
                    Pubtype = startnode_data$pubtype,
                    Citations = startnode_data$pmcrefcount,
                    Level = 0)

#Create edge list - will contain to and from data
edges <- data.frame(From = startnode_citations, To = rep(startnode, times = length(startnode_citations)))
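#Each edge points From a citing article To the article it cites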

#Loop through pubmedIDs to generate list
for(l in 1:num_of_levels) {
    tofind_list <- c() #reset list
    cat("Level:", l, "\n")
    cat("IDs: ")
    for(i in seq_along(startnode_citations)) {
        cat(i, ", ") #output id, to check program is still running

        #Get node data - but only if not already in node list
        if(!(startnode_citations[i] %in% nodes$PubmedID)) {
            node_data <- entrez_summary(db="pubmed", id=startnode_citations[i])
            nodes <- rbind(nodes, data.frame(PubmedID = node_data$uid,
                                             Title = node_data$title,
                                             Last_Author = node_data$lastauthor,
                                             Year = substr(node_data$pubdate, 1, 4),
                                             Pubtype = node_data$pubtype,
                                             Citations = node_data$pmcrefcount,
                                             Level = l))

            #Get edge data
            node_links <- entrez_link(dbfrom='pubmed', id=startnode_citations[i], db='all')
            node_citations <- node_links$links$pubmed_pubmed_citedin
            if (length(node_citations) > 0) {
                edges <- rbind(edges, data.frame(From = node_citations, To = rep(startnode_citations[i], times = length(node_citations))))
                tofind_list <- append(tofind_list, node_citations) #Add citations from this article to the next list to find
            }
        }
    }
    startnode_citations <- unique(tofind_list) #Make new citation list from all edges found at previous level (each ID only once, hence the unique function)
}
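
#Optional robustness sketch (not part of the original script): NCBI sometimes
#drops or throttles rapid requests, so a small retry wrapper around the
#entrez_* calls above can help on large networks. The helper name
#"entrez_summary_retry" is hypothetical.
entrez_summary_retry <- function(id, tries = 3, pause = 1) {
    for (attempt in seq_len(tries)) {
        result <- tryCatch(entrez_summary(db = "pubmed", id = id),
                           error = function(e) NULL)
        if (!is.null(result)) return(result) #success - hand back the esummary
        Sys.sleep(pause) #wait before retrying
    }
    NULL #give up after 'tries' failed attempts
}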

#Find missing nodes (articles that appear in the edge list but not yet in the node list)
missing_nodes_IDs <- edges[!(edges$From %in% nodes$PubmedID), "From", drop = FALSE]
head(missing_nodes_IDs)
nrow(missing_nodes_IDs)

#Remove duplicate nodes
nrow(nodes[duplicated(nodes$PubmedID), ]) #count number of duplicates
nrow(nodes[!duplicated(nodes$PubmedID), ])
nodes <- nodes[!duplicated(nodes$PubmedID), ]

#Loop through missing list
for(i in seq_len(nrow(missing_nodes_IDs))) {
    cat(missing_nodes_IDs$From[i], ", ") #output id, to check program is still running
    node_data <- entrez_summary(db="pubmed", id=missing_nodes_IDs$From[i])

    #Check that pubtype data exists
    if (length(node_data$pubtype) > 0) {
        node_data_pubtype <- node_data$pubtype
    } else {
        node_data_pubtype <- NA_character_
    }

    #Get node data
    nodes <- rbind(nodes, data.frame(PubmedID = node_data$uid,
                                     Title = node_data$title,
                                     Last_Author = node_data$lastauthor,
                                     Year = substr(node_data$pubdate, 1, 4),
                                     Pubtype = node_data_pubtype,
                                     Citations = node_data$pmcrefcount,
                                     Level = num_of_levels+1))
}

#Check that there are no missing nodes left - should return zero rows
missing_nodes_IDs <- edges[!(edges$From %in% nodes$PubmedID), "From", drop = FALSE]
missing_nodes_IDs

#Remove duplicate nodes
nrow(nodes[duplicated(nodes$PubmedID), ]) #count number of duplicates
nrow(nodes[!duplicated(nodes$PubmedID), ])
nodes <- nodes[!duplicated(nodes$PubmedID), ]

#SAVE DATA
write.table(nodes, file="nodes.csv", sep = ",", col.names = TRUE, row.names = FALSE)
write.table(edges, file="edges.csv", sep = ",", col.names = TRUE, row.names = FALSE)
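#Alternative (a sketch): saveRDS() preserves R types exactly and avoids any
#CSV parsing quirks on reload
#saveRDS(nodes, "nodes.rds")
#saveRDS(edges, "edges.rds")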

#LOAD DATA
#read.csv (not read.csv2) matches the comma-separated files written above
nodes <- read.csv("nodes.csv", as.is = TRUE, header = TRUE, sep = ",",
                  #nrows = 100,
                  row.names = NULL
                  )
edges <- read.csv("edges.csv", as.is = TRUE, header = TRUE, sep = ",",
                  #nrows = 100,
                  row.names = NULL
                  )

#SETUP NETWORK
net <- graph_from_data_frame(edges, directed=TRUE, vertices=nodes)
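#Quick sanity checks on the graph (a sketch using standard igraph functions)
summary(net) #node/edge counts and attribute names
edge_density(net) #proportion of possible edges that are present
count_components(net) #number of disconnected subgraphs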
cutoff <- 30 #degree threshold used for colouring and for the article shortlist below

#VISUALIZE
#Degree distribution
dd <- degree_distribution(net, cumulative=T, mode="all")
plot(dd, pch=19, cex=0.5, col="orange", xlab="Degree", ylab="Cumulative Frequency")

#Whole network with highly cited nodes marked in red
l <- layout_with_lgl(net) #very large graph layout
l <- norm_coords(l, ymin=-1, ymax=1, xmin=-1, xmax=1) #Sets up graph area
in_degree <- degree(net, mode="in") #mode can be all, out or in
plot(net,
     vertex.size = log(in_degree+1), #Size of node determined by connections into it
     vertex.label.color = "black",
     #vertex.label = ifelse(degree(net, mode = "in", loops = FALSE) > cutoff, V(net)$Last_Author, NA),
     vertex.label = NA,
     vertex.color = ifelse(degree(net, mode = "in", loops = FALSE) > cutoff, "red", "white"), #Color nodes red when their in-degree exceeds the cutoff
     edge.width = .4,
     edge.curved = 0,
     edge.arrow.size = 0,
     rescale = FALSE,
     main = "Network",
     layout = l*1.4 #For playing around with the size of the graph
     )
legend(x=-2, y=-1, # position
       legend = c(paste("Nodes >", cutoff, "degrees"), paste("Nodes <", cutoff, "degrees")),
       pch = 21, #open point
       pt.bg = c("red","white"),
       pt.cex=2,
       cex=.8, bty="n",
       ncol=1
       )
legend(x=1, y=-1, # position
       legend = c(paste("Nodes:", vcount(net)), paste("Edges:", ecount(net))),
       cex=.8, bty="n",
       ncol=1
       )

#EXTRACT DATA
#Show data for articles with a minimum number of citations
net.vs <- V(net)[degree(net) < cutoff] #identify vertices with degree below the cutoff
net.cutoff <- delete_vertices(net, net.vs) #exclude them from the graph

#create dataframe
list_of_articles <- data.frame(PubmedID = V(net.cutoff)$name, Last_Author = V(net.cutoff)$Last_Author, Title = V(net.cutoff)$Title, Pubtype = V(net.cutoff)$Pubtype)
list_of_articles
nrow(list_of_articles)

#export
write.table(list_of_articles, file="significant-articles.csv", sep = ",", col.names = TRUE, row.names = FALSE)

#Show the 10 last authors with the most articles in the network
top_authors <- as.data.frame(table(nodes$Last_Author))
top_authors[order(top_authors$Freq, decreasing = TRUE)[1:10], ]
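#Optionally export the author table too (the file name here is just an example)
#write.table(top_authors, file="top-authors.csv", sep = ",", col.names = TRUE, row.names = FALSE)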