A script to scrape Nature Immunology "News and Views" articles and convert them to a format that's easier to listen to in Capti (https://www.captivoice.com/capti-site/). More info here: http://mikebarnkob.dk/2016/an-r-script-for-scraping-news-from-nature-journals-to-capti-narrator
# Script to convert Nature articles into Capti text files
#
# A script to scrape Nature Immunology "News and Views" articles and convert them to a format
# that's easier to listen to in Capti (https://www.captivoice.com/capti-site/). Does the following:
# 1. Adds "Title " in front of the title, "Author " in front of the author, "Abstract " in front of the abstract, and "Text " in front of the body text
# 2. Removes references and website links (see the worked example below)
# 3. Removes figure text.
#
# Version 1
# Mike Barnkob, 2016-02-01.
# License: CC BY-NC 4.0 - please share, remix, and credit.
# Info: http://mikebarnkob.dk/2016/an-r-script-for-scraping-news-from-nature-journals-to-capti-narrator
#
# References
# 1. http://www.regular-expressions.info/examples.html
# 2. http://www.cheatography.com/davechild/cheat-sheets/regular-expressions/
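
# Worked example (illustration only - the sample fragment below is invented
# and never enters the pipeline): the cleanup regexes used in the loop below,
# applied to a toy HTML string.
demo <- "<p>T cells respond<sup>1,2</sup> (Fig. 1a)</p>"
demo <- gsub("<sup\\b[^<]*>[^<]*(?:<(?!/sup>)[^<]*)*</sup>", "", demo, perl=TRUE) #drop reference markers
demo <- gsub("\\s*\\([^\\)]+\\)", "", demo)                                       #drop parentheticals
gsub("<.*?>", "", demo)                                                           #strip tags; yields "T cells respond"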

# SETUP
#Working directory
setwd("~/Dropbox/Projekter/2016 - Scrape Nature Journals for Capti")

#Load libraries
if (!require('XML')) install.packages('XML'); library('XML')
if (!require('rvest')) install.packages('rvest'); library('rvest')
if (!require('stringr')) install.packages('stringr'); library('stringr')
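#(Note: XML is loaded here for completeness, but the code below only uses rvest and stringr.)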

# SCRAPE
#Find all articles in "News and Views", "Research Highlights", and "Perspective"
proxy_link <- "http://ezproxy-prd.bodleian.ox.ac.uk:2076" #Use your university's proxy - remember to log in first!
current_issue <- "/ni/journal/v17/n2/index.html"          #Link to Nature Immunology - might work with other Nature journals
html_link <- paste(proxy_link, current_issue, sep="")

news_and_views <- read_html(html_link)
news_and_views <- news_and_views %>%
  html_node("#nv") %>%
  html_nodes(".fulltext") %>%
  html_attr("href")
news_and_views <- paste(proxy_link, news_and_views, sep="")

research_highlights <- read_html(html_link)
research_highlights <- research_highlights %>%
  html_node("#rhighlts") %>%
  html_nodes(".atl") %>%
  html_nodes("a") %>%
  html_attr("href")
research_highlights <- paste(proxy_link, research_highlights, sep="")

perspective <- read_html(html_link)
perspective <- perspective %>%
  html_node("#pe") %>%
  html_nodes(".fulltext") %>%
  html_attr("href")
perspective <- paste(proxy_link, perspective, sep="")

#List of all articles to scrape
article_list <- c(news_and_views, research_highlights, perspective)
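
#Optional sanity check: look at the scraped links before looping (output will
#vary with your proxy session, so treat this as a rough check only)
print(length(article_list))
print(head(article_list, 3))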

# LOOP THROUGH ARTICLE LIST
article_complete <- c()
all_together_now <- c()

for (i in seq_along(article_list)) {
  #clear holders
  article <- c()
  title <- c()
  author <- c()
  abstract <- c()
  txt <- c()

  #scrape + add text for easier listening
  article <- read_html(article_list[i])  #download html page

  title <- article %>%                   #scrape title of article
    html_node(".article-heading") %>%
    html_text()
  title <- paste("\n\nTitle -", title)

  author <- article %>%                  #scrape author of article
    html_node(".fn") %>%
    html_text()
  author <- paste("\n\nBy -", author)

  abstract <- article %>%                #scrape abstract, if available
    html_nodes(".standfirst") %>%
    html_text()
  if (length(abstract) > 0) {
    abstract <- paste("\n\nAbstract -", abstract)
  }

  txt <- article %>%                     #scrape main text (the second .content node)
    html_nodes(".content") %>%
    .[2]
txt <- gsub("<sup\\b[^<]*>[^<]*(?:<(?!/sup>)[^<]*)*</sup>", "", txt, perl=T) #Remove all <sup> (ie references) from html - http://stackoverflow.com/questions/33970549/remove-all-specific-html-tags-using-gsub-r | |
  txt <- str_replace_all(txt, "\t", "")     #Remove all \t
  txt <- gsub('[\n]', '', txt)              #Remove all \n - http://stackoverflow.com/questions/9562535/gsub-reduce-all-repeating-characters-to-one-instance
  txt <- gsub("\\s*\\([^\\)]+\\)", "", txt) #Remove parenthesised text (inline links, figure callouts)
  txt <- str_replace_all(txt, "</p><p>", "\n\n New paragraph \n") #Replace </p><p> with a spoken paragraph break
  txt <- gsub("<.*?>", "", txt)             #Strip all remaining html tags - http://stackoverflow.com/questions/17227294/removing-html-tags-from-a-string-in-r
  txt <- gsub("\\s*\\([^\\)]+\\)", "", txt) #Remove parenthesised text again
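  #Note: the parenthetical pattern runs twice on purpose - stripping the html
  #tags above can expose "(...)" runs that the first pass could not match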
  txt <- paste("\n\nText -", txt)

  article_complete <- paste(title, author, if (length(abstract) > 0) { abstract }, txt)
  all_together_now <- paste(all_together_now, "\n\n Next article", article_complete)
}

# SAVE TEXT FILE
cat(all_together_now, file="Nature Immunology News.txt", sep="\n")
file.show("Nature Immunology News.txt") #Shows the text file
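
#To scrape another issue, point current_issue at that issue's index page and
#re-run. The path below is only a guess extrapolated from the v17/n2 URL above:
#current_issue <- "/ni/journal/v17/n3/index.html"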