Created
June 14, 2023 16:47
-
-
Save nutjob4life/1aa843b6a6540bbac51b4f53e6b74c1c to your computer and use it in GitHub Desktop.
Extract abstracts for EDRN's publications
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8

'''Extract abstracts used by EDRN publications.

To use::

    python3.10 -m venv .venv
    .venv/bin/pip install --quiet --upgrade setuptools pip wheel build
    .venv/bin/pip install biopython~=1.81 rdflib~=6.3.2
    .venv/bin/python extract.py > all-abstracts.txt
'''
import contextlib
import logging
import time
from collections.abc import Iterator

import rdflib
from Bio import Entrez
# Logger for this module
_logger = logging.getLogger(__name__)

# How we identify ourselves to the Entrez API: the name of our "tool" and its owner
Entrez.tool = 'edrn-pubs'
# NOTE(review): this value looks like a redaction placeholder rather than a real
# address; confirm before running against the live API
Entrez.email = '[email protected]'

# How many abstracts to get at a time
_batch_size = 20

# How long in seconds to let the API rest between batches
_wait_betwixt = 5

# RDF predicate that contains pub med IDs
_pubmed_predicate = rdflib.URIRef('http://edrn.nci.nih.gov/rdf/schema.rdf#pmid')

# Sources of EDRN publications
_rdf_sources = [
    # Biomarker database publications
    'https://bmdb.jpl.nasa.gov/rdf/publications',
    # DMCC-tracked publications
    'https://edrn.jpl.nasa.gov/cancerdataexpo/rdf-data/publications/@@rdf',
]
def _read_rdf(url: str) -> dict[rdflib.URIRef, dict[rdflib.URIRef, list[rdflib.URIRef | rdflib.Literal]]]:
    '''Parse the RDF document at ``url`` and index its statements by subject.

    The result maps each subject URI to a dictionary of that subject's predicates;
    each predicate URI in turn maps to the list of objects (URI references or
    literals) asserted for it, in the order the graph yields them.
    '''
    _logger.debug('Parsing RDF at %s', url)
    graph = rdflib.Graph()
    graph.parse(url)
    by_subject = {}
    for subject, predicate, obj in graph:
        # Create the per-subject dict and per-predicate list on first sight
        by_subject.setdefault(subject, {}).setdefault(predicate, []).append(obj)
    return by_subject
def _get_pub_med_ids(rdf_sources: list[str]) -> set[str]:
    '''Given a list of possible sources of RDF-based information, determine the unique set of
    PubMedIDs represented.

    Each URL in ``rdf_sources`` is read with ``_read_rdf``. Every object found under the
    ``_pubmed_predicate`` of any subject is converted to a string and added to the result,
    so an ID mentioned by several subjects or several sources appears only once.
    '''
    _logger.debug('Getting RDF from %r', rdf_sources)
    ids: set[str] = set()
    for rdf_source in rdf_sources:
        # Only the predicate dictionaries matter here; the subject URIs are not needed
        for predicates in _read_rdf(rdf_source).values():
            ids.update(str(pubmedid) for pubmedid in predicates.get(_pubmed_predicate, []))
    _logger.debug('Found %d unique pubmed IDs', len(ids))
    return ids
def _divvy(ids: list[str], batch_size: int) -> list[str]: | |
'''Divvy up a list of string ``ids`` into ``batch_size`` parts.''' | |
while len(ids) > 0: | |
batch, ids = ids[:batch_size], ids[batch_size:] | |
yield batch | |
def _retrieve_abstracts(ids: list[str]) -> list[str]:
    '''Fetch the PubMed records for ``ids`` and collect their abstract texts.

    The whole batch is requested from Entrez with a single ``efetch`` call and only
    the ``PubmedArticle`` entries of the response are examined. The parts of each
    record's abstract are joined with newlines into one string. Records without an
    abstract are logged at info level and left out, so the returned list may be
    shorter than ``ids``.
    '''
    _logger.debug('Retrieving abstracts for %d pubmed IDs', len(ids))
    collected = []
    handle = Entrez.efetch(db='pubmed', retmode='xml', rettype='medline', id=ids)
    with contextlib.closing(handle) as response:
        parsed = Entrez.read(response)
        for article in parsed['PubmedArticle']:
            citation = article['MedlineCitation']
            pmid = str(citation['PMID'])
            try:
                parts = citation['Article']['Abstract']['AbstractText']
            except KeyError:
                _logger.info('No abstract available for pubmed ID %s; skipping it', pmid)
                continue
            collected.append('\n'.join(str(part) for part in parts))
    return collected
def main():
    '''Print the abstract of every known EDRN publication to the standard output.

    PubMed IDs are gathered from ``_rdf_sources`` and processed in batches of
    ``_batch_size``. Each abstract is printed on its own; after every full-sized
    batch the program pauses ``_wait_betwixt`` seconds to let the API rest.
    '''
    known_ids = [*_get_pub_med_ids(_rdf_sources)]
    for chunk in _divvy(known_ids, _batch_size):
        for text in _retrieve_abstracts(chunk):
            print(text)
        # A short batch is the final one, so there is nothing left to wait for
        if len(chunk) != _batch_size:
            continue
        time.sleep(_wait_betwixt)


# Run the extraction only when executed as a script, not when imported
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment