Skip to content

Instantly share code, notes, and snippets.

@egonw
Created February 12, 2025 19:11
Show Gist options
  • Save egonw/9a8f8f0266658be6a616d84f7e2a11d5 to your computer and use it in GitHub Desktop.
Save egonw/9a8f8f0266658be6a616d84f7e2a11d5 to your computer and use it in GitHub Desktop.
Extract chemicals from a Beilstein Journal of Organic Chemistry article
// Copyright (c) 2022-2024 Egon Willighagen <[email protected]>
//
// How to cite: https://joss.theoj.org/papers/10.21105/joss.02558
//
// GPL v3
@Grab(group='io.github.egonw.bacting', module='managers-rdf', version='1.0.4')
@Grab(group='io.github.egonw.bacting', module='managers-ui', version='1.0.4')
@Grab(group='io.github.egonw.bacting', module='net.bioclipse.managers.jsoup', version='1.0.4')
bioclipse = new net.bioclipse.managers.BioclipseManager(".")
rdf = new net.bioclipse.managers.RDFManager(".")
jsoup = new net.bioclipse.managers.JSoupManager(".")

// The article to process is taken from the first command-line argument
// (presumably a URL, since it is later handed to bioclipse.download()).
// Stop with a usage message when it is missing, instead of failing with an
// ArrayIndexOutOfBoundsException on args[0].
if (!args) {
  System.err.println("Usage: groovy <script> <article URL>")
  System.exit(1)
}

// Kept as a list because the rest of the script iterates over `articles`.
articles = [
  args[0]
]
// In-memory store that collects the triples of all processed articles.
kg = rdf.createInMemoryStore()
articles.each { articleLocation ->
  // Download the article and let the jsoup manager parse the returned content.
  def page = jsoup.parseString(bioclipse.download(articleLocation))
  // Import the content of every <script type="application/ld+json"> element
  // into the store as JSON-LD.
  jsoup.select(page, "script[type='application/ld+json']").each { jsonLdScript ->
    rdf.importFromString(kg, jsonLdScript.html(), "JSON-LD")
  }
}
// Report the size of the store after the JSON-LD import. The leading "#"
// presumably marks this line as a comment ahead of the tab-separated output.
println "#" + rdf.size(kg) + " triples detected in the JSON-LD"
// Debugging aid: uncomment to dump the whole store as Turtle. The
// serialization is no longer computed up front because nothing consumes it.
// println rdf.asTurtle(kg)
// Select every schema:MolecularEntity together with its InChIKey and SMILES;
// both properties are OPTIONAL, so entities lacking them are still returned.
sparql = '''
PREFIX schema: <http://schema.org/>
SELECT ?entity ?inchikey ?smiles WHERE {
?entity a schema:MolecularEntity .
OPTIONAL { ?entity schema:inChIKey ?inchikey }
OPTIONAL { ?entity schema:smiles ?smiles }
}
'''
results = rdf.sparql(kg, sparql)

// Print one "inchikey<TAB>smiles" line per result row. Rows are addressed
// starting at 1. A plain counting loop is used on purpose: a Groovy range
// (1..0) would iterate backwards when there are no rows.
def rowTotal = results.rowCount
for (int row = 1; row <= rowTotal; row++) {
  def inchikey = results.get(row, "inchikey")
  def smiles = results.get(row, "smiles")
  println "${inchikey}\t${smiles}"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment