Created
May 1, 2024 19:06
-
-
Save egonw/7b471e489c120e4485a86e10846253ae to your computer and use it in GitHub Desktop.
Groovy script that uses SPARQL to retrieve polymers with CXSMILES from Wikidata (CCZero), generates 2D coordinates for them, and then writes an SD file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// CC-BY 4.0 International. (c) 2024 Egon Willighagen | |
@Grab(group='io.github.egonw.bacting', module='managers-ui', version='0.5.2') | |
@Grab(group='io.github.egonw.bacting', module='managers-rdf', version='0.5.2') | |
@Grab(group='org.openscience.cdk', module='cdk-smiles', version='2.9') | |
@Grab(group='org.openscience.cdk', module='cdk-silent', version='2.9') | |
@Grab(group='org.openscience.cdk', module='cdk-ctab', version='2.9') | |
@Grab(group='org.openscience.cdk', module='cdk-sdg', version='2.9') | |
import org.openscience.cdk.smiles.SmilesParser; | |
import org.openscience.cdk.interfaces.*; | |
import org.openscience.cdk.silent.SilentChemObjectBuilder; | |
import org.openscience.cdk.io.*; | |
import org.openscience.cdk.layout.StructureDiagramGenerator; | |
import javax.vecmath.Vector2d | |
// Workspace root handed to every Bacting manager constructed below.
workspaceRoot = ".."

// Bacting managers used by this script. The UI and CDK managers are not needed
// at the moment and are left here, disabled, for reference:
//   ui = new net.bioclipse.managers.UIManager(workspaceRoot)
//   cdk = new net.bioclipse.managers.CDKManager(workspaceRoot)
bioclipse = new net.bioclipse.managers.BioclipseManager(workspaceRoot)
rdf = new net.bioclipse.managers.RDFManager(workspaceRoot)

// CDK object factory, and the SMILES parser that creates its objects through it.
builder = SilentChemObjectBuilder.instance
sp = new SmilesParser(builder)

// SPARQL query: every distinct item that is an instance of wd:Q81163 (or of one
// of its subclasses, via P31/P279*), has a P10718 value (bound to ?cxsmiles)
// and has an English label.
mappingQuery = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT DISTINCT ?polymer ?polymerLabel ?cxsmiles WHERE {
?polymer wdt:P31/wdt:P279* wd:Q81163 ;
wdt:P10718 ?cxsmiles ;
rdfs:label ?polymerLabel . FILTER (LANG(?polymerLabel) = "en") .
}
"""

// Endpoint the query is sent to. Disabled alternative:
//   sparqlEP = "https://qlever.cs.uni-freiburg.de/api/wikidata"
sparqlEP = "https://query.wikidata.org/sparql"

// Run the query remotely, then turn the raw response into a table that is
// addressed by row number and column name.
rawResults = bioclipse.sparqlRemote(sparqlEP, mappingQuery)
results = rdf.processSPARQLXML(rawResults, mappingQuery)
// Convert every query result row into a CDK molecule with 2D coordinates and
// collect the molecules in molList (kept as a script variable because the
// SD file output further down reads it).
molList = builder.newInstance(IAtomContainerSet.class)
// Rows are addressed from 1 up to and including rowCount.
for (int row = 1; row <= results.rowCount; row++) {
    def wdItemIRI = results.get(row, "polymer")
    println wdItemIRI
    // Reduce the IRI (prefixed or full form) to the bare Wikidata identifier.
    def wdItem = wdItemIRI.replace("wd:", "").replace("http://www.wikidata.org/entity/", "")
    def label = results.get(row, "polymerLabel")
    def cxSMILES = results.get(row, "cxsmiles")

    def mol
    try {
        mol = sp.parseSmiles(cxSMILES)
        // Lay the structure out in 2D, passing (0, 1) as the initial direction vector.
        def sdg = new StructureDiagramGenerator()
        sdg.setMolecule(mol)
        sdg.generateCoordinates(new Vector2d(0, 1))
        mol = sdg.getMolecule()
    } catch (org.openscience.cdk.exception.CDKException e) {
        // One unparsable or un-layoutable structure must not abort the whole
        // export: report it and continue with the next row.
        System.err.println("Skipping " + wdItem + " (" + cxSMILES + "): " + e.message)
        continue
    }

    // Title plus the PubChem-style data fields written alongside the structure.
    mol.setTitle(label)
    mol.setProperty("PUBCHEM_SUBSTANCE_SYNONYM", label)
    mol.setProperty("PUBCHEM_SUBSTANCE_COMMENT", cxSMILES)
    mol.setProperty("PUBCHEM_EXT_DATASOURCE_REGID", wdItem)
    mol.setProperty("PUBCHEM_EXT_SUBSTANCE_URL", "https://scholia.toolforge.org/" + wdItem)
    molList.addAtomContainer(mol)
}
// Write all collected molecules to wikidata_polymers.sdf.
// withWriter closes the file even when writing fails, and the explicit charset
// makes the output independent of the platform default encoding.
new File("wikidata_polymers.sdf").withWriter("UTF-8") { out ->
    SDFWriter sdfWriter = new SDFWriter(out)
    try {
        sdfWriter.write(molList)
    } finally {
        // Always close the SDF writer too, whether or not the write succeeded.
        sdfWriter.close()
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment