Created
October 9, 2019 14:59
-
-
Save pebbie/a3c9795a20510f2765c3d707eaff4e5d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
file: rdf2pandas.py | |
auth: Peb Ruswono Aryan | |
desc: import data in RDF Data Cube (assumed in particular shape) from Graph to Pandas DataFrame | |
""" | |
from rdflib import Graph, Namespace, RDF, RDFS | |
import pandas as pd | |
# Namespace for the RDF Data Cube vocabulary (qb:DataSet, qb:structure,
# qb:component, qb:dimension, qb:measure, qb:dataSet are used below).
QB = Namespace('http://purl.org/linked-data/cube#')
# Namespace for Dublin Core Terms; only DCT.title is used, as a fallback
# source of column labels.
DCT = Namespace('http://purl.org/dc/terms/')
def short_name(uristr: str) -> str:
    """Return the local name of a URI string.

    The local name is the text after the last ``#`` or ``/``. Trailing
    separators are ignored, so ``http://example.org/dim/`` yields ``dim``
    rather than an empty string. A string containing neither separator is
    returned unchanged.
    """
    # Drop trailing separators first; otherwise the "last separator" would
    # be the final character and the result would be empty.
    trimmed = uristr.rstrip('#/')
    # rfind returns -1 when the separator is absent, so +1 gives index 0
    # and the whole (trimmed) string is returned.
    cut = max(trimmed.rfind('#'), trimmed.rfind('/')) + 1
    return trimmed[cut:]
def from_graph(g: Graph) -> pd.DataFrame:
    """Build a DataFrame from an RDF Data Cube stored in an rdflib Graph.

    Expected graph shape::

        ?ds  RDF.type     QB.DataSet
        ?ds  QB.structure ?dsd
        ?dsd QB.component [QB.dimension ?dim]  ||  ?dsd QB.component [QB.measure ?mea]
        ?obs QB.dataSet   ?ds
        ?obs ?dim ?dimval  ||  ?obs ?mea ?meaval

    Only the first data set yielded by the graph is used, and only the
    first structure definition attached to it. Columns are all dimensions
    followed by all measures. Each row is one observation; a cell holds
    whatever ``g.value(obs, component)`` returns (``None`` when the
    observation has no value for that component).

    Column labels are always plain ``str``: the component's ``rdfs:label``
    if present, else its ``dct:title``, else the local name of its URI.

    Returns an empty DataFrame when the graph has no ``qb:DataSet`` or the
    data set has no ``qb:structure``.
    """
    dataset = next(iter(g.subjects(RDF.type, QB.DataSet)), None)
    if dataset is None:
        return pd.DataFrame([], columns=[])

    dsd = next(iter(g.objects(dataset, QB.structure)), None)
    if dsd is None:
        return pd.DataFrame([], columns=[])

    # Collect dimensions and measures separately so dimensions come first
    # in the column order regardless of the order the graph yields them.
    dims = []
    meas = []
    for spec in g.objects(dsd, QB.component):
        dim = g.value(spec, QB.dimension)
        if dim is not None:
            dims.append(dim)
            continue
        mea = g.value(spec, QB.measure)
        if mea is not None:
            meas.append(mea)
    comps = dims + meas

    columns = [_column_label(g, comp) for comp in comps]
    data = [
        [g.value(obs, comp) for comp in comps]
        for obs in g.subjects(QB.dataSet, dataset)
    ]
    return pd.DataFrame(data, columns=columns)


def _column_label(g, component) -> str:
    """Return a plain-str column label for one cube component."""
    # Prefer a human-readable annotation from the graph over the raw URI.
    for predicate in (RDFS.label, DCT.title):
        label = g.value(component, predicate)
        if label is not None:
            # Convert to plain str: rdflib Literal objects do not compare
            # equal to ordinary strings, which would make df["label"]
            # lookups unreliable and mix types with the URI fallback below.
            return str(label)
    return short_name(str(component))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment