Last active
May 22, 2022 09:57
-
-
Save xflr6/2c6ddf6cda55313bc8d4fbdc65a88ca4 to your computer and use it in GitHub Desktop.
Read pandas.DataFrame from SPARQL query
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "1d935d67-aab3-49d1-b0e3-d82acd9e3329", | |
"metadata": {}, | |
"source": [ | |
"# pandas `read_sparql_query()`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "5ebd1ea3-c44f-4b56-9ab9-58f58620f643", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from __future__ import annotations\n", | |
"\n", | |
"import datetime\n", | |
"import decimal\n", | |
"import distutils.util\n", | |
"import functools\n", | |
"import io\n", | |
"import logging\n", | |
"import types\n", | |
"import urllib.parse\n", | |
"import urllib.request\n", | |
"import warnings\n", | |
"import xml.etree.ElementTree as etree\n", | |
"\n", | |
"import pandas as pd\n", | |
"import rdflib\n", | |
"import SPARQLWrapper as sparqlwrapper" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "01c3b56a-7318-4999-8a5c-a2464921bbbb", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ENDPOINT = 'https://query.wikidata.org/sparql'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "32fcfb65-45d5-4467-b88c-e59ea43890a1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"QUERY = '''\n", | |
"SELECT\n", | |
" ?glottocode\n", | |
" (strafter(str(?languoid), str(wd:)) AS ?qid)\n", | |
" (?languoidLabel AS ?name)\n", | |
" (strafter(str(?siteLink), \"https://en.wikipedia.org/wiki/\") AS ?title)\n", | |
"WHERE {\n", | |
" ?languoid wdt:P1394 ?glottocode.\n", | |
" FILTER (REGEX(?glottocode, \"^[a-z0-9]{4}[0-9]{4}$\")).\n", | |
" OPTIONAL {\n", | |
" ?siteLink schema:about ?languoid;\n", | |
" schema:inLanguage \"en\";\n", | |
" schema:isPartOf <https://en.wikipedia.org/>.\n", | |
" }\n", | |
" SERVICE wikibase:label {\n", | |
" bd:serviceParam wikibase:language \"en\".\n", | |
" ?languoid rdfs:label ?languoidLabel.\n", | |
" }\n", | |
"}\n", | |
"ORDER BY\n", | |
" ?glottocode\n", | |
" xsd:integer(strafter(str(?languoid), str(wd:Q)))\n", | |
"LIMIT 15000\n", | |
"'''.strip()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "4ad0ee4a-44d0-4e24-a2c0-5be14d846795", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"TEST_QUERY = '''\n", | |
"SELECT ?index ?string ?integer ?boolean ?decimal ?double ?float ?datetime ?date ?time\n", | |
"WHERE {\n", | |
" VALUES ?index { 0 }\n", | |
" VALUES ?string { \"spam\" \"eggs\"^^xsd:string }\n", | |
" VALUES ?integer { 7 \"42\"^^xsd:integer }\n", | |
" VALUES ?boolean { true false }\n", | |
" VALUES ?decimal { 1.3 }\n", | |
" VALUES ?double { 1.0e6 }\n", | |
" VALUES ?float { \"6.275\"^^xsd:float }\n", | |
" VALUES ?datetime { \"2005-04-04T04:04:04\"^^xsd:dateTime }\n", | |
" VALUES ?date { \"2001-01-01\"^^xsd:date }\n", | |
" VALUES ?time { \"18:30\"^^xsd:time }\n", | |
"}\n", | |
"'''.strip()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "beb198ed-6fc0-4a53-ba57-49f4cb9c060d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"LOGLEVEL = logging.INFO" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "3bea886d-f8e3-49e0-9b9a-8113f33f6035", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"logging.basicConfig(format='[%(levelname)s@%(name)s] %(message)s', level=LOGLEVEL, force=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "86aede62-2837-4013-9801-3778a6c1a21b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class QueryPandas:\n", | |
"\n", | |
" def __init__(self, endpoint: str, **kwargs) -> None:\n", | |
" super().__init__(**kwargs)\n", | |
" self.endpoint=endpoint\n", | |
"\n", | |
" def __repr__(self) -> str:\n", | |
" return f'{self.__class__.__name__}({self.endpoint!r})'\n", | |
" \n", | |
" def query_pandas(self, sparql: str, **kwargs) -> pd.DataFrame:\n", | |
" result = self.query(sparql)\n", | |
"\n", | |
" df = self.read_sparql_query_result(result, **kwargs)\n", | |
"\n", | |
" with io.StringIO() as f:\n", | |
" df.info(buf=f, memory_usage='deep')\n", | |
" logging.debug(f.getvalue())\n", | |
"\n", | |
" return df\n", | |
"\n", | |
" def query(self, sparql: str):\n", | |
" raise NotImplementedError \n", | |
" \n", | |
" def read_sparql_query_result(self, result, **kwargs) -> pd.DataFrame:\n", | |
" raise NotImplementedError" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "44f9aadc-8d2c-4a59-a5b7-f58cc14db1d6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def pd_dataframe_from_records(records, **kwargs) -> pd.DataFrame:\n", | |
" kwargs.setdefault('coerce_float', True)\n", | |
" logging.info('pandas.DataFrame.from_records(%r, **%r)', records, kwargs)\n", | |
" return pd.DataFrame.from_records(records, **kwargs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "a96dcb4d-ae5a-4b84-b0dd-5ea6bfc859c1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def pd_read_csv(data, **kwargs) -> pd.DataFrame:\n", | |
" kwargs.setdefault('na_values', '')\n", | |
" kwargs.setdefault('keep_default_na', False)\n", | |
"\n", | |
" if kwargs.get('encoding') is None:\n", | |
" content_type = data.info()['content-type']\n", | |
" kwargs['encoding'] = get_encoding(content_type)\n", | |
" logging.debug('encoding: %r', kwargs['encoding'])\n", | |
"\n", | |
" logging.info('pandas.read_csv(%r, **%r)', data, kwargs)\n", | |
" return pd.read_csv(data, **kwargs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "3366b708-968f-4e85-8c25-6cb0819e0ab2", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def get_encoding(content_type: str) -> str: \n", | |
" _, sep, encoding = content_type.partition(';charset=')\n", | |
" assert sep and encoding\n", | |
" return encoding" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "bea6da0c-6b5e-4998-b97c-002af7e41e89", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def pd_json_normalize(jsondata, **kwargs) -> pd.DataFrame:\n", | |
" kwargs.setdefault('record_path', ['results', 'bindings'])\n", | |
" logging.info('pandas.json_normalize(jsondata, **%r)', kwargs)\n", | |
" return pd.json_normalize(jsondata, **kwargs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "9f0ca230-7848-4ec1-92b4-70389cfcca02", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def pipe_info(df) -> pd.DataFrame:\n", | |
" df.info(memory_usage='deep')\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "be2d9827-0273-462c-b712-7d9d276694ee", | |
"metadata": {}, | |
"source": [ | |
"## urllib" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "03b875f6-397f-41a6-9479-d9889013b66e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class QueryUrllib(QueryPandas):\n", | |
"\n", | |
" method = 'GET'\n", | |
" \n", | |
" headers = None\n", | |
" \n", | |
" def query(self, sparql: str, *, method=None, headers=None):\n", | |
" logging.info('endpoint: %r', self.endpoint)\n", | |
" request_url = (urllib.parse.urlparse(self.endpoint)\n", | |
" ._replace(query=urllib.parse.urlencode({'query': sparql})))\n", | |
"\n", | |
" request = urllib.request.Request(request_url.geturl(),\n", | |
" method=method if method is not None else self.method,\n", | |
" headers=headers if headers is not None else self.headers)\n", | |
" logging.info('request: %r %r', request.method, request)\n", | |
" logging.debug('url: %r', request.full_url)\n", | |
"\n", | |
" response = urllib.request.urlopen(request)\n", | |
" logging.info('response: %r %r', response.code, response)\n", | |
" logging.info('content-type: %r', response.headers['content-type'])\n", | |
" logging.debug('headers: %r', dict(response.headers))\n", | |
"\n", | |
" return types.SimpleNamespace(response=response, headers=response.headers, info=response.info)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "4c85659a-a85d-4cd1-8d41-74346c44106e", | |
"metadata": { | |
"tags": [] | |
}, | |
"source": [ | |
"### ``CsvUrllibStrategy``" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "e10dfe89-4693-4409-8993-7e73f95c2718", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def read_sparql_query_csv(result, **kwargs) -> pd.DataFrame:\n", | |
" return pd_read_csv(result.response, **kwargs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "aebf75f8-9a76-400a-8b03-af6f232816a5", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class CsvUrllibStrategy(QueryUrllib):\n", | |
"\n", | |
" headers = {'Accept': 'text/csv'}\n", | |
"\n", | |
" read_sparql_query_result = staticmethod(read_sparql_query_csv)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "a6b910e8-b162-4922-81ca-30dcf2711208", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] request: 'GET' <urllib.request.Request object at 0x000001E1EEB98F40>\n", | |
"[INFO@root] response: 200 <http.client.HTTPResponse object at 0x000001E1EEB9A590>\n", | |
"[INFO@root] content-type: 'text/csv;charset=utf-8'\n", | |
"[INFO@root] pandas.read_csv(<http.client.HTTPResponse object at 0x000001E1EEB9A590>, **{'index_col': 'index', 'na_values': '', 'keep_default_na': False, 'encoding': 'utf-8'})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 8 entries, 0 to 0\n", | |
"Data columns (total 9 columns):\n", | |
" # Column Non-Null Count Dtype \n", | |
"--- ------ -------------- ----- \n", | |
" 0 string 8 non-null object \n", | |
" 1 integer 8 non-null int64 \n", | |
" 2 boolean 8 non-null bool \n", | |
" 3 decimal 8 non-null float64 \n", | |
" 4 double 8 non-null float64 \n", | |
" 5 float 8 non-null float64 \n", | |
" 6 datetime 8 non-null datetime64[ns]\n", | |
" 7 date 8 non-null object \n", | |
" 8 time 8 non-null object \n", | |
"dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n", | |
"memory usage: 1.9 KB\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>string</th>\n", | |
" <th>integer</th>\n", | |
" <th>boolean</th>\n", | |
" <th>decimal</th>\n", | |
" <th>double</th>\n", | |
" <th>float</th>\n", | |
" <th>datetime</th>\n", | |
" <th>date</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>index</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>7</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>7</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>42</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>42</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>7</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>7</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>42</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>42</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" string integer boolean decimal double float \\\n", | |
"index \n", | |
"0 'spam' 7 True 1.3 1000000.0 6.275 \n", | |
"0 'spam' 7 False 1.3 1000000.0 6.275 \n", | |
"0 'spam' 42 True 1.3 1000000.0 6.275 \n", | |
"0 'spam' 42 False 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 7 True 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 7 False 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 42 True 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 42 False 1.3 1000000.0 6.275 \n", | |
"\n", | |
" datetime date time \n", | |
"index \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' " | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"(CsvUrllibStrategy(ENDPOINT).query_pandas(TEST_QUERY, index_col='index')\n", | |
" .astype({'datetime': 'datetime64'}).pipe(pipe_info).applymap(repr))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "911ce4dd-3ea8-4cb7-b552-0c3a21b2e2eb", | |
"metadata": {}, | |
"source": [ | |
"### ``XmlUrllibStrategy``" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "b915473b-14f1-4bd6-841c-56b1bce34192", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"SPARQL_RESULTS = 'http://www.w3.org/2005/sparql-results#'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "4e57a676-dd62-4bc3-94df-0c6c13db6f62", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def extract_ns(tag) -> str:\n", | |
" ns = tag.partition('{')[2].partition('}')[0]\n", | |
" assert tag.startswith(f'{{{ns}}}')\n", | |
" return ns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "87447a7e-d491-4008-ae06-00433b93dedb", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"NS = f'{{{SPARQL_RESULTS}}}'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "ff74fd77-d64e-4434-9eaf-488b3f75069d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def read_sparql_query_xml(result, *, encoding=None, **kwargs) -> pd.DataFrame:\n", | |
" if encoding is None:\n", | |
" content_type = result.info()['content-type']\n", | |
" encoding = get_encoding(content_type)\n", | |
" logging.debug('encoding: %r', encoding)\n", | |
"\n", | |
" pairs = etree.iterparse(result.response, events=('start', 'end'),\n", | |
" parser=etree.XMLParser(encoding=encoding))\n", | |
" _, root = next(pairs)\n", | |
" logging.info('xml: %r', root)\n", | |
"\n", | |
" ns = extract_ns(root.tag)\n", | |
" if ns != SPARQL_RESULTS:\n", | |
" raise ValueError(f'error: unknown xml namespace {ns!r} (expected: {SPARQL_RESULTS!r})')\n", | |
" ns = f'{{{ns}}}'\n", | |
"\n", | |
" sparql_root = f'{ns}sparql'\n", | |
" if root.tag != sparql_root:\n", | |
" raise ValueError(f'error: invalid xml root tag {root.tag!r} (expected: {sparql_root!r})')\n", | |
"\n", | |
" sparql_head = f'{ns}head'\n", | |
" head = next(elem for event, elem in pairs if event == 'end' and elem.tag == sparql_head)\n", | |
" variables = [variable.attrib['name'] for variable in head.findall(f'{ns}variable')]\n", | |
" kwargs.setdefault('columns', variables)\n", | |
"\n", | |
" records = iterrecords(root, pairs, variables=variables)\n", | |
" return pd_dataframe_from_records(records, **kwargs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"id": "5d3903f1-b6a8-4049-a079-15fe856357ba", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def iterrecords(root, pairs, *, variables, sparql_result: str = f'{NS}result'):\n", | |
" sparql_values = [f'{NS}binding[@name=\"{name}\"]/' for name in variables]\n", | |
" for event, elem in pairs:\n", | |
" if event != 'end':\n", | |
" pass\n", | |
" elif elem.tag == sparql_result: \n", | |
" value_elements = map(elem.find, sparql_values)\n", | |
" yield tuple(itervalues(value_elements))\n", | |
" root.clear()\n", | |
" elif elem.tag == root.tag:\n", | |
" assert next(pairs, None) is None\n", | |
" return" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"id": "32b4e966-a4dd-4d83-8ba3-7f2a93cb844e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def itervalues(value_elements, *,\n", | |
" sparql_literal: str = f'{NS}literal',\n", | |
" sparql_uri: str = f'{NS}uri',\n", | |
" sparql_bnode: str = f'{NS}bnode',):\n", | |
" for value_elem in value_elements:\n", | |
" if value_elem is None:\n", | |
" yield None\n", | |
" continue\n", | |
"\n", | |
" if value_elem.tag not in (sparql_literal, sparql_uri, sparql_bnode):\n", | |
" raise ValueError(f'invalid binding value tag: {value_elem.tag!r}')\n", | |
" value = value_elem.text\n", | |
" if value_elem.tag == sparql_literal and 'datatype' in value_elem.attrib:\n", | |
" datatype = value_elem.attrib['datatype']\n", | |
" if datatype is not None:\n", | |
" try:\n", | |
" parse_value = PARSE_FUNC[datatype]\n", | |
" except KeyError:\n", | |
" warnings.warn(f'cannot convert unknown datatype: {datatype!r}')\n", | |
" else:\n", | |
" value = parse_value(value)\n", | |
" yield value" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"id": "0b4f1fab-4f46-4c6b-b107-bffc6388f775", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"XSD = 'http://www.w3.org/2001/XMLSchema#'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"id": "2d68fcef-ef94-4b3a-9dc5-de49b212cce0", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"PARSE_FUNC = {f'{XSD}string': lambda x: x,\n", | |
" f'{XSD}integer': int,\n", | |
" f'{XSD}boolean': lambda x: bool(distutils.util.strtobool(x)),\n", | |
" f'{XSD}decimal': decimal.Decimal,\n", | |
" f'{XSD}double': float,\n", | |
" f'{XSD}float': float,\n", | |
" f'{XSD}dateTime': datetime.datetime.fromisoformat,\n", | |
" f'{XSD}date': datetime.date.fromisoformat,\n", | |
" f'{XSD}time': datetime.time.fromisoformat}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"id": "29bf61ef-0328-478a-a877-30fbd45eeeca", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class XmlUrllibStrategy(QueryUrllib):\n", | |
"\n", | |
" headers = {'Accept': 'application/sparql-results+xml'}\n", | |
"\n", | |
" read_sparql_query_result = staticmethod(read_sparql_query_xml)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"id": "95d5889f-e830-4b68-839d-5558d8fcc959", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] request: 'GET' <urllib.request.Request object at 0x000001E1EEC03C10>\n", | |
"[INFO@root] response: 200 <http.client.HTTPResponse object at 0x000001E1EEC01F60>\n", | |
"[INFO@root] content-type: 'application/sparql-results+xml;charset=utf-8'\n", | |
"[INFO@root] xml: <Element '{http://www.w3.org/2005/sparql-results#}sparql' at 0x000001E1EEC3DB70>\n", | |
"[INFO@root] pandas.DataFrame.from_records(<generator object iterrecords at 0x000001E1EEBABBC0>, **{'index': 'index', 'columns': ['index', 'string', 'integer', 'boolean', 'decimal', 'double', 'float', 'datetime', 'date', 'time'], 'coerce_float': True})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 8 entries, 0 to 0\n", | |
"Data columns (total 9 columns):\n", | |
" # Column Non-Null Count Dtype \n", | |
"--- ------ -------------- ----- \n", | |
" 0 string 8 non-null object \n", | |
" 1 integer 8 non-null int64 \n", | |
" 2 boolean 8 non-null bool \n", | |
" 3 decimal 8 non-null float64 \n", | |
" 4 double 8 non-null float64 \n", | |
" 5 float 8 non-null float64 \n", | |
" 6 datetime 8 non-null datetime64[ns]\n", | |
" 7 date 8 non-null object \n", | |
" 8 time 8 non-null object \n", | |
"dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n", | |
"memory usage: 1.5 KB\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>string</th>\n", | |
" <th>integer</th>\n", | |
" <th>boolean</th>\n", | |
" <th>decimal</th>\n", | |
" <th>double</th>\n", | |
" <th>float</th>\n", | |
" <th>datetime</th>\n", | |
" <th>date</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>index</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>7</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>7</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>42</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>42</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>7</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>7</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>42</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>42</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" string integer boolean decimal double float \\\n", | |
"index \n", | |
"0 'spam' 7 True 1.3 1000000.0 6.275 \n", | |
"0 'spam' 7 False 1.3 1000000.0 6.275 \n", | |
"0 'spam' 42 True 1.3 1000000.0 6.275 \n", | |
"0 'spam' 42 False 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 7 True 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 7 False 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 42 True 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 42 False 1.3 1000000.0 6.275 \n", | |
"\n", | |
" datetime date \\\n", | |
"index \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"\n", | |
" time \n", | |
"index \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) " | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"XmlUrllibStrategy(ENDPOINT).query_pandas(TEST_QUERY, index='index').pipe(pipe_info).applymap(repr)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a490428a-aeef-4a07-a886-66c992b031e9", | |
"metadata": {}, | |
"source": [ | |
"## rdflib" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"id": "6a78e26c-e9d9-408c-8726-35507a76f3af", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class SCHEMA(rdflib.SDO):\n", | |
" \"\"\"https://github.com/RDFLib/rdflib/issues/1120\"\"\"\n", | |
"\n", | |
" _NS = rdflib.Namespace(rdflib.SDO._NS.replace('https://', 'http://'))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"id": "7919a1c7-e917-4f55-b7e7-fd950226616b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def open_sparql_graph(endpoint: str, *, prefixes={'schema': SCHEMA}):\n", | |
" logging.info('endpoint: %r', endpoint)\n", | |
" graph = rdflib.ConjunctiveGraph('SPARQLStore')\n", | |
" graph.open(endpoint)\n", | |
" logging.info('graph: %s', graph)\n", | |
"\n", | |
" logging.info('prefixes: %r', prefixes)\n", | |
" for prefix, namespace in prefixes.items():\n", | |
" graph.namespace_manager.bind(prefix, namespace, replace=True)\n", | |
"\n", | |
" logging.debug('namespaces: %r', list(graph.namespaces()))\n", | |
" return graph" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "e2a7cf6d-7d0d-471d-b09c-43be532a2eaf", | |
"metadata": {}, | |
"source": [ | |
"### `rdflib.ConjunctiveGraph('SPARQLStore')`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"id": "3ad3eee6-0a03-4a77-9972-858521ded602", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def read_sparql_query_graph(result, **kwargs) -> pd.DataFrame:\n", | |
" variables = [v.toPython().removeprefix('?') for v in result.vars]\n", | |
" kwargs.setdefault('columns', variables)\n", | |
" records = ([v.toPython() if v is not None else None for v in values]\n", | |
" for values in result)\n", | |
" return pd_dataframe_from_records(records, **kwargs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"id": "771a038a-e21d-4415-b11a-720ce570b804", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class GraphXmlStrategy(QueryPandas):\n", | |
" \n", | |
" def __repr__(self) -> str:\n", | |
" return f'<{self.__class__.__name__} {self.endpoint!r} graph={self.graph!r}>'\n", | |
"\n", | |
" @functools.cached_property\n", | |
" def graph(self):\n", | |
" return open_sparql_graph(self.endpoint)\n", | |
"\n", | |
" def query(self, query: str):\n", | |
" result = self.graph.query(query)\n", | |
" logging.info('result: %r', result)\n", | |
" return result\n", | |
"\n", | |
" read_sparql_query_result = staticmethod(read_sparql_query_graph)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"id": "e711ce12-c947-4525-b8c5-e0ff656ff970", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] graph: [a rdflib:ConjunctiveGraph;rdflib:storage [a rdflib:Store;rdfs:label 'SPARQLStore']]\n", | |
"[INFO@root] prefixes: {'schema': Namespace(\"http://schema.org/\")}\n", | |
"[INFO@root] result: <rdflib.plugins.sparql.results.xmlresults.XMLResult object at 0x000001E1EEC99990>\n", | |
"[INFO@root] pandas.DataFrame.from_records(<generator object read_sparql_query_graph.<locals>.<genexpr> at 0x000001E1EF0D1310>, **{'index': 'index', 'columns': ['index', 'string', 'integer', 'boolean', 'decimal', 'double', 'float', 'datetime', 'date', 'time'], 'coerce_float': True})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 8 entries, 0 to 0\n", | |
"Data columns (total 9 columns):\n", | |
" # Column Non-Null Count Dtype \n", | |
"--- ------ -------------- ----- \n", | |
" 0 string 8 non-null object \n", | |
" 1 integer 8 non-null int64 \n", | |
" 2 boolean 8 non-null bool \n", | |
" 3 decimal 8 non-null float64 \n", | |
" 4 double 8 non-null float64 \n", | |
" 5 float 8 non-null float64 \n", | |
" 6 datetime 8 non-null datetime64[ns]\n", | |
" 7 date 8 non-null object \n", | |
" 8 time 8 non-null object \n", | |
"dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n", | |
"memory usage: 1.5 KB\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>string</th>\n", | |
" <th>integer</th>\n", | |
" <th>boolean</th>\n", | |
" <th>decimal</th>\n", | |
" <th>double</th>\n", | |
" <th>float</th>\n", | |
" <th>datetime</th>\n", | |
" <th>date</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>index</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>7</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>7</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>42</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>42</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>7</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>7</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>42</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>42</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" string integer boolean decimal double float \\\n", | |
"index \n", | |
"0 'spam' 7 True 1.3 1000000.0 6.275 \n", | |
"0 'spam' 7 False 1.3 1000000.0 6.275 \n", | |
"0 'spam' 42 True 1.3 1000000.0 6.275 \n", | |
"0 'spam' 42 False 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 7 True 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 7 False 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 42 True 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 42 False 1.3 1000000.0 6.275 \n", | |
"\n", | |
" datetime date \\\n", | |
"index \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"\n", | |
" time \n", | |
"index \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) " | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"GraphXmlStrategy(ENDPOINT).query_pandas(TEST_QUERY, index='index').pipe(pipe_info).applymap(repr)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6d30940d-4cde-44ee-a767-a98a538dbb08", | |
"metadata": {}, | |
"source": [ | |
"## SPARQLWrapper" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"id": "62fed472-860d-4ea5-bf37-0f93eaebffd8", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class QuerySPARQLWrapper:\n", | |
"\n", | |
" def __repr__(self) -> str:\n", | |
" return f'{self.__class__.__name__}({self.endpoint!r}, returnFormat={self.returnFormat!r})'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"id": "db4a18f8-d06a-473c-bded-65bfe509eefa", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class QueryService(QuerySPARQLWrapper, sparqlwrapper.SPARQLWrapper, QueryPandas):\n", | |
" \"\"\"Query service.\"\"\"\n", | |
"\n", | |
" def query(self, sparql: str):\n", | |
" logging.info('endpoint: %r', self.endpoint)\n", | |
" logging.info('returnFormat: %r', self.returnFormat)\n", | |
" if self.returnFormat in (sparqlwrapper.CSV, sparqlwrapper.TSV):\n", | |
" logging.info('%r.setOnlyConnreg(True)', self)\n", | |
" self.setOnlyConneg(True)\n", | |
" elif not self.supportsReturnFormat(self.returnFormat):\n", | |
" raise ValueError('unsupposted return format: %r', self.returnFormat)\n", | |
" self.setQuery(sparql)\n", | |
" result = super().query()\n", | |
" logging.info('result: %r', result)\n", | |
" headers = result.info()\n", | |
" logging.debug('headers: %r', headers)\n", | |
" logging.info('content_type: %r', headers['content-type'])\n", | |
" return result" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"id": "a3b5dd29-ca50-4088-a568-90bac6d604fc", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class JsonReturnFormatQueryService(QuerySPARQLWrapper, sparqlwrapper.SPARQLWrapper2, QueryPandas):\n", | |
" \"\"\"Query service with JSON return format.\"\"\"\n", | |
" \n", | |
" def query(self, sparql: str):\n", | |
" logging.info('endpoint: %r', self.endpoint)\n", | |
" self.setQuery(sparql)\n", | |
" result = super().query()\n", | |
" logging.info('result: %r', result)\n", | |
" logging.info('result.variables: %r', result.variables)\n", | |
" return result" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "ca4321ee-3a45-4c36-8a4c-617a13a5140b", | |
"metadata": {}, | |
"source": [ | |
"### `SPARQLWrapper.SPARQLWrapper(returnFormat='csv')`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"id": "fb82bc0a-8fea-4478-9eb7-8d7af008dbf0", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class CsvSwStrategy(QueryService):\n", | |
"\n", | |
" def __init__(self, endpoint: str, *, returnFormat=sparqlwrapper.CSV, **kwargs) -> None:\n", | |
" super().__init__(endpoint, returnFormat=returnFormat, **kwargs)\n", | |
"\n", | |
" read_sparql_query_result = staticmethod(read_sparql_query_csv)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"id": "0e59b728-e7dc-426a-8c85-cbd6fdc03c44", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] returnFormat: 'csv'\n", | |
"[INFO@root] CsvSwStrategy('https://query.wikidata.org/sparql', returnFormat='csv').setOnlyConnreg(True)\n", | |
"[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000001E1EF0FBB80>\n", | |
"[INFO@root] content_type: 'text/csv;charset=utf-8'\n", | |
"[INFO@root] pandas.read_csv(<http.client.HTTPResponse object at 0x000001E1EF0FBCD0>, **{'index_col': 'index', 'na_values': '', 'keep_default_na': False, 'encoding': 'utf-8'})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 8 entries, 0 to 0\n", | |
"Data columns (total 9 columns):\n", | |
" # Column Non-Null Count Dtype \n", | |
"--- ------ -------------- ----- \n", | |
" 0 string 8 non-null object \n", | |
" 1 integer 8 non-null int64 \n", | |
" 2 boolean 8 non-null bool \n", | |
" 3 decimal 8 non-null float64 \n", | |
" 4 double 8 non-null float64 \n", | |
" 5 float 8 non-null float64 \n", | |
" 6 datetime 8 non-null datetime64[ns]\n", | |
" 7 date 8 non-null object \n", | |
" 8 time 8 non-null object \n", | |
"dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n", | |
"memory usage: 1.9 KB\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>string</th>\n", | |
" <th>integer</th>\n", | |
" <th>boolean</th>\n", | |
" <th>decimal</th>\n", | |
" <th>double</th>\n", | |
" <th>float</th>\n", | |
" <th>datetime</th>\n", | |
" <th>date</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>index</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>7</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>7</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>42</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>42</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>7</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>7</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>42</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>42</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" string integer boolean decimal double float \\\n", | |
"index \n", | |
"0 'spam' 7 True 1.3 1000000.0 6.275 \n", | |
"0 'spam' 7 False 1.3 1000000.0 6.275 \n", | |
"0 'spam' 42 True 1.3 1000000.0 6.275 \n", | |
"0 'spam' 42 False 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 7 True 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 7 False 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 42 True 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 42 False 1.3 1000000.0 6.275 \n", | |
"\n", | |
" datetime date time \n", | |
"index \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' " | |
] | |
}, | |
"execution_count": 36, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"(CsvSwStrategy(ENDPOINT).query_pandas(TEST_QUERY, index_col='index')\n", | |
" .astype({'datetime': 'datetime64'}).pipe(pipe_info).applymap(repr))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "947aa733-b6ff-47cf-a862-d0ab23ac147c", | |
"metadata": {}, | |
"source": [ | |
"### `SPARQLWrapper.SPARQLWrapper(returnFormat='xml')`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"id": "64ffe095-4c87-4b80-bb93-077c18a39c5c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class XmlSwStrategy(QueryService):\n", | |
"\n", | |
" read_sparql_query_result = staticmethod(read_sparql_query_xml)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"id": "bcf515da-e4a7-45e2-b4f0-9c38fba6895b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] returnFormat: 'xml'\n", | |
"[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000001E1EF0FB670>\n", | |
"[INFO@root] content_type: 'application/sparql-results+xml;charset=utf-8'\n", | |
"[INFO@root] xml: <Element '{http://www.w3.org/2005/sparql-results#}sparql' at 0x000001E1EF12B060>\n", | |
"[INFO@root] pandas.DataFrame.from_records(<generator object iterrecords at 0x000001E1EF0D1D20>, **{'index': 'index', 'columns': ['index', 'string', 'integer', 'boolean', 'decimal', 'double', 'float', 'datetime', 'date', 'time'], 'coerce_float': True})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 8 entries, 0 to 0\n", | |
"Data columns (total 9 columns):\n", | |
" # Column Non-Null Count Dtype \n", | |
"--- ------ -------------- ----- \n", | |
" 0 string 8 non-null object \n", | |
" 1 integer 8 non-null int64 \n", | |
" 2 boolean 8 non-null bool \n", | |
" 3 decimal 8 non-null float64 \n", | |
" 4 double 8 non-null float64 \n", | |
" 5 float 8 non-null float64 \n", | |
" 6 datetime 8 non-null datetime64[ns]\n", | |
" 7 date 8 non-null object \n", | |
" 8 time 8 non-null object \n", | |
"dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n", | |
"memory usage: 1.5 KB\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>string</th>\n", | |
" <th>integer</th>\n", | |
" <th>boolean</th>\n", | |
" <th>decimal</th>\n", | |
" <th>double</th>\n", | |
" <th>float</th>\n", | |
" <th>datetime</th>\n", | |
" <th>date</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>index</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>7</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>7</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>42</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>42</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>7</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>7</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>42</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>42</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" string integer boolean decimal double float \\\n", | |
"index \n", | |
"0 'spam' 7 True 1.3 1000000.0 6.275 \n", | |
"0 'spam' 7 False 1.3 1000000.0 6.275 \n", | |
"0 'spam' 42 True 1.3 1000000.0 6.275 \n", | |
"0 'spam' 42 False 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 7 True 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 7 False 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 42 True 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 42 False 1.3 1000000.0 6.275 \n", | |
"\n", | |
" datetime date \\\n", | |
"index \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"\n", | |
" time \n", | |
"index \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) " | |
] | |
}, | |
"execution_count": 38, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"XmlSwStrategy(ENDPOINT).query_pandas(TEST_QUERY, index='index').pipe(pipe_info).applymap(repr)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "b167836d-de6c-4f99-a840-ff05cb41d623", | |
"metadata": {}, | |
"source": [ | |
"### `SPARQLWrapper.SPARQLWrapper2()`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"id": "69ea97b4-8814-45c6-ad0f-aeb27896a59a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def read_sparql_query_json_result(result, **kwargs) -> pd.DataFrame:\n", | |
" kwargs.setdefault('columns', result.variables)\n", | |
" records = (tuple(itervalues_json(binding, variables=result.variables))\n", | |
" for binding in result.bindings)\n", | |
" return pd_dataframe_from_records(records, **kwargs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"id": "688cfb61-97ba-4aa4-b794-146c00ff5b2c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def itervalues_json(binding, *, variables):\n", | |
" for v in variables:\n", | |
" if v not in binding:\n", | |
" yield None\n", | |
" continue\n", | |
"\n", | |
" value_dict = binding[v]\n", | |
" if value_dict.type not in ('literal', 'uri', 'bnode'):\n", | |
" raise ValueError(f'invalid binding value type {b.type!r}')\n", | |
" value = value_dict.value\n", | |
" if value_dict.type == 'literal' and value_dict.datatype is not None:\n", | |
" try:\n", | |
" parse_value = PARSE_FUNC[value_dict.datatype]\n", | |
" except KeyError:\n", | |
" warnings.warn('cannot convert unmatched datatype: {value_dict!r}')\n", | |
" else:\n", | |
" value = parse_value(value)\n", | |
" yield value" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"id": "fd10ccbd-65b2-4dcc-8fc5-2ac154ce3833", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class JsonResultStrategy(JsonReturnFormatQueryService):\n", | |
"\n", | |
" read_sparql_query_result = staticmethod(read_sparql_query_json_result)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"id": "b22dbc5a-cc0e-405b-ae2e-0ab3cf2e7fc6", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] result: <SPARQLWrapper.SmartWrapper.Bindings object at 0x000001E1EF0FB9D0>\n", | |
"[INFO@root] result.variables: ['index', 'string', 'integer', 'boolean', 'decimal', 'double', 'float', 'datetime', 'date', 'time']\n", | |
"[INFO@root] pandas.DataFrame.from_records(<generator object read_sparql_query_json_result.<locals>.<genexpr> at 0x000001E1EF0D2180>, **{'index': 'index', 'columns': ['index', 'string', 'integer', 'boolean', 'decimal', 'double', 'float', 'datetime', 'date', 'time'], 'coerce_float': True})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 8 entries, 0 to 0\n", | |
"Data columns (total 9 columns):\n", | |
" # Column Non-Null Count Dtype \n", | |
"--- ------ -------------- ----- \n", | |
" 0 string 8 non-null object \n", | |
" 1 integer 8 non-null int64 \n", | |
" 2 boolean 8 non-null bool \n", | |
" 3 decimal 8 non-null float64 \n", | |
" 4 double 8 non-null float64 \n", | |
" 5 float 8 non-null float64 \n", | |
" 6 datetime 8 non-null datetime64[ns]\n", | |
" 7 date 8 non-null object \n", | |
" 8 time 8 non-null object \n", | |
"dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n", | |
"memory usage: 1.5 KB\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>string</th>\n", | |
" <th>integer</th>\n", | |
" <th>boolean</th>\n", | |
" <th>decimal</th>\n", | |
" <th>double</th>\n", | |
" <th>float</th>\n", | |
" <th>datetime</th>\n", | |
" <th>date</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>index</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>7</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>7</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>42</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>42</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>7</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>7</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>42</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>42</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>datetime.date(2001, 1, 1)</td>\n", | |
" <td>datetime.time(18, 30)</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" string integer boolean decimal double float \\\n", | |
"index \n", | |
"0 'spam' 7 True 1.3 1000000.0 6.275 \n", | |
"0 'spam' 7 False 1.3 1000000.0 6.275 \n", | |
"0 'spam' 42 True 1.3 1000000.0 6.275 \n", | |
"0 'spam' 42 False 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 7 True 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 7 False 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 42 True 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 42 False 1.3 1000000.0 6.275 \n", | |
"\n", | |
" datetime date \\\n", | |
"index \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
"\n", | |
" time \n", | |
"index \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) \n", | |
"0 datetime.time(18, 30) " | |
] | |
}, | |
"execution_count": 42, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"JsonResultStrategy(ENDPOINT).query_pandas(TEST_QUERY, index='index').pipe(pipe_info).applymap(repr)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "d020a3fc-248c-4537-a0ad-17a387272863", | |
"metadata": {}, | |
"source": [ | |
"### `SPARQLWrapper.SPARQLWrapper(returnFormat='json')`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"id": "650f5ce1-b0c4-497d-9665-42409061e0b9", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def read_sparql_query_json_normalize(result, *, raw: bool = False, **kwargs) -> pd.DataFrame:\n", | |
" jsondata = result.convert()\n", | |
" logging.info(\"result['head']: %r\", jsondata['head'])\n", | |
" return _read_sparql_query_json_normalize(jsondata, raw=raw, **kwargs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"id": "25aaec36-4cbc-4c7c-8636-413079ecb93c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"PARSE_DTYPE = {f'{XSD}string': 'string',\n", | |
" f'{XSD}integer': 'int',\n", | |
" f'{XSD}boolean': 'bool',\n", | |
" f'{XSD}decimal': 'float',\n", | |
" f'{XSD}double': 'float',\n", | |
" f'{XSD}float': 'float',\n", | |
" f'{XSD}dateTime': 'datetime64',\n", | |
" f'{XSD}date': 'string',\n", | |
" f'{XSD}time': 'string'}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"id": "aa31d885-1a16-4964-9d33-560a3047bde6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def _read_sparql_query_json_normalize(jsondata, *, raw: bool, **kwargs) -> pd.DataFrame:\n", | |
" df = pd_json_normalize(jsondata, **kwargs)\n", | |
" if df.empty:\n", | |
" return df\n", | |
" if not raw:\n", | |
" columns = jsondata['head']['vars']\n", | |
" rename = {f'{c}.value': c for c in columns}\n", | |
" dtype = {name: PARSE_DTYPE[binding['datatype']]\n", | |
" if binding['type'] == 'literal' and 'datatype' in binding\n", | |
" else 'string'\n", | |
" for name, binding in jsondata['results']['bindings'][0].items()}\n", | |
" booleans = [name for name, d in dtype.items() if d in ('bool', 'boolean')]\n", | |
" df = df[list(rename)].rename(rename, axis='columns')\n", | |
" df[booleans] = (df[booleans]\n", | |
" .replace(r'^[Tt]rue$', True, regex=True)\n", | |
" .replace(r'^[Ff]alse$', False, regex=True))\n", | |
" return df.astype(dtype)\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"id": "b919c3be-569c-4b15-8256-c31598fded50", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class JsonNormalizeStrategy(QueryService):\n", | |
"\n", | |
" def __init__(self, endpoint, *, returnFormat=sparqlwrapper.JSON, **kwargs) -> None:\n", | |
" super().__init__(endpoint, returnFormat=returnFormat, **kwargs)\n", | |
"\n", | |
" read_sparql_query_result = staticmethod(read_sparql_query_json_normalize)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"id": "7069bf4b-c35e-4e13-a901-25a5f90cd7dc", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] returnFormat: 'json'\n", | |
"[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000001E1EEB99CF0>\n", | |
"[INFO@root] content_type: 'application/sparql-results+json;charset=utf-8'\n", | |
"[INFO@root] result['head']: {'vars': ['index', 'string', 'integer', 'boolean', 'decimal', 'double', 'float', 'datetime', 'date', 'time']}\n", | |
"[INFO@root] pandas.json_normalize(jsondata, **{'record_path': ['results', 'bindings']})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 8 entries, 0 to 0\n", | |
"Data columns (total 9 columns):\n", | |
" # Column Non-Null Count Dtype \n", | |
"--- ------ -------------- ----- \n", | |
" 0 string 8 non-null string \n", | |
" 1 integer 8 non-null int32 \n", | |
" 2 boolean 8 non-null bool \n", | |
" 3 decimal 8 non-null float64 \n", | |
" 4 double 8 non-null float64 \n", | |
" 5 float 8 non-null float64 \n", | |
" 6 datetime 8 non-null datetime64[ns]\n", | |
" 7 date 8 non-null string \n", | |
" 8 time 8 non-null string \n", | |
"dtypes: bool(1), datetime64[ns](1), float64(3), int32(1), string(3)\n", | |
"memory usage: 1.8 KB\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>string</th>\n", | |
" <th>integer</th>\n", | |
" <th>boolean</th>\n", | |
" <th>decimal</th>\n", | |
" <th>double</th>\n", | |
" <th>float</th>\n", | |
" <th>datetime</th>\n", | |
" <th>date</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>index</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>7</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>7</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>42</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'spam'</td>\n", | |
" <td>42</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>7</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>7</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>42</td>\n", | |
" <td>True</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'eggs'</td>\n", | |
" <td>42</td>\n", | |
" <td>False</td>\n", | |
" <td>1.3</td>\n", | |
" <td>1000000.0</td>\n", | |
" <td>6.275</td>\n", | |
" <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" string integer boolean decimal double float \\\n", | |
"index \n", | |
"0 'spam' 7 True 1.3 1000000.0 6.275 \n", | |
"0 'spam' 7 False 1.3 1000000.0 6.275 \n", | |
"0 'spam' 42 True 1.3 1000000.0 6.275 \n", | |
"0 'spam' 42 False 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 7 True 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 7 False 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 42 True 1.3 1000000.0 6.275 \n", | |
"0 'eggs' 42 False 1.3 1000000.0 6.275 \n", | |
"\n", | |
" datetime date time \n", | |
"index \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' " | |
] | |
}, | |
"execution_count": 47, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"JsonNormalizeStrategy(ENDPOINT).query_pandas(TEST_QUERY).set_index('index').pipe(pipe_info).applymap(repr)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"id": "ef657e62-7f89-413a-91a8-5bba2b3fefb8", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] returnFormat: 'json'\n", | |
"[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000001E1EEB998D0>\n", | |
"[INFO@root] content_type: 'application/sparql-results+json;charset=utf-8'\n", | |
"[INFO@root] result['head']: {'vars': ['index', 'string', 'integer', 'boolean', 'decimal', 'double', 'float', 'datetime', 'date', 'time']}\n", | |
"[INFO@root] pandas.json_normalize(jsondata, **{'sep': '/', 'record_path': ['results', 'bindings']})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Index: 8 entries, 0 to 0\n", | |
"Data columns (total 28 columns):\n", | |
" # Column Non-Null Count Dtype \n", | |
"--- ------ -------------- ----- \n", | |
" 0 index/datatype 8 non-null object\n", | |
" 1 index/type 8 non-null object\n", | |
" 2 string/type 8 non-null object\n", | |
" 3 string/value 8 non-null object\n", | |
" 4 integer/datatype 8 non-null object\n", | |
" 5 integer/type 8 non-null object\n", | |
" 6 integer/value 8 non-null object\n", | |
" 7 boolean/datatype 8 non-null object\n", | |
" 8 boolean/type 8 non-null object\n", | |
" 9 boolean/value 8 non-null object\n", | |
" 10 decimal/datatype 8 non-null object\n", | |
" 11 decimal/type 8 non-null object\n", | |
" 12 decimal/value 8 non-null object\n", | |
" 13 double/datatype 8 non-null object\n", | |
" 14 double/type 8 non-null object\n", | |
" 15 double/value 8 non-null object\n", | |
" 16 float/datatype 8 non-null object\n", | |
" 17 float/type 8 non-null object\n", | |
" 18 float/value 8 non-null object\n", | |
" 19 datetime/datatype 8 non-null object\n", | |
" 20 datetime/type 8 non-null object\n", | |
" 21 datetime/value 8 non-null object\n", | |
" 22 date/datatype 8 non-null object\n", | |
" 23 date/type 8 non-null object\n", | |
" 24 date/value 8 non-null object\n", | |
" 25 time/datatype 8 non-null object\n", | |
" 26 time/type 8 non-null object\n", | |
" 27 time/value 8 non-null object\n", | |
"dtypes: object(28)\n", | |
"memory usage: 16.7 KB\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>index/datatype</th>\n", | |
" <th>index/type</th>\n", | |
" <th>string/type</th>\n", | |
" <th>string/value</th>\n", | |
" <th>integer/datatype</th>\n", | |
" <th>integer/type</th>\n", | |
" <th>integer/value</th>\n", | |
" <th>boolean/datatype</th>\n", | |
" <th>boolean/type</th>\n", | |
" <th>boolean/value</th>\n", | |
" <th>...</th>\n", | |
" <th>float/value</th>\n", | |
" <th>datetime/datatype</th>\n", | |
" <th>datetime/type</th>\n", | |
" <th>datetime/value</th>\n", | |
" <th>date/datatype</th>\n", | |
" <th>date/type</th>\n", | |
" <th>date/value</th>\n", | |
" <th>time/datatype</th>\n", | |
" <th>time/type</th>\n", | |
" <th>time/value</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>index/value</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'spam'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'7'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'true'</td>\n", | |
" <td>...</td>\n", | |
" <td>'6.275'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2005-04-04T04:04:04'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'spam'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'7'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'false'</td>\n", | |
" <td>...</td>\n", | |
" <td>'6.275'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2005-04-04T04:04:04'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'spam'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'42'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'true'</td>\n", | |
" <td>...</td>\n", | |
" <td>'6.275'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2005-04-04T04:04:04'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'spam'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'42'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'false'</td>\n", | |
" <td>...</td>\n", | |
" <td>'6.275'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2005-04-04T04:04:04'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'eggs'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'7'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'true'</td>\n", | |
" <td>...</td>\n", | |
" <td>'6.275'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2005-04-04T04:04:04'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'eggs'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'7'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'false'</td>\n", | |
" <td>...</td>\n", | |
" <td>'6.275'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2005-04-04T04:04:04'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'eggs'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'42'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'true'</td>\n", | |
" <td>...</td>\n", | |
" <td>'6.275'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2005-04-04T04:04:04'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'eggs'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'42'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'false'</td>\n", | |
" <td>...</td>\n", | |
" <td>'6.275'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2005-04-04T04:04:04'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'2001-01-01'</td>\n", | |
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n", | |
" <td>'literal'</td>\n", | |
" <td>'18:30'</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>8 rows × 28 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" index/datatype index/type \\\n", | |
"index/value \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"\n", | |
" string/type string/value \\\n", | |
"index/value \n", | |
"0 'literal' 'spam' \n", | |
"0 'literal' 'spam' \n", | |
"0 'literal' 'spam' \n", | |
"0 'literal' 'spam' \n", | |
"0 'literal' 'eggs' \n", | |
"0 'literal' 'eggs' \n", | |
"0 'literal' 'eggs' \n", | |
"0 'literal' 'eggs' \n", | |
"\n", | |
" integer/datatype integer/type \\\n", | |
"index/value \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n", | |
"\n", | |
" integer/value boolean/datatype \\\n", | |
"index/value \n", | |
"0 '7' 'http://www.w3.org/2001/XMLSchema#boolean' \n", | |
"0 '7' 'http://www.w3.org/2001/XMLSchema#boolean' \n", | |
"0 '42' 'http://www.w3.org/2001/XMLSchema#boolean' \n", | |
"0 '42' 'http://www.w3.org/2001/XMLSchema#boolean' \n", | |
"0 '7' 'http://www.w3.org/2001/XMLSchema#boolean' \n", | |
"0 '7' 'http://www.w3.org/2001/XMLSchema#boolean' \n", | |
"0 '42' 'http://www.w3.org/2001/XMLSchema#boolean' \n", | |
"0 '42' 'http://www.w3.org/2001/XMLSchema#boolean' \n", | |
"\n", | |
" boolean/type boolean/value ... float/value \\\n", | |
"index/value ... \n", | |
"0 'literal' 'true' ... '6.275' \n", | |
"0 'literal' 'false' ... '6.275' \n", | |
"0 'literal' 'true' ... '6.275' \n", | |
"0 'literal' 'false' ... '6.275' \n", | |
"0 'literal' 'true' ... '6.275' \n", | |
"0 'literal' 'false' ... '6.275' \n", | |
"0 'literal' 'true' ... '6.275' \n", | |
"0 'literal' 'false' ... '6.275' \n", | |
"\n", | |
" datetime/datatype datetime/type \\\n", | |
"index/value \n", | |
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n", | |
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n", | |
"\n", | |
" datetime/value date/datatype \\\n", | |
"index/value \n", | |
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n", | |
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n", | |
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n", | |
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n", | |
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n", | |
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n", | |
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n", | |
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n", | |
"\n", | |
" date/type date/value time/datatype \\\n", | |
"index/value \n", | |
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n", | |
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n", | |
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n", | |
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n", | |
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n", | |
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n", | |
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n", | |
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n", | |
"\n", | |
" time/type time/value \n", | |
"index/value \n", | |
"0 'literal' '18:30' \n", | |
"0 'literal' '18:30' \n", | |
"0 'literal' '18:30' \n", | |
"0 'literal' '18:30' \n", | |
"0 'literal' '18:30' \n", | |
"0 'literal' '18:30' \n", | |
"0 'literal' '18:30' \n", | |
"0 'literal' '18:30' \n", | |
"\n", | |
"[8 rows x 28 columns]" | |
] | |
}, | |
"execution_count": 48, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"JsonNormalizeStrategy(ENDPOINT).query_pandas(TEST_QUERY, raw=True, sep='/').set_index('index/value').pipe(pipe_info).applymap(repr)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "23939e50-6d14-4cec-abdd-689dee23bf43", | |
"metadata": {}, | |
"source": [ | |
"## `read_sparql_query(sparql, *, endpoint, strategy)`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"id": "9bbca0b3-15e8-4e56-b147-f6316c7ad169", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class ReadSparqlQuery:\n", | |
"\n", | |
" @classmethod\n", | |
" @functools.lru_cache(maxsize=10)\n", | |
" def from_cache(cls, endpoint: str, *, strategy: str) -> ReadSparqlQuery:\n", | |
" inst = cls(endpoint, strategy=strategy)\n", | |
" logging.info('%r', inst)\n", | |
" return inst\n", | |
"\n", | |
" _strategies = {'csv': CsvUrllibStrategy,\n", | |
" 'csv_sw': CsvSwStrategy,\n", | |
" 'json': JsonResultStrategy,\n", | |
" 'json_normalize': JsonNormalizeStrategy,\n", | |
" 'xml': XmlUrllibStrategy,\n", | |
" 'xml_sw': XmlSwStrategy,\n", | |
" 'xml_rdflib': GraphXmlStrategy}\n", | |
"\n", | |
" def __init__(self, endpoint: str, *, strategy: str) -> None:\n", | |
" self.endpoint = endpoint\n", | |
" assert strategy in self._strategies\n", | |
" self._strategy = strategy\n", | |
"\n", | |
" def __repr__(self) -> str:\n", | |
" return f'{self.__class__.__name__}({self.endpoint!r}, strategy={self._strategy!r})'\n", | |
"\n", | |
" @functools.cached_property\n", | |
" def strategy(self):\n", | |
" strategy_cls = self._strategies[self._strategy]\n", | |
" strategy = strategy_cls(self.endpoint)\n", | |
" logging.info('strategy: %r', strategy)\n", | |
" return strategy\n", | |
"\n", | |
" def __call__(self, sparql: str, **kwargs) -> pd.DataFrame:\n", | |
" return self.strategy.query_pandas(sparql, **kwargs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"id": "4ecf6e63-5dba-48b1-8494-47977d822e7d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def read_sparql_query(sparql: str, *, convert_dtypes: bool = False,\n", | |
" endpoint: str, strategy: str, **kwargs) -> pd.DataFrame:\n", | |
" reader = ReadSparqlQuery.from_cache(endpoint, strategy=strategy)\n", | |
" df = reader(sparql, **kwargs)\n", | |
" if convert_dtypes:\n", | |
" return df.convert_dtypes()\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"id": "c40aa7a7-da7e-4a34-b85c-e742dc50baff", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"read_wikidata = functools.partial(read_sparql_query, QUERY, convert_dtypes=True, endpoint=ENDPOINT)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "5791761f-99c7-4fb0-aa61-848553864874", | |
"metadata": {}, | |
"source": [ | |
"### `strategy='csv'`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"id": "aa002249-49fe-49f7-acd9-8c2021c04139", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='csv')\n", | |
"[INFO@root] strategy: CsvUrllibStrategy('https://query.wikidata.org/sparql')\n", | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] request: 'GET' <urllib.request.Request object at 0x000001E1EF0FB640>\n", | |
"[INFO@root] response: 200 <http.client.HTTPResponse object at 0x000001E1EF0F9B10>\n", | |
"[INFO@root] content-type: 'text/csv;charset=utf-8'\n", | |
"[INFO@root] pandas.read_csv(<http.client.HTTPResponse object at 0x000001E1EF0F9B10>, **{'index_col': 'glottocode', 'na_values': '', 'keep_default_na': False, 'encoding': 'utf-8'})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: total: 15.6 ms\n", | |
"Wall time: 6.85 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>qid</th>\n", | |
" <th>name</th>\n", | |
" <th>title</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>glottocode</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>aant1238</th>\n", | |
" <td>Q31312216</td>\n", | |
" <td>Aantantara</td>\n", | |
" <td><NA></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1238</th>\n", | |
" <td>Q85516014</td>\n", | |
" <td>Aari-Gayil</td>\n", | |
" <td><NA></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1239</th>\n", | |
" <td>Q7495</td>\n", | |
" <td>Aari</td>\n", | |
" <td>Aari_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1240</th>\n", | |
" <td>Q4661732</td>\n", | |
" <td>Aariya</td>\n", | |
" <td>Aariya_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aasa1238</th>\n", | |
" <td>Q56620</td>\n", | |
" <td>Asa</td>\n", | |
" <td>Asa_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zumb1240</th>\n", | |
" <td>Q56252</td>\n", | |
" <td>Zumbun</td>\n", | |
" <td>Zumbun_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuni1245</th>\n", | |
" <td>Q10188</td>\n", | |
" <td>Zuni</td>\n", | |
" <td>Zuni_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuoj1238</th>\n", | |
" <td>Q13848149</td>\n", | |
" <td>Zuojiang Zhuang</td>\n", | |
" <td>Zuojiang_Zhuang_languages</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuri1238</th>\n", | |
" <td>Q248682</td>\n", | |
" <td>Zurich German</td>\n", | |
" <td>Z%C3%BCrich_German</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zyph1238</th>\n", | |
" <td>Q57004</td>\n", | |
" <td>Zyphe</td>\n", | |
" <td>Zyphe_language</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>10873 rows × 3 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" qid name title\n", | |
"glottocode \n", | |
"aant1238 Q31312216 Aantantara <NA>\n", | |
"aari1238 Q85516014 Aari-Gayil <NA>\n", | |
"aari1239 Q7495 Aari Aari_language\n", | |
"aari1240 Q4661732 Aariya Aariya_language\n", | |
"aasa1238 Q56620 Asa Asa_language\n", | |
"... ... ... ...\n", | |
"zumb1240 Q56252 Zumbun Zumbun_language\n", | |
"zuni1245 Q10188 Zuni Zuni_language\n", | |
"zuoj1238 Q13848149 Zuojiang Zhuang Zuojiang_Zhuang_languages\n", | |
"zuri1238 Q248682 Zurich German Z%C3%BCrich_German\n", | |
"zyph1238 Q57004 Zyphe Zyphe_language\n", | |
"\n", | |
"[10873 rows x 3 columns]" | |
] | |
}, | |
"execution_count": 52, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time read_wikidata(strategy='csv', index_col='glottocode')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6052a9c6-37a5-4331-9577-f0d1b9eaa7eb", | |
"metadata": {}, | |
"source": [ | |
"### `strategy='xml'`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"id": "eb73feff-a3cc-47e3-90c8-d1ff6a7ea564", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='xml')\n", | |
"[INFO@root] strategy: XmlUrllibStrategy('https://query.wikidata.org/sparql')\n", | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] request: 'GET' <urllib.request.Request object at 0x000001E1EF0FB640>\n", | |
"[INFO@root] response: 200 <http.client.HTTPResponse object at 0x000001E1EEB99810>\n", | |
"[INFO@root] content-type: 'application/sparql-results+xml;charset=utf-8'\n", | |
"[INFO@root] xml: <Element '{http://www.w3.org/2005/sparql-results#}sparql' at 0x000001E1F119F510>\n", | |
"[INFO@root] pandas.DataFrame.from_records(<generator object iterrecords at 0x000001E1F03ABE60>, **{'index': 'glottocode', 'columns': ['glottocode', 'qid', 'name', 'title'], 'coerce_float': True})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: total: 484 ms\n", | |
"Wall time: 18.7 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>qid</th>\n", | |
" <th>name</th>\n", | |
" <th>title</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>glottocode</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>aant1238</th>\n", | |
" <td>Q31312216</td>\n", | |
" <td>Aantantara</td>\n", | |
" <td><NA></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1238</th>\n", | |
" <td>Q85516014</td>\n", | |
" <td>Aari-Gayil</td>\n", | |
" <td><NA></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1239</th>\n", | |
" <td>Q7495</td>\n", | |
" <td>Aari</td>\n", | |
" <td>Aari_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1240</th>\n", | |
" <td>Q4661732</td>\n", | |
" <td>Aariya</td>\n", | |
" <td>Aariya_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aasa1238</th>\n", | |
" <td>Q56620</td>\n", | |
" <td>Asa</td>\n", | |
" <td>Asa_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zumb1240</th>\n", | |
" <td>Q56252</td>\n", | |
" <td>Zumbun</td>\n", | |
" <td>Zumbun_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuni1245</th>\n", | |
" <td>Q10188</td>\n", | |
" <td>Zuni</td>\n", | |
" <td>Zuni_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuoj1238</th>\n", | |
" <td>Q13848149</td>\n", | |
" <td>Zuojiang Zhuang</td>\n", | |
" <td>Zuojiang_Zhuang_languages</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuri1238</th>\n", | |
" <td>Q248682</td>\n", | |
" <td>Zurich German</td>\n", | |
" <td>Z%C3%BCrich_German</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zyph1238</th>\n", | |
" <td>Q57004</td>\n", | |
" <td>Zyphe</td>\n", | |
" <td>Zyphe_language</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>10873 rows × 3 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" qid name title\n", | |
"glottocode \n", | |
"aant1238 Q31312216 Aantantara <NA>\n", | |
"aari1238 Q85516014 Aari-Gayil <NA>\n", | |
"aari1239 Q7495 Aari Aari_language\n", | |
"aari1240 Q4661732 Aariya Aariya_language\n", | |
"aasa1238 Q56620 Asa Asa_language\n", | |
"... ... ... ...\n", | |
"zumb1240 Q56252 Zumbun Zumbun_language\n", | |
"zuni1245 Q10188 Zuni Zuni_language\n", | |
"zuoj1238 Q13848149 Zuojiang Zhuang Zuojiang_Zhuang_languages\n", | |
"zuri1238 Q248682 Zurich German Z%C3%BCrich_German\n", | |
"zyph1238 Q57004 Zyphe Zyphe_language\n", | |
"\n", | |
"[10873 rows x 3 columns]" | |
] | |
}, | |
"execution_count": 53, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time read_wikidata(strategy='xml', index='glottocode')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "426f9ea7-031c-47fb-89e0-04c17d6ed924", | |
"metadata": {}, | |
"source": [ | |
"### `strategy='xml_rdflib`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"id": "c7edcb9e-9a58-4166-acad-e7bf0f7884cb", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='xml_rdflib')\n", | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] graph: [a rdflib:ConjunctiveGraph;rdflib:storage [a rdflib:Store;rdfs:label 'SPARQLStore']]\n", | |
"[INFO@root] prefixes: {'schema': Namespace(\"http://schema.org/\")}\n", | |
"[INFO@root] strategy: <GraphXmlStrategy 'https://query.wikidata.org/sparql' graph=<Graph identifier=Ncb3cf8bf5f4b429fb9d63d2a1a090667 (<class 'rdflib.graph.ConjunctiveGraph'>)>>\n", | |
"[INFO@root] result: <rdflib.plugins.sparql.results.xmlresults.XMLResult object at 0x000001E1EF0F9C90>\n", | |
"[INFO@root] pandas.DataFrame.from_records(<generator object read_sparql_query_graph.<locals>.<genexpr> at 0x000001E1F3A80270>, **{'index': 'glottocode', 'columns': ['glottocode', 'qid', 'name', 'title'], 'coerce_float': True})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: total: 438 ms\n", | |
"Wall time: 14 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>qid</th>\n", | |
" <th>name</th>\n", | |
" <th>title</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>glottocode</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>aant1238</th>\n", | |
" <td>Q31312216</td>\n", | |
" <td>Aantantara</td>\n", | |
" <td><NA></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1238</th>\n", | |
" <td>Q85516014</td>\n", | |
" <td>Aari-Gayil</td>\n", | |
" <td><NA></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1239</th>\n", | |
" <td>Q7495</td>\n", | |
" <td>Aari</td>\n", | |
" <td>Aari_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1240</th>\n", | |
" <td>Q4661732</td>\n", | |
" <td>Aariya</td>\n", | |
" <td>Aariya_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aasa1238</th>\n", | |
" <td>Q56620</td>\n", | |
" <td>Asa</td>\n", | |
" <td>Asa_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zumb1240</th>\n", | |
" <td>Q56252</td>\n", | |
" <td>Zumbun</td>\n", | |
" <td>Zumbun_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuni1245</th>\n", | |
" <td>Q10188</td>\n", | |
" <td>Zuni</td>\n", | |
" <td>Zuni_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuoj1238</th>\n", | |
" <td>Q13848149</td>\n", | |
" <td>Zuojiang Zhuang</td>\n", | |
" <td>Zuojiang_Zhuang_languages</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuri1238</th>\n", | |
" <td>Q248682</td>\n", | |
" <td>Zurich German</td>\n", | |
" <td>Z%C3%BCrich_German</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zyph1238</th>\n", | |
" <td>Q57004</td>\n", | |
" <td>Zyphe</td>\n", | |
" <td>Zyphe_language</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>10873 rows × 3 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" qid name title\n", | |
"glottocode \n", | |
"aant1238 Q31312216 Aantantara <NA>\n", | |
"aari1238 Q85516014 Aari-Gayil <NA>\n", | |
"aari1239 Q7495 Aari Aari_language\n", | |
"aari1240 Q4661732 Aariya Aariya_language\n", | |
"aasa1238 Q56620 Asa Asa_language\n", | |
"... ... ... ...\n", | |
"zumb1240 Q56252 Zumbun Zumbun_language\n", | |
"zuni1245 Q10188 Zuni Zuni_language\n", | |
"zuoj1238 Q13848149 Zuojiang Zhuang Zuojiang_Zhuang_languages\n", | |
"zuri1238 Q248682 Zurich German Z%C3%BCrich_German\n", | |
"zyph1238 Q57004 Zyphe Zyphe_language\n", | |
"\n", | |
"[10873 rows x 3 columns]" | |
] | |
}, | |
"execution_count": 54, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time read_wikidata(strategy='xml_rdflib', index='glottocode')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "235dc1b7-0e6c-4703-b219-f720bf1e9979", | |
"metadata": {}, | |
"source": [ | |
"### `strategy='csv_sw'`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"id": "071cc12d-1dec-490e-b192-c7ce7c8ca274", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='csv_sw')\n", | |
"[INFO@root] strategy: CsvSwStrategy('https://query.wikidata.org/sparql', returnFormat='csv')\n", | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] returnFormat: 'csv'\n", | |
"[INFO@root] CsvSwStrategy('https://query.wikidata.org/sparql', returnFormat='csv').setOnlyConnreg(True)\n", | |
"[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000001E1EEC008E0>\n", | |
"[INFO@root] content_type: 'text/csv;charset=utf-8'\n", | |
"[INFO@root] pandas.read_csv(<http.client.HTTPResponse object at 0x000001E1EEC01D50>, **{'index_col': 'glottocode', 'na_values': '', 'keep_default_na': False, 'encoding': 'utf-8'})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: total: 15.6 ms\n", | |
"Wall time: 7.7 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>qid</th>\n", | |
" <th>name</th>\n", | |
" <th>title</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>glottocode</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>aant1238</th>\n", | |
" <td>Q31312216</td>\n", | |
" <td>Aantantara</td>\n", | |
" <td><NA></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1238</th>\n", | |
" <td>Q85516014</td>\n", | |
" <td>Aari-Gayil</td>\n", | |
" <td><NA></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1239</th>\n", | |
" <td>Q7495</td>\n", | |
" <td>Aari</td>\n", | |
" <td>Aari_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1240</th>\n", | |
" <td>Q4661732</td>\n", | |
" <td>Aariya</td>\n", | |
" <td>Aariya_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aasa1238</th>\n", | |
" <td>Q56620</td>\n", | |
" <td>Asa</td>\n", | |
" <td>Asa_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zumb1240</th>\n", | |
" <td>Q56252</td>\n", | |
" <td>Zumbun</td>\n", | |
" <td>Zumbun_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuni1245</th>\n", | |
" <td>Q10188</td>\n", | |
" <td>Zuni</td>\n", | |
" <td>Zuni_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuoj1238</th>\n", | |
" <td>Q13848149</td>\n", | |
" <td>Zuojiang Zhuang</td>\n", | |
" <td>Zuojiang_Zhuang_languages</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuri1238</th>\n", | |
" <td>Q248682</td>\n", | |
" <td>Zurich German</td>\n", | |
" <td>Z%C3%BCrich_German</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zyph1238</th>\n", | |
" <td>Q57004</td>\n", | |
" <td>Zyphe</td>\n", | |
" <td>Zyphe_language</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>10873 rows × 3 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" qid name title\n", | |
"glottocode \n", | |
"aant1238 Q31312216 Aantantara <NA>\n", | |
"aari1238 Q85516014 Aari-Gayil <NA>\n", | |
"aari1239 Q7495 Aari Aari_language\n", | |
"aari1240 Q4661732 Aariya Aariya_language\n", | |
"aasa1238 Q56620 Asa Asa_language\n", | |
"... ... ... ...\n", | |
"zumb1240 Q56252 Zumbun Zumbun_language\n", | |
"zuni1245 Q10188 Zuni Zuni_language\n", | |
"zuoj1238 Q13848149 Zuojiang Zhuang Zuojiang_Zhuang_languages\n", | |
"zuri1238 Q248682 Zurich German Z%C3%BCrich_German\n", | |
"zyph1238 Q57004 Zyphe Zyphe_language\n", | |
"\n", | |
"[10873 rows x 3 columns]" | |
] | |
}, | |
"execution_count": 55, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time read_wikidata(strategy='csv_sw', index_col='glottocode')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "e5eb79e1-5a40-4d66-9951-056fbf3e440d", | |
"metadata": {}, | |
"source": [ | |
"### `strategy='xml_sw'`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"id": "6d36993c-766f-431c-af72-b341afbd4689", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='xml_sw')\n", | |
"[INFO@root] strategy: XmlSwStrategy('https://query.wikidata.org/sparql', returnFormat='xml')\n", | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] returnFormat: 'xml'\n", | |
"[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000001E1EEB99E40>\n", | |
"[INFO@root] content_type: 'application/sparql-results+xml;charset=utf-8'\n", | |
"[INFO@root] xml: <Element '{http://www.w3.org/2005/sparql-results#}sparql' at 0x000001E1F3318540>\n", | |
"[INFO@root] pandas.DataFrame.from_records(<generator object iterrecords at 0x000001E1EF2D09E0>, **{'index': 'glottocode', 'columns': ['glottocode', 'qid', 'name', 'title'], 'coerce_float': True})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: total: 625 ms\n", | |
"Wall time: 8.97 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>qid</th>\n", | |
" <th>name</th>\n", | |
" <th>title</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>glottocode</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>aant1238</th>\n", | |
" <td>Q31312216</td>\n", | |
" <td>Aantantara</td>\n", | |
" <td><NA></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1238</th>\n", | |
" <td>Q85516014</td>\n", | |
" <td>Aari-Gayil</td>\n", | |
" <td><NA></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1239</th>\n", | |
" <td>Q7495</td>\n", | |
" <td>Aari</td>\n", | |
" <td>Aari_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1240</th>\n", | |
" <td>Q4661732</td>\n", | |
" <td>Aariya</td>\n", | |
" <td>Aariya_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aasa1238</th>\n", | |
" <td>Q56620</td>\n", | |
" <td>Asa</td>\n", | |
" <td>Asa_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zumb1240</th>\n", | |
" <td>Q56252</td>\n", | |
" <td>Zumbun</td>\n", | |
" <td>Zumbun_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuni1245</th>\n", | |
" <td>Q10188</td>\n", | |
" <td>Zuni</td>\n", | |
" <td>Zuni_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuoj1238</th>\n", | |
" <td>Q13848149</td>\n", | |
" <td>Zuojiang Zhuang</td>\n", | |
" <td>Zuojiang_Zhuang_languages</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuri1238</th>\n", | |
" <td>Q248682</td>\n", | |
" <td>Zurich German</td>\n", | |
" <td>Z%C3%BCrich_German</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zyph1238</th>\n", | |
" <td>Q57004</td>\n", | |
" <td>Zyphe</td>\n", | |
" <td>Zyphe_language</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>10873 rows × 3 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" qid name title\n", | |
"glottocode \n", | |
"aant1238 Q31312216 Aantantara <NA>\n", | |
"aari1238 Q85516014 Aari-Gayil <NA>\n", | |
"aari1239 Q7495 Aari Aari_language\n", | |
"aari1240 Q4661732 Aariya Aariya_language\n", | |
"aasa1238 Q56620 Asa Asa_language\n", | |
"... ... ... ...\n", | |
"zumb1240 Q56252 Zumbun Zumbun_language\n", | |
"zuni1245 Q10188 Zuni Zuni_language\n", | |
"zuoj1238 Q13848149 Zuojiang Zhuang Zuojiang_Zhuang_languages\n", | |
"zuri1238 Q248682 Zurich German Z%C3%BCrich_German\n", | |
"zyph1238 Q57004 Zyphe Zyphe_language\n", | |
"\n", | |
"[10873 rows x 3 columns]" | |
] | |
}, | |
"execution_count": 56, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time read_wikidata(strategy='xml_sw', index='glottocode')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "56b5fcd4-15d4-49f1-830d-3aabf15c80ca", | |
"metadata": {}, | |
"source": [ | |
"### `strategy='json'`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"id": "4bcf9fc1-2bd3-44ac-bc7d-771484626866", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='json')\n", | |
"[INFO@root] strategy: JsonResultStrategy('https://query.wikidata.org/sparql', returnFormat='json')\n", | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] result: <SPARQLWrapper.SmartWrapper.Bindings object at 0x000001E1EEB9B1F0>\n", | |
"[INFO@root] result.variables: ['glottocode', 'qid', 'name', 'title']\n", | |
"[INFO@root] pandas.DataFrame.from_records(<generator object read_sparql_query_json_result.<locals>.<genexpr> at 0x000001E1EF26C190>, **{'index': 'glottocode', 'columns': ['glottocode', 'qid', 'name', 'title'], 'coerce_float': True})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: total: 125 ms\n", | |
"Wall time: 5.3 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>qid</th>\n", | |
" <th>name</th>\n", | |
" <th>title</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>glottocode</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>aant1238</th>\n", | |
" <td>Q31312216</td>\n", | |
" <td>Aantantara</td>\n", | |
" <td><NA></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1238</th>\n", | |
" <td>Q85516014</td>\n", | |
" <td>Aari-Gayil</td>\n", | |
" <td><NA></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1239</th>\n", | |
" <td>Q7495</td>\n", | |
" <td>Aari</td>\n", | |
" <td>Aari_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1240</th>\n", | |
" <td>Q4661732</td>\n", | |
" <td>Aariya</td>\n", | |
" <td>Aariya_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aasa1238</th>\n", | |
" <td>Q56620</td>\n", | |
" <td>Asa</td>\n", | |
" <td>Asa_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zumb1240</th>\n", | |
" <td>Q56252</td>\n", | |
" <td>Zumbun</td>\n", | |
" <td>Zumbun_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuni1245</th>\n", | |
" <td>Q10188</td>\n", | |
" <td>Zuni</td>\n", | |
" <td>Zuni_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuoj1238</th>\n", | |
" <td>Q13848149</td>\n", | |
" <td>Zuojiang Zhuang</td>\n", | |
" <td>Zuojiang_Zhuang_languages</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuri1238</th>\n", | |
" <td>Q248682</td>\n", | |
" <td>Zurich German</td>\n", | |
" <td>Z%C3%BCrich_German</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zyph1238</th>\n", | |
" <td>Q57004</td>\n", | |
" <td>Zyphe</td>\n", | |
" <td>Zyphe_language</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>10873 rows × 3 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" qid name title\n", | |
"glottocode \n", | |
"aant1238 Q31312216 Aantantara <NA>\n", | |
"aari1238 Q85516014 Aari-Gayil <NA>\n", | |
"aari1239 Q7495 Aari Aari_language\n", | |
"aari1240 Q4661732 Aariya Aariya_language\n", | |
"aasa1238 Q56620 Asa Asa_language\n", | |
"... ... ... ...\n", | |
"zumb1240 Q56252 Zumbun Zumbun_language\n", | |
"zuni1245 Q10188 Zuni Zuni_language\n", | |
"zuoj1238 Q13848149 Zuojiang Zhuang Zuojiang_Zhuang_languages\n", | |
"zuri1238 Q248682 Zurich German Z%C3%BCrich_German\n", | |
"zyph1238 Q57004 Zyphe Zyphe_language\n", | |
"\n", | |
"[10873 rows x 3 columns]" | |
] | |
}, | |
"execution_count": 57, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time read_wikidata(strategy='json', index='glottocode')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "3aa86137-540b-49c0-bdbd-c5d98d269b24", | |
"metadata": {}, | |
"source": [ | |
"### `strategy='json_normalize'`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 58, | |
"id": "5642ca34-b0a3-49a9-a715-b3d2262c05cc", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='json_normalize')\n", | |
"[INFO@root] strategy: JsonNormalizeStrategy('https://query.wikidata.org/sparql', returnFormat='json')\n", | |
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
"[INFO@root] returnFormat: 'json'\n", | |
"[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000001E1EEB9B280>\n", | |
"[INFO@root] content_type: 'application/sparql-results+json;charset=utf-8'\n", | |
"[INFO@root] result['head']: {'vars': ['glottocode', 'qid', 'name', 'title']}\n", | |
"[INFO@root] pandas.json_normalize(jsondata, **{'record_path': ['results', 'bindings']})\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: total: 391 ms\n", | |
"Wall time: 4.53 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>qid</th>\n", | |
" <th>name</th>\n", | |
" <th>title</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>glottocode</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>aant1238</th>\n", | |
" <td>Q31312216</td>\n", | |
" <td>Aantantara</td>\n", | |
" <td><NA></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1238</th>\n", | |
" <td>Q85516014</td>\n", | |
" <td>Aari-Gayil</td>\n", | |
" <td><NA></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1239</th>\n", | |
" <td>Q7495</td>\n", | |
" <td>Aari</td>\n", | |
" <td>Aari_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aari1240</th>\n", | |
" <td>Q4661732</td>\n", | |
" <td>Aariya</td>\n", | |
" <td>Aariya_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>aasa1238</th>\n", | |
" <td>Q56620</td>\n", | |
" <td>Asa</td>\n", | |
" <td>Asa_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zumb1240</th>\n", | |
" <td>Q56252</td>\n", | |
" <td>Zumbun</td>\n", | |
" <td>Zumbun_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuni1245</th>\n", | |
" <td>Q10188</td>\n", | |
" <td>Zuni</td>\n", | |
" <td>Zuni_language</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuoj1238</th>\n", | |
" <td>Q13848149</td>\n", | |
" <td>Zuojiang Zhuang</td>\n", | |
" <td>Zuojiang_Zhuang_languages</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zuri1238</th>\n", | |
" <td>Q248682</td>\n", | |
" <td>Zurich German</td>\n", | |
" <td>Z%C3%BCrich_German</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>zyph1238</th>\n", | |
" <td>Q57004</td>\n", | |
" <td>Zyphe</td>\n", | |
" <td>Zyphe_language</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>10873 rows × 3 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" qid name title\n", | |
"glottocode \n", | |
"aant1238 Q31312216 Aantantara <NA>\n", | |
"aari1238 Q85516014 Aari-Gayil <NA>\n", | |
"aari1239 Q7495 Aari Aari_language\n", | |
"aari1240 Q4661732 Aariya Aariya_language\n", | |
"aasa1238 Q56620 Asa Asa_language\n", | |
"... ... ... ...\n", | |
"zumb1240 Q56252 Zumbun Zumbun_language\n", | |
"zuni1245 Q10188 Zuni Zuni_language\n", | |
"zuoj1238 Q13848149 Zuojiang Zhuang Zuojiang_Zhuang_languages\n", | |
"zuri1238 Q248682 Zurich German Z%C3%BCrich_German\n", | |
"zyph1238 Q57004 Zyphe Zyphe_language\n", | |
"\n", | |
"[10873 rows x 3 columns]" | |
] | |
}, | |
"execution_count": 58, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time read_wikidata(strategy='json_normalize').set_index('glottocode')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment