Last active
May 11, 2024 15:17
-
-
Save Zsailer/6da0dc3c97ec873685b7fe58e52d36d7 to your computer and use it in GitHub Desktop.
Define and validate schema.org structured data in Python with Pydantic
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Define and validate schema.org structured data in Python with Pydantic " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pydantic\n", | |
"from pydantic import BaseModel, Schema\n", | |
"from pydantic.main import MetaModel\n", | |
"from schemaorg import main as schemaorg" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class Thing(BaseModel):\n", | |
" \"\"\"The most generic type of item.\"\"\"\n", | |
" \n", | |
" # Need to define extra items at the top level \n", | |
" class Config:\n", | |
" title = 'Thing'\n", | |
" schema_extra = {\n", | |
" '$schema': 'https://schema.org',\n", | |
" '$id': 'https://schema.org/Thing',\n", | |
" }\n", | |
" \n", | |
" additionalType: str = Schema(\n", | |
" ...,\n", | |
" title='additionalType',\n", | |
" description=(\n", | |
" \"An additional type for the item, typically \"\n", | |
" \"used for adding more specific types from \"\n", | |
" \"external vocabularies in microdata syntax. \"\n", | |
" \"This is a relationship between something and \"\n", | |
" \"a class that the thing is in. In RDFa syntax, \"\n", | |
" \"it is better to use the native RDFa syntax - \"\n", | |
" \"the 'typeof' attribute - for multiple types. \"\n", | |
" \"Schema.org tools may have only weaker \"\n", | |
" \"understanding of extra types, in particular \"\n", | |
" \"those defined externally.\"\n", | |
" )\n", | |
" )\n", | |
" \n", | |
" alternateName: str = Schema(\n", | |
" ...,\n", | |
" title='alternateName',\n", | |
" description=\"An alias for the item.\"\n", | |
" )\n", | |
" \n", | |
" description: str = Schema(\n", | |
" ...,\n", | |
" description=\"A description of the item.\"\n", | |
" )\n", | |
" \n", | |
" disambiguatingDescription: str = Schema(\n", | |
" ...,\n", | |
" description=\"A sub property of description. A short description of the item used to disambiguate from other, similar items. Information from other properties (in particular, name) may be necessary for the description to be useful for disambiguation.\"\n", | |
" )" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"I want to enforce the attributes in the `Config` accessor, *and* I think the accessor syntax is a bit ugly. Let's see if we can make it go away." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"class MetaSchema(MetaModel):\n", | |
" \"\"\"Metaclass that checks for three required class attributes:\n", | |
" 1. _id: the ID of the event\n", | |
" 2. _version: the version of the current schema.\n", | |
" 3. _title: the name of the schema.\n", | |
"\n", | |
" These attribute are mapped to pydantic.BaseModel's `Config` inner class\n", | |
" for proper schema generation+validation.\n", | |
" \"\"\"\n", | |
" def __new__(cls, name, base, dct):\n", | |
" # Check that required keys are found.\n", | |
" if not all((key in dct for key in ['_id', '_title', '_version', '_schema'])):\n", | |
" raise AttributeError('Required class attributes are missing from the {} class.'.format(name))\n", | |
"\n", | |
" # Check that keys are the proper types.\n", | |
" if not all((\n", | |
" type(dct['_id']) in (str, type(None)),\n", | |
" type(dct['_version']) in (float, type(None)),\n", | |
" type(dct['_title']) in (str, type(None)),\n", | |
" type(dct['_schema']) in (str, type(None)),\n", | |
" )):\n", | |
" raise TypeError('Check the class attributes types: \"_id\" must be a string, '\n", | |
" '\"_version\" must be an integer, and \"_title\" must be a string.')\n", | |
"\n", | |
" # Add a Config inner class to this Pydantic model.\n", | |
" class Config:\n", | |
" title = dct['_title']\n", | |
" schema_extra = {\n", | |
" '$id': dct['_id'],\n", | |
" '$schema': dct['_schema'],\n", | |
" 'version': dct['_version']\n", | |
" }\n", | |
"\n", | |
" dct['Config'] = Config\n", | |
" return super(MetaSchema, cls).__new__(cls, name, base, dct)\n", | |
"\n", | |
"\n", | |
"class JsonSchema(pydantic.BaseModel, metaclass=MetaSchema):\n", | |
" \"\"\"A pydantic base Model for JSON schemas.\"\"\"\n", | |
" _id: str = None\n", | |
" _version: float = None\n", | |
" _title: str = None\n", | |
" _schema: str = None" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class Thing(JsonSchema):\n", | |
" \"\"\"The most generic type of item.\"\"\"\n", | |
" # Define top level attributes\n", | |
" _id = 'https://schema.org/Thing'\n", | |
" _version = 3.9\n", | |
" _title = 'Thing'\n", | |
" _schema = 'https://schema.org'\n", | |
"\n", | |
" additionalType: str = Schema(\n", | |
" ...,\n", | |
" title='additionalType',\n", | |
" description=(\n", | |
" \"An additional type for the item, typically \"\n", | |
" \"used for adding more specific types from \"\n", | |
" \"external vocabularies in microdata syntax. \"\n", | |
" \"This is a relationship between something and \"\n", | |
" \"a class that the thing is in. In RDFa syntax, \"\n", | |
" \"it is better to use the native RDFa syntax - \"\n", | |
" \"the 'typeof' attribute - for multiple types. \"\n", | |
" \"Schema.org tools may have only weaker \"\n", | |
" \"understanding of extra types, in particular \"\n", | |
" \"those defined externally.\"\n", | |
" )\n", | |
" )\n", | |
" \n", | |
" alternateName: str = Schema(\n", | |
" ...,\n", | |
" title='alternateName',\n", | |
" description=\"An alias for the item.\"\n", | |
" )\n", | |
" \n", | |
" description: str = Schema(\n", | |
" ...,\n", | |
" description=\"A description of the item.\"\n", | |
" )\n", | |
" \n", | |
" disambiguatingDescription: str = Schema(\n", | |
" ...,\n", | |
" description=\"A sub property of description. A short description of the item used to disambiguate from other, similar items. Information from other properties (in particular, name) may be necessary for the description to be useful for disambiguation.\"\n", | |
" )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'title': 'Thing',\n", | |
" 'description': 'The most generic type of item.',\n", | |
" 'type': 'object',\n", | |
" 'properties': {'additionalType': {'title': 'additionalType',\n", | |
" 'description': \"An additional type for the item, typically used for adding more specific types from external vocabularies in microdata syntax. This is a relationship between something and a class that the thing is in. In RDFa syntax, it is better to use the native RDFa syntax - the 'typeof' attribute - for multiple types. Schema.org tools may have only weaker understanding of extra types, in particular those defined externally.\",\n", | |
" 'type': 'string'},\n", | |
" 'alternateName': {'title': 'alternateName',\n", | |
" 'description': 'An alias for the item.',\n", | |
" 'type': 'string'},\n", | |
" 'description': {'title': 'Description',\n", | |
" 'description': 'A description of the item.',\n", | |
" 'type': 'string'},\n", | |
" 'disambiguatingDescription': {'title': 'Disambiguatingdescription',\n", | |
" 'description': 'A sub property of description. A short description of the item used to disambiguate from other, similar items. Information from other properties (in particular, name) may be necessary for the description to be useful for disambiguation.',\n", | |
" 'type': 'string'}},\n", | |
" 'required': ['additionalType',\n", | |
" 'alternateName',\n", | |
" 'description',\n", | |
" 'disambiguatingDescription'],\n", | |
" '$id': 'https://schema.org/Thing',\n", | |
" '$schema': 'https://schema.org',\n", | |
" 'version': 3.9}" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"Thing.schema()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Validate a new object" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"What happens when we create an invalid object?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Object did not validate.\n" | |
] | |
} | |
], | |
"source": [ | |
"try: \n", | |
" thing = Thing()\n", | |
"except pydantic.ValidationError:\n", | |
" print(\"Object did not validate.\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Now let's try a valid object..." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"thing = Thing(\n", | |
" alternateName='New Thing',\n", | |
" description='This is a new thing',\n", | |
" disambiguatingDescription='This thing is unique.',\n", | |
" additionalType='No additional type'\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"No error was raised." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Autogenerate pydantic objects from schema.org" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class SchemaOrg(MetaSchema):\n", | |
" \n", | |
" def __new__(cls, name, base, dct):\n", | |
" annotations = {}\n", | |
" \n", | |
" data = schemaorg.Schema(name)\n", | |
" \n", | |
" dct = dict(\n", | |
" _title=name,\n", | |
" _id=data.id,\n", | |
" _version=float(data.version),\n", | |
" _schema=data.base,\n", | |
" __doc__=data.comment,\n", | |
" __annotations__={}\n", | |
" )\n", | |
"\n", | |
" # Currently, sets all class variables to type==str for\n", | |
" # demostration purposes.\n", | |
" # Need to develop datatypes for Schema.org objects.\n", | |
" for key, val in data._properties.items():\n", | |
" dct[key] = Schema(\n", | |
" ...,\n", | |
" description=val['comment'],\n", | |
" title=val['label']\n", | |
" )\n", | |
" dct['__annotations__'][key] = str\n", | |
"\n", | |
" base = (BaseModel,) + base\n", | |
" \n", | |
" return super(SchemaOrg, cls).__new__(cls, name, base, dct)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Specification base set to http://www.schema.org\n", | |
"Using Version 3.5\n", | |
"Found http://www.schema.org/Thing\n", | |
"Thing: found 12 properties\n", | |
"Specification base set to http://www.schema.org\n", | |
"Using Version 3.5\n", | |
"Found http://www.schema.org/Event\n", | |
"Event: found 47 properties\n" | |
] | |
} | |
], | |
"source": [ | |
"class Thing(metaclass=SchemaOrg): pass\n", | |
"class Event(metaclass=SchemaOrg): pass" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Specification base set to http://www.schema.org\n", | |
"Using Version 3.5\n", | |
"Found http://www.schema.org/Person\n", | |
"Person: found 69 properties\n" | |
] | |
} | |
], | |
"source": [ | |
"class Person(metaclass=SchemaOrg): pass" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "omnipotent (Python 3.7)", | |
"language": "python", | |
"name": "omnipotent" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment