Skip to content

Instantly share code, notes, and snippets.

@tonyfast
Created January 25, 2025 07:16
Show Gist options
  • Save tonyfast/4b0acd39945adbc85744edd1cb449f9d to your computer and use it in GitHub Desktop.
Save tonyfast/4b0acd39945adbc85744edd1cb449f9d to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "bc87bb4d-16f9-4ed6-875b-5b786bce9402",
"metadata": {},
"outputs": [],
"source": [
"import polars\n",
"from nbformat import v4\n",
"\n",
"input = list(map(anyio.Path, glob.glob(str(pathlib.Path(\"~/Documents/syllabus/docs/*.ipynb\").expanduser()))))\n",
"files = polars.Series(\"path\", input).to_frame()\n",
"files = files.with_columns(\n",
" file=files[\"path\"].map_elements(lambda x: str(x._path), polars.String)\n",
")\n",
"\n",
"def enumerate_iterable(series, name=\"id\", start=0):\n",
" target_type = series.dtype.base_type()(polars.Struct(series.dtype.inner.fields + [polars.Field(name, polars.Int64)]))\n",
" return series.map_elements(lambda x: [{**body, name: i} for (i, body) in enumerate(x, start)], target_type)\n",
"\n",
"# https://github.com/jupyter/nbconvert/blob/5f508ebad9471876f53a59c737bd5f47b2b4c163/share/templates/base/display_priority.j2\n",
"display_priority = \"\"\"text/html text/markdown image/svg+xml image/png image/jpeg text/plain application/pdf\n",
"text/latex text/vnd.mermaid application/javascript application/vnd.jupyter.widget-view+json\"\"\".strip().split()\n",
"\n",
"async def read_text(path):\n",
" if isinstance(path, (anyio.Path, pathlib.Path)):\n",
" if path.suffix == \".ipynb\":\n",
" return await path.read_text()\n",
" elif path.suffix == \".md\":\n",
" return json.dumps(\n",
" v4.new_notebook(cells=[v4.new_markdown_cell((await path.read_text()).splitlines(True))])\n",
" )\n",
" elif path.suffix == \".py\":\n",
" return json.dumps(v4.new_notebook(cells=[v4.new_code_cell((await path.read_text()).splitlines(True))]))\n",
" return json.dumps(v4.new_notebook())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "d263521e-7f72-447e-bb12-e7cfc8ecaa87",
"metadata": {},
"outputs": [],
"source": [
"import bs4\n",
"Soup = partial(bs4.BeautifulSoup, features=\"lxml\")\n",
"TEMPLATE = Soup(Path(\"~/Documents/refnb/packages/refnb-core/refnb-core/index.html\").expanduser().read_text())\n",
"CELLS = TEMPLATE.select_one(\"template.cells\").select_one(\"tbody\")\n",
"CELL = TEMPLATE.select_one(\"template.cell\").select_one(\"tr\")\n",
"OUTPUTS = TEMPLATE.select_one(\"template.outputs\").select_one(\"details\")\n",
"OUTPUT = TEMPLATE.select_one(\"template.output\").select_one(\"tr\")\n",
"assert all((CELLS, CELL, OUTPUTS, OUTPUT)), \"bad selector\"\n",
"\n",
"def clone(el):\n",
" from bs4 import Tag, NavigableString\n",
" if isinstance(el, NavigableString): return type(el)(el)\n",
" copy = Tag(None, el.builder, el.name, el.namespace, el.nsprefix)\n",
" copy.attrs = dict(el.attrs)\n",
" for k, v in copy.attrs.items():\n",
" if isinstance(v, list):\n",
" copy[k] = [*v]\n",
" for attr in (\"can_be_empty_element\", \"hidden\"): setattr(copy, attr, getattr(el, attr))\n",
" for child in el.contents: copy.append(clone(child))\n",
" return copy"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "3e2d9459-5e40-4acf-8c6b-d04ef8699cef",
"metadata": {},
"outputs": [],
"source": [
"def display_dispatch(t, v, metadata):\n",
" if t == \"text/plain\":\n",
" yield \"\".join(v)\n",
" elif t == \"text/markdown\":\n",
" yield from Soup(get_markdown().render(\"\".join(v))).body.children\n",
" elif t == \"text/html\":\n",
" yield from Soup(\"\".join(v)).body.children\n",
" elif t.startswith(\"text\"):\n",
" # highlight form mimetype\n",
" yield highlight(\"\".join(v))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "eac11e7e-6ec9-4097-8f8f-ed3f93c81d9f",
"metadata": {},
"outputs": [],
"source": [
"IDREFS = (\"aria-labelledby aria-describedby aria-owns aria-controls for form id\".split())\n",
"def populate(nb):\n",
" tpl = clone(TEMPLATE)\n",
" table = tpl.select_one(\"main.notebook table.cells\")\n",
" footer = table.select_one(\"tfoot\")\n",
" cells = clone(CELLS)\n",
"\n",
" for cell in nb[\"cells\"]:\n",
" id = cell[\"id\"]\n",
" if cell[\"metadata\"].get(\"name\"):\n",
" id = cell[\"metadata\"][\"name\"]\n",
" row = clone(CELL)\n",
" row[\"class\"].append(cell[\"cell_type\"])\n",
" # link back to the document\n",
" row.select_one(\"th.doc a\").append(str(nb[\"file\"]))\n",
" row.select_one(\"th.cell a\").append(str(cell[\"cell\"]))\n",
" row.select_one(\"th.id input\").attrs[\"value\"] = id\n",
" row.select_one(\"td.execution_count output\").append(str(cell[\"execution_count\"] or \"\") )\n",
" row.select_one(F\"td.cell_type option[value={cell['cell_type']}]\").attrs[\"selected\"] = True\n",
" source = \"\".join(cell[\"source\"])\n",
" if cell[\"cell_type\"] == \"markdown\":\n",
" cell[\"outputs\"] = [dict(data={\"text/markdown\": source}, output_type=\"display_data\")]\n",
" row.select_one(\"td.source textarea\").append(source)\n",
" row.select_one(\"td.source section.highlight\").append(source)\n",
" row.select_one(\"td.form form\")\n",
" row.select_one(\"td.metadata\")\n",
" if cell.get(\"outputs\"):\n",
" details = clone(OUTPUTS)\n",
" outputs = details.select_one(\"table\")\n",
" for output in cell[\"outputs\"]:\n",
" if output[\"output_type\"] in {\"display_data\", \"execute_result\"}:\n",
" body = TEMPLATE.new_tag(\"tbody\")\n",
" body.attrs.setdefault(\"class\", []).append(output['output_type'])\n",
" for t in itertools.chain(\n",
" filter(output[\"data\"].__contains__, display_priority),\n",
" filter(lambda x: x not in display_priority, output[\"data\"])\n",
" ):\n",
" v = output[\"data\"][t] or \"<body></body>\"\n",
" # we can include ALL the bundles OR the preferred one\n",
"\n",
" entry = clone(OUTPUT)\n",
" entry.select_one(\"td.execution_count output\").append(str(cell[\"execution_count\"] or \"\") )\n",
" entry.select_one(\"td.output_type label\").append(t)\n",
" if t not in entry[\"class\"]: \n",
" entry[\"class\"].append(t)\n",
" \n",
" body.append(entry)\n",
" try:\n",
" entry.select_one(\"td.data\").extend(display_dispatch(t, v, output.get(\"metadata\", {})))\n",
" except Exception as e: \n",
" raise e\n",
" entry.select_one(\"td.metadata\")\n",
" body.append(entry)\n",
" outputs.append(body)\n",
" elif output[\"output_type\"] == \"stream\":\n",
" # stdout/stderr\n",
" entry = clone(OUTPUT)\n",
" entry[\"class\"] += F\" {output['output_type']}\"\n",
" entry.select_one(\"td.name\").append(output[\"name\"])\n",
" entry.select_one(\"td.text samp\").append(\"\".join(output[\"text\"]))\n",
" outputs.append(entry)\n",
" elif output[\"output_type\"] == \"error\":\n",
" entry = clone(OUTPUT)\n",
" entry[\"class\"] += F\" {output['output_type']}\"\n",
" entry.select_one(\"td.ename\").append(output[\"ename\"])\n",
" entry.select_one(\"td.evalue samp\").append(\"\".join(output[\"evalue\"]))\n",
" entry.select_one(\"td.traceback samp\").append(\"\".join(output[\"traceback\"]))\n",
" entry.select_one(\"td.execution_count output\").append(str(cell[\"execution_count\"] or \"\") )\n",
" outputs.append(entry)\n",
" row.select_one(\"td.outputs\").append(outputs)\n",
" \n",
"\n",
" if \"slide_type\" in cell[\"metadata\"]:\n",
" row[\"class\"].append(cell[\"metadata\"][\"slide_type\"])\n",
" if \"execution\" in cell[\"metadata\"]:\n",
" row.select_one(\"td.started_at time\").append(cell[\"metadata\"][\"execution\"][\"iopub.execute_input\"])\n",
" row.select_one(\"td.completed_at time\").append(cell[\"metadata\"][\"execution\"][\"iopub.execute_reply\"])\n",
" # do the math for the time\n",
" row.select_one(\"td.elapsed output time\")\n",
" if cell[\"metadata\"].get(\"collapsed\"):\n",
" row.select_one(\"td.outputs\")[\"class\"].append(\"collapsed\")\n",
" if cell[\"metadata\"].get(\"scrolled\"):\n",
" row.select_one(\"td.outputs\")[\"class\"].append(\"scrolled\")\n",
" if cell[\"metadata\"].get(\"jupyter\"):\n",
" if cell[\"metadata\"][\"jupyter\"].get(\"source_hidden\"):\n",
" row.select_one(\"td.source\")[\"hidden\"] = \"\"\n",
" if cell[\"metadata\"][\"jupyter\"].get(\"outputs_hidden\"):\n",
" row.select_one(\"td.outputs\")[\"hidden\"] = \"\"\n",
" row[\"class\"].extend(map(slugify.slugify, cell[\"metadata\"].get(\"tags\", \"\")))\n",
" \n",
" set_ids(row, id)\n",
" cells.append(row)\n",
" footer.insert_before(cells)\n",
" inject_toc(tpl)\n",
" return tpl"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "25ce0453-de81-4a43-a7d4-4407108c9f1f",
"metadata": {},
"outputs": [],
"source": [
"import slugify"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "e8366ea6-bacd-4bc6-b698-62b5a0b04058",
"metadata": {},
"outputs": [],
"source": [
"def inject_toc(document):\n",
" toc = table = document.select_one(\"table.toc.headings\")\n",
" ROW = table.select_one(\"template tr\")\n",
" tbody = TEMPLATE.new_tag(\"tbody\")\n",
" for h in document.select(\"table.cells h1,h2,h3,h4,h5,h6\"):\n",
" row = clone(ROW) \n",
" a = row.select_one(\"td.heading>a\")\n",
" heading = h.get_text()\n",
" if \"id\" not in h: h[\"id\"] = slugify.slugify(heading)\n",
" a.append(heading)\n",
" a[\"href\"] = \"#\" + h[\"id\"]\n",
" row.select_one(\"th.level\").append(h.name[1])\n",
" row.select_one(\"td.description>p\")\n",
" tbody.append(row)\n",
" table.append(tbody)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "b95ae161-508d-402f-bb5c-dd6b3a75de2d",
"metadata": {},
"outputs": [],
"source": [
"idref_selection = \",\".join(map(\"[{}]\".format, IDREFS))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "2a5eb1ec-7785-421b-bd7a-101b07eea936",
"metadata": {},
"outputs": [],
"source": [
"def set_ids(selection, id=\"\"):\n",
" for s in [selection] + selection.select(\",\".join(map(\"[{}]\".format, IDREFS))):\n",
" for idref in IDREFS:\n",
" if idref in s.attrs:\n",
" value = s.attrs[idref]\n",
" if value == \":\":\n",
" s[idref] = id\n",
" elif isinstance(value, str):\n",
" s[idref] = \" \".join((F\"{id}-{x[1:]}\" if x.startswith(\":\") else x) for x in value.split())"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "6ffa0606-3682-4fb2-95a5-d54a065d0c4b",
"metadata": {},
"outputs": [],
"source": [
"@functools.lru_cache(1)\n",
"def get_markdown():\n",
" from markdown_it import MarkdownIt\n",
" return MarkdownIt()\n",
"\n",
"def highlight(source, lang=\"python\", attrs=None):\n",
" import pygments\n",
" try:\n",
" return str(pygments.highlight(\n",
" source,\n",
" pygments.lexers.get_lexer_by_name(lang),\n",
" pygments.formatters.get_formatter_by_name(\"html5\")\n",
" )).pre\n",
" except:\n",
" return Soup(f\"\"\"<pre><code class=\"{lang}\">{html.escape(source)}</code></pre>\"\"\").pre"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "74ffa475-94ca-4c5c-a4b1-7835fb264786",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"contents = await asyncio.gather(*map(read_text, files[\"path\"]))\n",
"contents = polars.Series(\n",
" \"contents\", contents, strict=False\n",
").str.json_decode().struct.unnest().with_columns(\n",
" file=files[\"file\"]\n",
" # , path=files[\"path\"] # causes a panic cause its a python object\n",
")\n",
"contents = contents.with_columns(cells=enumerate_iterable(contents[\"cells\"], \"cell\", 1))\n",
"CONTENTS_COLUMNS = [*contents.columns]\n",
"contents = contents.with_columns(\n",
" contents.map_rows(lambda x: (populate(dict(zip(CONTENTS_COLUMNS, x))),)).rename({\"column_0\": \"html\"})\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "558a6247-bd4c-4205-ad39-ff07ddc8e67d",
"metadata": {},
"outputs": [],
"source": [
"cells = contents[[\"file\", \"cells\"]].explode(\"cells\").unnest(\"cells\")\n",
"cells = cells.with_columns(source=cells[\"source\"].map_elements(\"\".join, polars.String))\n",
"outputs = cells[[\"file\", \"id\", \"outputs\"]].explode(\"outputs\").unnest(\"outputs\")\n",
"displays = outputs[[\"file\", \"id\", \"data\"]].drop_nulls().unnest(\"data\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "bdc6206b-1e66-4cc2-a2b3-3fb2842412a1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"100%\"\n",
" height=\"600\"\n",
" src=\"test.html\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" \n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x781ba2539190>"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test = pathlib.Path(\"test.html\")\n",
"test.write_text(contents[\"html\"][0].body.prettify())\n",
"IFrame(\"test.html\", width=\"100%\", height=600)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c979859-e516-40e4-9247-32749879efb5",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "33e17f3f-a1fa-4963-aa62-d8902ed596ca",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "599eafd7-8410-4a7d-84d7-7797efedf724",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6ee2397-0d24-48fd-82c1-e12a7b867a8d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f3c50ca-6ab2-476c-83d3-8a554b8c62d6",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "802e5732-024f-4ec1-a467-8ed38ec02738",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "57e8e84d-6a69-4f46-b4b4-69a3b0eb1c2a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "0950074d-be87-4b44-ab15-54ea19c0e521",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:p311] *",
"language": "python",
"name": "conda-env-p311-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment