Created
January 25, 2025 07:16
-
-
Save tonyfast/4b0acd39945adbc85744edd1cb449f9d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "bc87bb4d-16f9-4ed6-875b-5b786bce9402", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import polars\n", | |
"from nbformat import v4\n", | |
"\n", | |
"input = list(map(anyio.Path, glob.glob(str(pathlib.Path(\"~/Documents/syllabus/docs/*.ipynb\").expanduser()))))\n", | |
"files = polars.Series(\"path\", input).to_frame()\n", | |
"files = files.with_columns(\n", | |
" file=files[\"path\"].map_elements(lambda x: str(x._path), polars.String)\n", | |
")\n", | |
"\n", | |
"def enumerate_iterable(series, name=\"id\", start=0):\n", | |
" target_type = series.dtype.base_type()(polars.Struct(series.dtype.inner.fields + [polars.Field(name, polars.Int64)]))\n", | |
" return series.map_elements(lambda x: [{**body, name: i} for (i, body) in enumerate(x, start)], target_type)\n", | |
"\n", | |
"# https://github.com/jupyter/nbconvert/blob/5f508ebad9471876f53a59c737bd5f47b2b4c163/share/templates/base/display_priority.j2\n", | |
"display_priority = \"\"\"text/html text/markdown image/svg+xml image/png image/jpeg text/plain application/pdf\n", | |
"text/latex text/vnd.mermaid application/javascript application/vnd.jupyter.widget-view+json\"\"\".strip().split()\n", | |
"\n", | |
"async def read_text(path):\n", | |
" if isinstance(path, (anyio.Path, pathlib.Path)):\n", | |
" if path.suffix == \".ipynb\":\n", | |
" return await path.read_text()\n", | |
" elif path.suffix == \".md\":\n", | |
" return json.dumps(\n", | |
" v4.new_notebook(cells=[v4.new_markdown_cell((await path.read_text()).splitlines(True))])\n", | |
" )\n", | |
" elif path.suffix == \".py\":\n", | |
" return json.dumps(v4.new_notebook(cells=[v4.new_code_cell((await path.read_text()).splitlines(True))]))\n", | |
" return json.dumps(v4.new_notebook())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "d263521e-7f72-447e-bb12-e7cfc8ecaa87", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import bs4\n", | |
"Soup = partial(bs4.BeautifulSoup, features=\"lxml\")\n", | |
"TEMPLATE = Soup(Path(\"~/Documents/refnb/packages/refnb-core/refnb-core/index.html\").expanduser().read_text())\n", | |
"CELLS = TEMPLATE.select_one(\"template.cells\").select_one(\"tbody\")\n", | |
"CELL = TEMPLATE.select_one(\"template.cell\").select_one(\"tr\")\n", | |
"OUTPUTS = TEMPLATE.select_one(\"template.outputs\").select_one(\"details\")\n", | |
"OUTPUT = TEMPLATE.select_one(\"template.output\").select_one(\"tr\")\n", | |
"assert all((CELLS, CELL, OUTPUTS, OUTPUT)), \"bad selector\"\n", | |
"\n", | |
"def clone(el):\n", | |
" from bs4 import Tag, NavigableString\n", | |
" if isinstance(el, NavigableString): return type(el)(el)\n", | |
" copy = Tag(None, el.builder, el.name, el.namespace, el.nsprefix)\n", | |
" copy.attrs = dict(el.attrs)\n", | |
" for k, v in copy.attrs.items():\n", | |
" if isinstance(v, list):\n", | |
" copy[k] = [*v]\n", | |
" for attr in (\"can_be_empty_element\", \"hidden\"): setattr(copy, attr, getattr(el, attr))\n", | |
" for child in el.contents: copy.append(clone(child))\n", | |
" return copy" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "3e2d9459-5e40-4acf-8c6b-d04ef8699cef", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def display_dispatch(t, v, metadata):\n", | |
" if t == \"text/plain\":\n", | |
" yield \"\".join(v)\n", | |
" elif t == \"text/markdown\":\n", | |
" yield from Soup(get_markdown().render(\"\".join(v))).body.children\n", | |
" elif t == \"text/html\":\n", | |
" yield from Soup(\"\".join(v)).body.children\n", | |
" elif t.startswith(\"text\"):\n", | |
" # highlight form mimetype\n", | |
" yield highlight(\"\".join(v))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "eac11e7e-6ec9-4097-8f8f-ed3f93c81d9f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"IDREFS = (\"aria-labelledby aria-describedby aria-owns aria-controls for form id\".split())\n", | |
"def populate(nb):\n", | |
" tpl = clone(TEMPLATE)\n", | |
" table = tpl.select_one(\"main.notebook table.cells\")\n", | |
" footer = table.select_one(\"tfoot\")\n", | |
" cells = clone(CELLS)\n", | |
"\n", | |
" for cell in nb[\"cells\"]:\n", | |
" id = cell[\"id\"]\n", | |
" if cell[\"metadata\"].get(\"name\"):\n", | |
" id = cell[\"metadata\"][\"name\"]\n", | |
" row = clone(CELL)\n", | |
" row[\"class\"].append(cell[\"cell_type\"])\n", | |
" # link back to the document\n", | |
" row.select_one(\"th.doc a\").append(str(nb[\"file\"]))\n", | |
" row.select_one(\"th.cell a\").append(str(cell[\"cell\"]))\n", | |
" row.select_one(\"th.id input\").attrs[\"value\"] = id\n", | |
" row.select_one(\"td.execution_count output\").append(str(cell[\"execution_count\"] or \"\") )\n", | |
" row.select_one(F\"td.cell_type option[value={cell['cell_type']}]\").attrs[\"selected\"] = True\n", | |
" source = \"\".join(cell[\"source\"])\n", | |
" if cell[\"cell_type\"] == \"markdown\":\n", | |
" cell[\"outputs\"] = [dict(data={\"text/markdown\": source}, output_type=\"display_data\")]\n", | |
" row.select_one(\"td.source textarea\").append(source)\n", | |
" row.select_one(\"td.source section.highlight\").append(source)\n", | |
" row.select_one(\"td.form form\")\n", | |
" row.select_one(\"td.metadata\")\n", | |
" if cell.get(\"outputs\"):\n", | |
" details = clone(OUTPUTS)\n", | |
" outputs = details.select_one(\"table\")\n", | |
" for output in cell[\"outputs\"]:\n", | |
" if output[\"output_type\"] in {\"display_data\", \"execute_result\"}:\n", | |
" body = TEMPLATE.new_tag(\"tbody\")\n", | |
" body.attrs.setdefault(\"class\", []).append(output['output_type'])\n", | |
" for t in itertools.chain(\n", | |
" filter(output[\"data\"].__contains__, display_priority),\n", | |
" filter(lambda x: x not in display_priority, output[\"data\"])\n", | |
" ):\n", | |
" v = output[\"data\"][t] or \"<body></body>\"\n", | |
" # we can include ALL the bundles OR the preferred one\n", | |
"\n", | |
" entry = clone(OUTPUT)\n", | |
" entry.select_one(\"td.execution_count output\").append(str(cell[\"execution_count\"] or \"\") )\n", | |
" entry.select_one(\"td.output_type label\").append(t)\n", | |
" if t not in entry[\"class\"]: \n", | |
" entry[\"class\"].append(t)\n", | |
" \n", | |
" body.append(entry)\n", | |
" try:\n", | |
" entry.select_one(\"td.data\").extend(display_dispatch(t, v, output.get(\"metadata\", {})))\n", | |
" except Exception as e: \n", | |
" raise e\n", | |
" entry.select_one(\"td.metadata\")\n", | |
" body.append(entry)\n", | |
" outputs.append(body)\n", | |
" elif output[\"output_type\"] == \"stream\":\n", | |
" # stdout/stderr\n", | |
" entry = clone(OUTPUT)\n", | |
" entry[\"class\"] += F\" {output['output_type']}\"\n", | |
" entry.select_one(\"td.name\").append(output[\"name\"])\n", | |
" entry.select_one(\"td.text samp\").append(\"\".join(output[\"text\"]))\n", | |
" outputs.append(entry)\n", | |
" elif output[\"output_type\"] == \"error\":\n", | |
" entry = clone(OUTPUT)\n", | |
" entry[\"class\"] += F\" {output['output_type']}\"\n", | |
" entry.select_one(\"td.ename\").append(output[\"ename\"])\n", | |
" entry.select_one(\"td.evalue samp\").append(\"\".join(output[\"evalue\"]))\n", | |
" entry.select_one(\"td.traceback samp\").append(\"\".join(output[\"traceback\"]))\n", | |
" entry.select_one(\"td.execution_count output\").append(str(cell[\"execution_count\"] or \"\") )\n", | |
" outputs.append(entry)\n", | |
" row.select_one(\"td.outputs\").append(outputs)\n", | |
" \n", | |
"\n", | |
" if \"slide_type\" in cell[\"metadata\"]:\n", | |
" row[\"class\"].append(cell[\"metadata\"][\"slide_type\"])\n", | |
" if \"execution\" in cell[\"metadata\"]:\n", | |
" row.select_one(\"td.started_at time\").append(cell[\"metadata\"][\"execution\"][\"iopub.execute_input\"])\n", | |
" row.select_one(\"td.completed_at time\").append(cell[\"metadata\"][\"execution\"][\"iopub.execute_reply\"])\n", | |
" # do the math for the time\n", | |
" row.select_one(\"td.elapsed output time\")\n", | |
" if cell[\"metadata\"].get(\"collapsed\"):\n", | |
" row.select_one(\"td.outputs\")[\"class\"].append(\"collapsed\")\n", | |
" if cell[\"metadata\"].get(\"scrolled\"):\n", | |
" row.select_one(\"td.outputs\")[\"class\"].append(\"scrolled\")\n", | |
" if cell[\"metadata\"].get(\"jupyter\"):\n", | |
" if cell[\"metadata\"][\"jupyter\"].get(\"source_hidden\"):\n", | |
" row.select_one(\"td.source\")[\"hidden\"] = \"\"\n", | |
" if cell[\"metadata\"][\"jupyter\"].get(\"outputs_hidden\"):\n", | |
" row.select_one(\"td.outputs\")[\"hidden\"] = \"\"\n", | |
" row[\"class\"].extend(map(slugify.slugify, cell[\"metadata\"].get(\"tags\", \"\")))\n", | |
" \n", | |
" set_ids(row, id)\n", | |
" cells.append(row)\n", | |
" footer.insert_before(cells)\n", | |
" inject_toc(tpl)\n", | |
" return tpl" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "25ce0453-de81-4a43-a7d4-4407108c9f1f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import slugify" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "e8366ea6-bacd-4bc6-b698-62b5a0b04058", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def inject_toc(document):\n", | |
" toc = table = document.select_one(\"table.toc.headings\")\n", | |
" ROW = table.select_one(\"template tr\")\n", | |
" tbody = TEMPLATE.new_tag(\"tbody\")\n", | |
" for h in document.select(\"table.cells h1,h2,h3,h4,h5,h6\"):\n", | |
" row = clone(ROW) \n", | |
" a = row.select_one(\"td.heading>a\")\n", | |
" heading = h.get_text()\n", | |
" if \"id\" not in h: h[\"id\"] = slugify.slugify(heading)\n", | |
" a.append(heading)\n", | |
" a[\"href\"] = \"#\" + h[\"id\"]\n", | |
" row.select_one(\"th.level\").append(h.name[1])\n", | |
" row.select_one(\"td.description>p\")\n", | |
" tbody.append(row)\n", | |
" table.append(tbody)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "b95ae161-508d-402f-bb5c-dd6b3a75de2d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"idref_selection = \",\".join(map(\"[{}]\".format, IDREFS))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "2a5eb1ec-7785-421b-bd7a-101b07eea936", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def set_ids(selection, id=\"\"):\n", | |
" for s in [selection] + selection.select(\",\".join(map(\"[{}]\".format, IDREFS))):\n", | |
" for idref in IDREFS:\n", | |
" if idref in s.attrs:\n", | |
" value = s.attrs[idref]\n", | |
" if value == \":\":\n", | |
" s[idref] = id\n", | |
" elif isinstance(value, str):\n", | |
" s[idref] = \" \".join((F\"{id}-{x[1:]}\" if x.startswith(\":\") else x) for x in value.split())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"id": "6ffa0606-3682-4fb2-95a5-d54a065d0c4b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"@functools.lru_cache(1)\n", | |
"def get_markdown():\n", | |
" from markdown_it import MarkdownIt\n", | |
" return MarkdownIt()\n", | |
"\n", | |
"def highlight(source, lang=\"python\", attrs=None):\n", | |
" import pygments\n", | |
" try:\n", | |
" return str(pygments.highlight(\n", | |
" source,\n", | |
" pygments.lexers.get_lexer_by_name(lang),\n", | |
" pygments.formatters.get_formatter_by_name(\"html5\")\n", | |
" )).pre\n", | |
" except:\n", | |
" return Soup(f\"\"\"<pre><code class=\"{lang}\">{html.escape(source)}</code></pre>\"\"\").pre" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"id": "74ffa475-94ca-4c5c-a4b1-7835fb264786", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"contents = await asyncio.gather(*map(read_text, files[\"path\"]))\n", | |
"contents = polars.Series(\n", | |
" \"contents\", contents, strict=False\n", | |
").str.json_decode().struct.unnest().with_columns(\n", | |
" file=files[\"file\"]\n", | |
" # , path=files[\"path\"] # causes a panic cause its a python object\n", | |
")\n", | |
"contents = contents.with_columns(cells=enumerate_iterable(contents[\"cells\"], \"cell\", 1))\n", | |
"CONTENTS_COLUMNS = [*contents.columns]\n", | |
"contents = contents.with_columns(\n", | |
" contents.map_rows(lambda x: (populate(dict(zip(CONTENTS_COLUMNS, x))),)).rename({\"column_0\": \"html\"})\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"id": "558a6247-bd4c-4205-ad39-ff07ddc8e67d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"cells = contents[[\"file\", \"cells\"]].explode(\"cells\").unnest(\"cells\")\n", | |
"cells = cells.with_columns(source=cells[\"source\"].map_elements(\"\".join, polars.String))\n", | |
"outputs = cells[[\"file\", \"id\", \"outputs\"]].explode(\"outputs\").unnest(\"outputs\")\n", | |
"displays = outputs[[\"file\", \"id\", \"data\"]].drop_nulls().unnest(\"data\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"id": "bdc6206b-1e66-4cc2-a2b3-3fb2842412a1", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
" <iframe\n", | |
" width=\"100%\"\n", | |
" height=\"600\"\n", | |
" src=\"test.html\"\n", | |
" frameborder=\"0\"\n", | |
" allowfullscreen\n", | |
" \n", | |
" ></iframe>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.lib.display.IFrame at 0x781ba2539190>" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test = pathlib.Path(\"test.html\")\n", | |
"test.write_text(contents[\"html\"][0].body.prettify())\n", | |
"IFrame(\"test.html\", width=\"100%\", height=600)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "4c979859-e516-40e4-9247-32749879efb5", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "33e17f3f-a1fa-4963-aa62-d8902ed596ca", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "599eafd7-8410-4a7d-84d7-7797efedf724", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "e6ee2397-0d24-48fd-82c1-e12a7b867a8d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "3f3c50ca-6ab2-476c-83d3-8a554b8c62d6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "802e5732-024f-4ec1-a467-8ed38ec02738", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "57e8e84d-6a69-4f46-b4b4-69a3b0eb1c2a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "0950074d-be87-4b44-ab15-54ea19c0e521", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python [conda env:p311] *", | |
"language": "python", | |
"name": "conda-env-p311-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment