Created
November 8, 2023 20:34
-
-
Save vgel/d2fa6c8261fcc6098f6ecc314b0fe8ba to your computer and use it in GitHub Desktop.
ar5iv
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install beautifulsoup4 requests markdownify mistletoe
# command line usage: python chunkpaper.py 'https://ar5iv.org/abs/1910.06709'
# will dump the HTML, the Markdown, and finally the chunks (pretty-printed as a list of dicts)
# note that (as of Nov 8 '23) ar5iv only has arXiv papers converted up to the end of October,
# but if you need something more recent you could probably do the LaTeX→HTML conversion yourself
# library usage: call `chunk` with some ar5iv HTML. It will probably choke on anything else
import dataclasses | |
import re | |
from typing import Any | |
import bs4 | |
import requests | |
from markdownify import markdownify | |
import mistletoe | |
import mistletoe.block_token | |
import mistletoe.markdown_renderer | |
# Size thresholds, measured with len() on rendered Markdown strings.
# A rendered block shorter than MIN_BLOCK_SIZE is a candidate for being
# appended to the previous chunk instead of starting a new one.
MIN_BLOCK_SIZE: int = 128
# A merge only happens while the combined chunk stays below MAX_BLOCK_SIZE.
MAX_BLOCK_SIZE: int = 2048
@dataclasses.dataclass
class Chunk:
    """One piece of Markdown content together with the headings it sits under."""

    # Heading texts that were in effect when the content was collected,
    # ordered from the outermost heading to the innermost.
    headings: list[str]
    # Rendered Markdown of one block, or of several blocks joined by blank lines.
    content: str
@dataclasses.dataclass
class ChunkingResult:
    """Everything `chunk` produces for one document."""

    # HTML of the parsed document.
    html: str
    # Markdown text the chunks were cut from.
    markdown: str
    # The chunks, in document order.
    chunks: list[Chunk]

    def chunks_jsonable(self) -> list[dict[str, Any]]:
        """Return the chunks as plain dicts (one per chunk, same order)."""
        return list(map(dataclasses.asdict, self.chunks))
def _plain_text(token: Any) -> str:
    """Concatenate the text of a mistletoe token and all of its descendants."""
    children = getattr(token, "children", None)
    if children is None:
        # No child tokens: use the token's own text, if it carries any.
        content = getattr(token, "content", "")
        return content if isinstance(content, str) else ""
    return "".join(_plain_text(child) for child in children)


def _simplify_article(article: Any) -> None:
    """Rewrite the ``<article>`` element in place before Markdown conversion.

    The steps run in this order because the heading step reads text that the
    earlier steps have already replaced.
    """
    for math in article.select("math"):
        # Replace the element with its ``alttext`` wrapped in ``$...$``.
        # Fall back to the element's own text when the attribute is missing
        # rather than failing with KeyError.
        source = math.attrs.get("alttext")
        if source is None:
            source = math.get_text()
        math.replace_with("$" + source.strip() + "$")
    for cite in article.select("cite"):
        # Keep only the citation's text, dropping any markup inside it.
        cite.replace_with(cite.text.strip())
    for header in article.select("h1,h2,h3,h4,h5,h6"):
        # Flatten each heading to a single plain-text node.
        text = header.text.strip()
        header.clear()
        header.append(text)
    for a in article.select("a"):
        if a.attrs.get("href", "").startswith("data:"):
            # Links pointing at ``data:`` URIs are removed entirely.
            a.decompose()
        elif "title" in a.attrs:
            del a.attrs["title"]


def chunk(html: str) -> ChunkingResult:
    """Split the ``<article>`` of an HTML page into heading-labelled chunks.

    The article is simplified, converted to Markdown, and parsed into
    top-level blocks. Heading blocks update the current heading path; every
    other non-empty block becomes a chunk, or is appended to the previous
    chunk when it is shorter than ``MIN_BLOCK_SIZE``, shares the same heading
    path, and the merged content stays below ``MAX_BLOCK_SIZE``.

    Args:
        html: Full HTML of the page.

    Returns:
        A ``ChunkingResult`` holding the prettified (and simplified) HTML,
        the Markdown text, and the list of chunks.

    Raises:
        ValueError: If the page contains no ``<article>`` element.
    """
    soup = bs4.BeautifulSoup(html, features="html.parser")
    article = soup.select_one("article")
    if article is None:
        raise ValueError("missing article")
    _simplify_article(article)

    markdown = markdownify(str(article))
    # Collapse runs of blank lines down to a single blank line.
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)

    current_headings: list[str] = []
    chunks: list[Chunk] = []

    # Start from mistletoe's default token set in case earlier code left the
    # global registry modified.
    mistletoe.block_token.reset_tokens()
    # Use the renderer as a context manager so that the token registry is
    # restored when we are done; the document must be parsed inside the block.
    with mistletoe.markdown_renderer.MarkdownRenderer() as renderer:
        parsed = mistletoe.Document(markdown)
        for block in parsed.children:
            rendered = renderer.render(block).strip()
            if not rendered:
                continue

            if isinstance(block, mistletoe.block_token.SetextHeading):
                # Drop headings at this level or deeper, then record this one.
                current_headings = current_headings[: block.level - 1]
                current_headings.append(
                    rendered.replace(block.underline, "").strip()
                )
                continue

            if isinstance(block, mistletoe.block_token.Heading):
                current_headings = current_headings[: block.level - 1]
                # Join the text of every span in the heading. Reading only the
                # first child truncates headings at the first escaped
                # character and fails outright on an empty heading.
                current_headings.append(_plain_text(block))
                continue

            previous = chunks[-1] if chunks else None
            should_merge = (
                previous is not None
                and len(rendered) < MIN_BLOCK_SIZE
                and previous.headings == current_headings
                # The 2 accounts for the blank-line separator added on merge.
                and len(previous.content) + 2 + len(rendered) < MAX_BLOCK_SIZE
            )
            if should_merge:
                chunks[-1] = Chunk(
                    headings=list(current_headings),
                    content=previous.content + "\n\n" + rendered,
                )
            else:
                chunks.append(
                    Chunk(headings=list(current_headings), content=rendered)
                )

    return ChunkingResult(
        html=soup.prettify(),
        markdown=markdown,
        chunks=chunks,
    )
if __name__ == "__main__":
    # Command-line entry point: fetch the URL given as the first argument and
    # print the HTML, the Markdown, and the chunks, separated by rulers.
    import pprint
    import sys

    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit(f"usage: python {sys.argv[0]} <ar5iv-url>")

    # Without a timeout, requests can wait on an unresponsive server forever.
    response = requests.get(sys.argv[1], timeout=30)
    response.raise_for_status()
    result = chunk(response.text)

    separator = "\n" * 5 + "-" * 40 + "\n" * 5
    print(result.html)
    print(separator)
    print(result.markdown)
    print(separator)
    pprint.pprint(result.chunks_jsonable())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment