vgel · November 8, 2023 20:34
diff --git a/chunkpaper.py b/chunkpaper.py
 # pip install beautifulsoup4 requests markdownify mistletoe

 # command line usage: python chunkpaper.py 'https://ar5iv.org/abs/1910.06709'
 # will dump the HTML, the Markdown, and finally the chunks (pretty-printed as JSON-able dicts, not strict JSON)
 # note that (as of Nov 8 '23) ar5iv only has arxiv papers converted up to the end of October,
 # but if you need something more recent you could probably do the LaTeX→HTML conversion yourself

 # library usage: call `chunk` with some ar5iv HTML. will probably choke on anything else

 import dataclasses
 import re
 from typing import Any

 import bs4
 import requests
 from markdownify import markdownify
 import mistletoe
 import mistletoe.block_token
 import mistletoe.markdown_renderer

 MIN_BLOCK_SIZE = 128
 MAX_BLOCK_SIZE = 2048


 @dataclasses.dataclass
 class Chunk:
    """A piece of Markdown content together with the headings it sits under."""

    # Headings enclosing the content, outermost first.
    headings: list[str]
    # Markdown text of the chunk.
    content: str


 @dataclasses.dataclass
 class ChunkingResult:
    """Bundle of the artifacts produced by one chunking run."""

    # HTML the chunks were derived from.
    html: str
    # Markdown generated from that HTML.
    markdown: str
    # Chunks in document order.
    chunks: list[Chunk]

    def chunks_jsonable(self) -> list[dict[str, Any]]:
        """Return every chunk converted to a plain dict via ``dataclasses.asdict``."""
        return list(map(dataclasses.asdict, self.chunks))


 def chunk(html: str) -> ChunkingResult:
    """Split an ar5iv-rendered paper into heading-scoped Markdown chunks.

    The ``<article>`` element is simplified in place (math replaced by its
    ``alttext`` wrapped in ``$...$``, citations and headings flattened to plain
    text, ``data:`` links dropped, link ``title`` attributes removed), converted
    to Markdown with markdownify and re-parsed with mistletoe. Every non-heading
    top-level block becomes a chunk tagged with the headings above it. A block
    shorter than ``MIN_BLOCK_SIZE`` is appended to the previous chunk when both
    sit under the same headings and the merged text stays below
    ``MAX_BLOCK_SIZE``.

    Args:
        html: Full HTML of the page; must contain an ``<article>`` element.

    Returns:
        A ``ChunkingResult`` with the prettified (already simplified) HTML, the
        intermediate Markdown, and the chunks in document order.

    Raises:
        ValueError: If ``html`` contains no ``<article>`` element.
    """
    soup = bs4.BeautifulSoup(html, features="html.parser")
    article = soup.select_one("article")
    if article is None:
        raise ValueError("missing article")

    for math_tag in article.select("math"):
        # Not every <math> element is guaranteed to carry alttext; fall back to
        # its text content instead of aborting the whole paper on a KeyError.
        tex = math_tag.attrs.get("alttext")
        if tex is None:
            tex = math_tag.get_text()
        math_tag.replace_with("$" + tex.strip() + "$")

    for cite in article.select("cite"):
        cite.replace_with(cite.text.strip())

    # Collapse any markup inside headings so each heading is a single text node.
    for header in article.select("h1,h2,h3,h4,h5,h6"):
        text = header.text.strip()
        header.clear()
        header.append(text)

    for a in article.select("a"):
        if a.attrs.get("href", "").startswith("data:"):
            a.decompose()
        elif "title" in a.attrs:
            del a.attrs["title"]

    markdown = markdownify(str(article))
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)

    # NOTE(review): statement order kept exactly as in the original; the
    # renderer is presumably created before parsing on purpose - confirm.
    mistletoe.block_token.reset_tokens()
    renderer = mistletoe.markdown_renderer.MarkdownRenderer()
    parsed = mistletoe.Document(markdown)
    current_headings: list[str] = []
    chunks: list[Chunk] = []
    for block in parsed.children:
        rendered = renderer.render(block).strip()
        if not rendered:
            continue

        if isinstance(block, mistletoe.block_token.SetextHeading):
            current_headings = current_headings[: block.level - 1]
            # Drop only the trailing underline; a blanket replace() would also
            # eat the same character run if it occurred inside the title.
            title = rendered.removesuffix(block.underline).strip()
            current_headings.append(title)
        elif isinstance(block, mistletoe.block_token.Heading):
            current_headings = current_headings[: block.level - 1]
            # Take the title from the rendered "<#...> text" line. Reading only
            # children[0].content truncated titles that parse into several span
            # tokens (e.g. escaped "_" or "*") and raised IndexError on an
            # empty heading.
            title = rendered[block.level :]
            closing = getattr(block, "closing_sequence", None)
            if closing:
                title = title.rstrip().removesuffix(closing)
            current_headings.append(title.strip())
        else:
            should_merge = (
                len(rendered) < MIN_BLOCK_SIZE
                and len(chunks) > 0
                and chunks[-1].headings == current_headings
                and len(chunks[-1].content) + 2 + len(rendered) < MAX_BLOCK_SIZE
            )
            if should_merge:
                # The "+ 2" above accounts for this blank-line separator.
                chunks[-1] = Chunk(
                    headings=list(current_headings),
                    content=chunks[-1].content + "\n\n" + rendered,
                )
            else:
                chunks.append(
                    Chunk(
                        headings=list(current_headings),
                        content=rendered,
                    )
                )

    return ChunkingResult(
        html=soup.prettify(),
        markdown=markdown,
        chunks=chunks,
    )


 if __name__ == "__main__":
    # Command-line entry point: fetch the page at the URL given as the first
    # argument, then print the simplified HTML, the Markdown and the chunks.
    import pprint
    import sys

    if len(sys.argv) < 2:
        sys.exit(f"usage: python {sys.argv[0]} <ar5iv url>")

    # requests has no default timeout; bound the wait so a stalled server
    # cannot hang the script forever.
    r = requests.get(sys.argv[1], timeout=30)
    r.raise_for_status()
    result = chunk(r.text)

    separator = "\n" * 5 + "-" * 40 + "\n" * 5
    print(result.html)
    print(separator)
    print(result.markdown)
    print(separator)
    pprint.pprint(result.chunks_jsonable())
	# pip install beautifulsoup4 requests markdownify mistletoe

	# command line usage: python chunkpaper.py 'https://ar5iv.org/abs/1910.06709'
	# will dump the HTML, the Markdown, and finally the chunks (pretty-printed as JSON-able dicts, not strict JSON)
	# note that (as of Nov 8 '23) ar5iv only has arxiv papers converted up to the end of October,
	# but if you need something more recent you could probably do the LaTeX→HTML conversion yourself

	# library usage: call `chunk` with some ar5iv HTML. will probably choke on anything else

	import dataclasses
	import re
	from typing import Any

	import bs4
	import requests
	from markdownify import markdownify
	import mistletoe
	import mistletoe.block_token
	import mistletoe.markdown_renderer

	MIN_BLOCK_SIZE = 128
	MAX_BLOCK_SIZE = 2048


	@dataclasses.dataclass
	class Chunk:
	    """A piece of Markdown content together with the headings it sits under."""

	    # Headings enclosing the content, outermost first.
	    headings: list[str]
	    # Markdown text of the chunk.
	    content: str


	@dataclasses.dataclass
	class ChunkingResult:
	    """Bundle of the artifacts produced by one chunking run."""

	    # HTML the chunks were derived from.
	    html: str
	    # Markdown generated from that HTML.
	    markdown: str
	    # Chunks in document order.
	    chunks: list[Chunk]

	    def chunks_jsonable(self) -> list[dict[str, Any]]:
	        """Return every chunk converted to a plain dict via ``dataclasses.asdict``."""
	        return [dataclasses.asdict(c) for c in self.chunks]


	def chunk(html: str) -> ChunkingResult:
	    """Split an ar5iv-rendered paper into heading-scoped Markdown chunks.

	    The ``<article>`` element is simplified in place (math replaced by its
	    ``alttext`` wrapped in ``$...$``, citations and headings flattened to plain
	    text, ``data:`` links dropped, link ``title`` attributes removed), converted
	    to Markdown with markdownify and re-parsed with mistletoe. Every non-heading
	    top-level block becomes a chunk tagged with the headings above it. A block
	    shorter than ``MIN_BLOCK_SIZE`` is appended to the previous chunk when both
	    sit under the same headings and the merged text stays below
	    ``MAX_BLOCK_SIZE``.

	    Args:
	        html: Full HTML of the page; must contain an ``<article>`` element.

	    Returns:
	        A ``ChunkingResult`` with the prettified (already simplified) HTML, the
	        intermediate Markdown, and the chunks in document order.

	    Raises:
	        ValueError: If ``html`` contains no ``<article>`` element.
	    """
	    soup = bs4.BeautifulSoup(html, features="html.parser")
	    article = soup.select_one("article")
	    if article is None:
	        raise ValueError("missing article")

	    for math_tag in article.select("math"):
	        # Not every <math> element is guaranteed to carry alttext; fall back to
	        # its text content instead of aborting the whole paper on a KeyError.
	        tex = math_tag.attrs.get("alttext")
	        if tex is None:
	            tex = math_tag.get_text()
	        math_tag.replace_with("$" + tex.strip() + "$")

	    for cite in article.select("cite"):
	        cite.replace_with(cite.text.strip())

	    # Collapse any markup inside headings so each heading is a single text node.
	    for header in article.select("h1,h2,h3,h4,h5,h6"):
	        text = header.text.strip()
	        header.clear()
	        header.append(text)

	    for a in article.select("a"):
	        if a.attrs.get("href", "").startswith("data:"):
	            a.decompose()
	        elif "title" in a.attrs:
	            del a.attrs["title"]

	    markdown = markdownify(str(article))
	    markdown = re.sub(r"\n{3,}", "\n\n", markdown)

	    # NOTE(review): statement order kept exactly as in the original; the
	    # renderer is presumably created before parsing on purpose - confirm.
	    mistletoe.block_token.reset_tokens()
	    renderer = mistletoe.markdown_renderer.MarkdownRenderer()
	    parsed = mistletoe.Document(markdown)
	    current_headings: list[str] = []
	    chunks: list[Chunk] = []
	    for block in parsed.children:
	        rendered = renderer.render(block).strip()
	        if not rendered:
	            continue

	        if isinstance(block, mistletoe.block_token.SetextHeading):
	            current_headings = current_headings[: block.level - 1]
	            # Drop only the trailing underline; a blanket replace() would also
	            # eat the same character run if it occurred inside the title.
	            title = rendered.removesuffix(block.underline).strip()
	            current_headings.append(title)
	        elif isinstance(block, mistletoe.block_token.Heading):
	            current_headings = current_headings[: block.level - 1]
	            # Take the title from the rendered "<#...> text" line. Reading only
	            # children[0].content truncated titles that parse into several span
	            # tokens (e.g. escaped "_" or "*") and raised IndexError on an
	            # empty heading.
	            title = rendered[block.level :]
	            closing = getattr(block, "closing_sequence", None)
	            if closing:
	                title = title.rstrip().removesuffix(closing)
	            current_headings.append(title.strip())
	        else:
	            should_merge = (
	                len(rendered) < MIN_BLOCK_SIZE
	                and len(chunks) > 0
	                and chunks[-1].headings == current_headings
	                and len(chunks[-1].content) + 2 + len(rendered) < MAX_BLOCK_SIZE
	            )
	            if should_merge:
	                # The "+ 2" above accounts for this blank-line separator.
	                chunks[-1] = Chunk(
	                    headings=list(current_headings),
	                    content=chunks[-1].content + "\n\n" + rendered,
	                )
	            else:
	                chunks.append(
	                    Chunk(
	                        headings=list(current_headings),
	                        content=rendered,
	                    )
	                )

	    return ChunkingResult(
	        html=soup.prettify(),
	        markdown=markdown,
	        chunks=chunks,
	    )


	if __name__ == "__main__":
	    # Command-line entry point: fetch the page at the URL given as the first
	    # argument, then print the simplified HTML, the Markdown and the chunks.
	    import pprint
	    import sys

	    if len(sys.argv) < 2:
	        sys.exit(f"usage: python {sys.argv[0]} <ar5iv url>")

	    # requests has no default timeout; bound the wait so a stalled server
	    # cannot hang the script forever.
	    r = requests.get(sys.argv[1], timeout=30)
	    r.raise_for_status()
	    result = chunk(r.text)

	    separator = "\n" * 5 + "-" * 40 + "\n" * 5
	    print(result.html)
	    print(separator)
	    print(result.markdown)
	    print(separator)
	    pprint.pprint(result.chunks_jsonable())