"""Add URLs to blogroll database. | |
scripts/blogroll.txt is a flat file containing URLs and titles of blog posts | |
I submitted via https://github.com/j2kun/extension-trigger-gh-workflow | |
Each entry contains two lines: the first line is a URL and the second is a | |
title extracted from the URL. Entries are separated by a blank line. | |
Example: | |
https://arxiv.org/abs/1801.05507 | |
Gazelle: A Low Latency Framework for Secure Neural Network Inference | |
This script takes in a URL, fetches the title of the article, appends it to the | |
end of the flat file, and then updates content/blogroll/_index.md with the | |
latest N entries. | |
""" | |
import urllib.error
import urllib.parse
import urllib.request

import fire
import more_itertools
from bs4 import BeautifulSoup

from scripts import utils as utils
def is_already_in_database(path: str, url: str):
    try:
        with open(path, "r") as f:
            db_lines = f.readlines()
        for line in db_lines:
            if url in line:
                print("URL already exists in database, skipping.")
                return True
    except FileNotFoundError:
        pass
    return False
def find_line_number_containing(lines, substring, nth=1):
    """Return the index of the nth line containing substring, or None."""
    match_count = 0
    for line_num, line in enumerate(lines):
        if substring in line:
            match_count += 1
            if match_count == nth:
                return line_num
    return None


def find_next_line_number_containing(lines, substring, start_line):
    """Return the index of the first line at or after start_line containing substring."""
    for line_num, line in enumerate(lines[start_line:]):
        if substring in line:
            return start_line + line_num
    return None
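# Illustrative examples of the two helpers above (hypothetical inputs):
#
#   find_line_number_containing(["a", "b", "a"], "a", nth=2)              -> 2
#   find_next_line_number_containing(["a", "b", "a"], "a", start_line=1)  -> 2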
def find_article_title(text, url):
    content = BeautifulSoup(text, "html.parser")

    # Look for meta tags like
    # <meta property="og:title" content="...">
    metas = content.find_all("meta")
    for meta in metas:
        if meta.get("property") == "og:title":
            title = meta["content"]
            print(f"Choosing {title} from meta tags")
            return title

    # IACR has noscript nonsense with an h1 to ask you to turn on Javascript.
    # https://mathstodon.xyz/@j2kun/113297567947663128
    header_tag_to_use = "h3" if "eprint.iacr.org" in url else "h1"
    h1 = content.find_all(header_tag_to_use)
    title = h1[0].text

    if len(h1) > 1:
        print("Options for title:")
        for h in h1:
            print(h.text)

        # Prefer a header nested inside an <article> tag.
        for h in h1:
            for parent in h.parents:
                if parent.name == "article":
                    print(f"Choosing {h.text}")
                    return h.text

        # Otherwise prefer a header with a "title" class.
        for h in h1:
            if h.get("class") and "title" in h["class"]:
                print(f"Choosing {h.text}")
                return h.text

    # replace all line breaks and collapse whitespace
    title = title.replace("\r", "").strip()
    title = title.replace("\n", "").strip()
    title = " ".join(title.split())
    print(f"Choosing {title}")
    return title
def get_page_text(url):
    """Fetch the raw HTML of the page, sending browser-like headers."""
    req = urllib.request.Request(url)
    req.add_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0",
    )
    req.add_header(
        "Accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    )
    req.add_header("Accept-Language", "en-US,en;q=0.5")
    return urllib.request.urlopen(req).read()
def add_url(url: str = None, render_only: bool = False):
    git_root = utils.get_git_root()
    database_path = git_root / "scripts" / "blogroll.txt"
    blogroll_page_path = git_root / "content" / "blogroll" / "_index.md"
    sidebar_path = git_root / "layouts" / "partials" / "sidebar.html"

    if not render_only:
        if url is None:
            print("Please provide a URL to add to the blogroll or use --render_only.")
            return

        # strip trailing query params
        stripped_url = urllib.parse.urljoin(url, urllib.parse.urlparse(url).path)
        print(f"Fetching title for {url}")
        try:
            text = get_page_text(stripped_url)
            url = stripped_url
        except urllib.error.HTTPError as e:
            print(
                f"Failed to fetch stripped {stripped_url}: {e}\nFalling back to {url}"
            )
            text = get_page_text(url)

        title = find_article_title(text, stripped_url)
        print(f'Adding {url} ("{title}") to {database_path}')
        if not is_already_in_database(database_path, url):
            with open(database_path, "a") as f:
                f.write(f"{url}\n{title}\n\n")
print(f"Updating {blogroll_page_path}") | |
with open(blogroll_page_path, "r") as f: | |
blogroll_lines = f.readlines() | |
line_num = find_line_number_containing(blogroll_lines, "---", nth=3) | |
if not line_num: | |
raise ValueError("Could not find line number to insert blogroll entry") | |
# repopulate completely with the last ten entries of the database | |
with open(database_path, "r") as f: | |
entries = f.readlines() | |
entries = entries[:-1] if entries[-1].strip() == "" else entries | |
entries = list(more_itertools.split_at(entries, lambda x: x == "\n")) | |
entries = entries[-10:] | |
entries = entries[::-1] | |
entries = [(e[0].strip(), e[1].strip()) for e in entries] | |
# insert shortcode for blogroll | |
new_lines = blogroll_lines[: line_num + 1] | |
new_lines.append("\n") | |
for entry in entries: | |
url, title = entry | |
domain = url.split("/")[2] | |
new_lines.append( | |
'- {{< blogroll url="%s" title="%s" domain="%s" >}}\n' | |
% (url, title, domain) | |
) | |
with open(blogroll_page_path, "w") as f: | |
f.write("".join(new_lines)) | |
print(f"Updating {sidebar_path}") | |
with open(sidebar_path, "r") as f: | |
sidebar_lines = f.readlines() | |
start_line_num = find_line_number_containing(sidebar_lines, '<a href="/blogroll"') | |
if start_line_num is None: | |
raise ValueError("Could not find line number to insert blogroll for sidebar") | |
end_line_num = find_next_line_number_containing( | |
sidebar_lines, "</div>", start_line_num | |
) | |
if end_line_num is None: | |
raise ValueError( | |
"Could not find ending line number to insert blogroll for sidebar" | |
) | |
new_sidebar = sidebar_lines[: start_line_num + 1] | |
new_sidebar.append("<ul>\n") | |
for entry in entries[:3]: | |
url, title = entry | |
domain = url.split("/")[2] | |
new_sidebar.append( | |
' <li><a href="{url}">{title}</a> ({domain})</li>\n'.format( | |
url=url, title=title, domain=domain | |
), | |
) | |
new_sidebar.append("</ul>\n") | |
new_sidebar.extend(sidebar_lines[end_line_num:]) | |
with open(sidebar_path, "w") as f: | |
f.write("".join(new_sidebar)) | |
if __name__ == "__main__": | |
fire.Fire(add_url) |
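# Example invocations via python-fire, run from the repository root so that
# `from scripts import utils` resolves. The module name below is a guess;
# adjust it to match the actual file name in the repo:
#
#   python -m scripts.add_blogroll https://arxiv.org/abs/1801.05507
#   python -m scripts.add_blogroll --render_only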