"""Add URLs to blogroll database. | |
scripts/blogroll.txt is a flat file containing URLs and titles of blog posts | |
I submitted via https://github.com/j2kun/extension-trigger-gh-workflow | |
Each entry contains two lines: the first line is a URL and the second is a | |
title extracted from the URL. Entries are separated by a blank line. | |
Example: | |
https://arxiv.org/abs/1801.05507 | |
Gazelle: A Low Latency Framework for Secure Neural Network Inference | |
This script takes in a URL, fetches the title of the article, appends it to the | |
end of the flat file, and then updates content/blogroll/_index.md with the | |
latest N entries. | |
""" | |
import urllib.error
import urllib.parse
import urllib.request

import fire
import more_itertools
from bs4 import BeautifulSoup

from scripts import utils as utils
def is_already_in_database(path: str, url: str):
    try:
        with open(path, "r") as f:
            db_lines = f.readlines()
        for line in db_lines:
            if url in line:
                print("URL already exists in database, skipping.")
                return True
    except FileNotFoundError:
        pass
    return False
def find_line_number_containing(lines, substring, nth=1):
    """Return the index of the nth line containing substring, or None."""
    match_count = 0
    for line_num, line in enumerate(lines):
        if substring in line:
            match_count += 1
            if match_count == nth:
                return line_num
    return None


def find_next_line_number_containing(lines, substring, start_line):
    """Return the index of the first line at or after start_line containing substring."""
    for line_num, line in enumerate(lines[start_line:]):
        if substring in line:
            return start_line + line_num
    return None
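# Illustrative examples of the two helpers above (hypothetical inputs):
#
#   find_line_number_containing(["a", "b", "a"], "a", nth=2)              -> 2
#   find_next_line_number_containing(["a", "b", "a"], "a", start_line=1)  -> 2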
def find_article_title(text, url):
    content = BeautifulSoup(text, "html.parser")

    # Look for meta tags like
    # <meta property="og:title" content="...">
    metas = content.find_all("meta")
    for meta in metas:
        if meta.get("property") == "og:title":
            title = meta["content"]
            print(f"Choosing {title} from meta tags")
            return title

    # IACR has noscript nonsense with an h1 to ask you to turn on Javascript.
    # https://mathstodon.xyz/@j2kun/113297567947663128
    header_tag_to_use = "h3" if "eprint.iacr.org" in url else "h1"
    h1 = content.find_all(header_tag_to_use)
    title = h1[0].text

    if len(h1) > 1:
        print("Options for title:")
        for h in h1:
            print(h.text)

        # Prefer a header nested inside an <article> tag.
        for h in h1:
            for parent in h.parents:
                if parent.name == "article":
                    print(f"Choosing {h.text}")
                    return h.text

        # Otherwise prefer a header with a "title" class.
        for h in h1:
            if h.get("class") and "title" in h["class"]:
                print(f"Choosing {h.text}")
                return h.text

    # replace all line breaks and collapse whitespace
    title = title.replace("\r", "").strip()
    title = title.replace("\n", "").strip()
    title = " ".join(title.split())
    print(f"Choosing {title}")
    return title
def get_page_text(url):
    """Fetch the raw HTML of the page, sending browser-like headers."""
    req = urllib.request.Request(url)
    req.add_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0",
    )
    req.add_header(
        "Accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    )
    req.add_header("Accept-Language", "en-US,en;q=0.5")
    return urllib.request.urlopen(req).read()
def add_url(url: str = None, render_only: bool = False):
    git_root = utils.get_git_root()
    database_path = git_root / "scripts" / "blogroll.txt"
    blogroll_page_path = git_root / "content" / "blogroll" / "_index.md"
    sidebar_path = git_root / "layouts" / "partials" / "sidebar.html"

    if not render_only:
        if url is None:
            print("Please provide a URL to add to the blogroll or use --render_only.")
            return

        # strip trailing query params
        stripped_url = urllib.parse.urljoin(url, urllib.parse.urlparse(url).path)
        print(f"Fetching title for {url}")
        try:
            text = get_page_text(stripped_url)
            url = stripped_url
        except urllib.error.HTTPError as e:
            print(
                f"Failed to fetch stripped {stripped_url}: {e}\nFalling back to {url}"
            )
            text = get_page_text(url)

        title = find_article_title(text, stripped_url)
        print(f'Adding {url} ("{title}") to {database_path}')
        if not is_already_in_database(database_path, url):
            with open(database_path, "a") as f:
                f.write(f"{url}\n{title}\n\n")
print(f"Updating {blogroll_page_path}") | |
with open(blogroll_page_path, "r") as f: | |
blogroll_lines = f.readlines() | |
line_num = find_line_number_containing(blogroll_lines, "---", nth=3) | |
if not line_num: | |
raise ValueError("Could not find line number to insert blogroll entry") | |
# repopulate completely with the last ten entries of the database | |
with open(database_path, "r") as f: | |
entries = f.readlines() | |
entries = entries[:-1] if entries[-1].strip() == "" else entries | |
entries = list(more_itertools.split_at(entries, lambda x: x == "\n")) | |
entries = entries[-10:] | |
entries = entries[::-1] | |
entries = [(e[0].strip(), e[1].strip()) for e in entries] | |
# insert shortcode for blogroll | |
new_lines = blogroll_lines[: line_num + 1] | |
new_lines.append("\n") | |
for entry in entries: | |
url, title = entry | |
domain = url.split("/")[2] | |
new_lines.append( | |
'- {{< blogroll url="%s" title="%s" domain="%s" >}}\n' | |
% (url, title, domain) | |
) | |
with open(blogroll_page_path, "w") as f: | |
f.write("".join(new_lines)) | |
print(f"Updating {sidebar_path}") | |
with open(sidebar_path, "r") as f: | |
sidebar_lines = f.readlines() | |
start_line_num = find_line_number_containing(sidebar_lines, '<a href="/blogroll"') | |
if start_line_num is None: | |
raise ValueError("Could not find line number to insert blogroll for sidebar") | |
end_line_num = find_next_line_number_containing( | |
sidebar_lines, "</div>", start_line_num | |
) | |
if end_line_num is None: | |
raise ValueError( | |
"Could not find ending line number to insert blogroll for sidebar" | |
) | |
new_sidebar = sidebar_lines[: start_line_num + 1] | |
new_sidebar.append("<ul>\n") | |
for entry in entries[:3]: | |
url, title = entry | |
domain = url.split("/")[2] | |
new_sidebar.append( | |
' <li><a href="{url}">{title}</a> ({domain})</li>\n'.format( | |
url=url, title=title, domain=domain | |
), | |
) | |
new_sidebar.append("</ul>\n") | |
new_sidebar.extend(sidebar_lines[end_line_num:]) | |
with open(sidebar_path, "w") as f: | |
f.write("".join(new_sidebar)) | |
if __name__ == "__main__": | |
fire.Fire(add_url) |
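# Example invocations via python-fire, run from the repository root so that
# `from scripts import utils` resolves. The module name below is a guess;
# adjust it to match the actual file name in the repo:
#
#   python -m scripts.add_blogroll https://arxiv.org/abs/1801.05507
#   python -m scripts.add_blogroll --render_only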