# Source: GitHub gist by @j2kun (gist id 348411a5d4ac3277598c2ee8dc30746f),
# last active October 30, 2024 19:21.
"""Add URLs to blogroll database.

scripts/blogroll.txt is a flat file containing URLs and titles of blog posts
I submitted via https://github.com/j2kun/extension-trigger-gh-workflow

Each entry contains two lines: the first line is a URL and the second is the
title extracted from the page at that URL. Entries are separated by a blank
line.

Example:

    https://arxiv.org/abs/1801.05507
    Gazelle: A Low Latency Framework for Secure Neural Network Inference

This script takes in a URL, fetches the title of the article, appends it to the
end of the flat file, and then updates content/blogroll/_index.md with the
latest ten entries and layouts/partials/sidebar.html with the latest three.
"""
import html
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
import fire
import more_itertools

from scripts import utils as utils
def is_already_in_database(path: str, url: str) -> bool:
    """Report whether ``url`` is already recorded in the flat-file database.

    A URL counts as present only when some line of the file, with surrounding
    whitespace removed, is exactly equal to it. A substring test would report
    ``https://example.com/post`` as present when only
    ``https://example.com/post-2`` (or a title that happens to mention the
    URL) is stored.

    Args:
        path: Location of the database file.
        url: The URL to look for.

    Returns:
        True if the URL is already stored, False otherwise (including when
        the database file does not exist).
    """
    target = url.strip()
    try:
        with open(path, "r") as f:
            for line in f:
                if line.strip() == target:
                    print("URL already exists in database, skipping.")
                    return True
    except FileNotFoundError:
        # No database file yet, so nothing can be a duplicate.
        pass
    return False
def find_line_number_containing(lines, substring, nth=1):
    """Return the 0-based index of the ``nth`` line that contains ``substring``.

    Args:
        lines: Sequence of strings to scan in order.
        substring: Text that must occur somewhere in a line for it to count.
        nth: Which match to report, counting from 1.

    Returns:
        The index of the ``nth`` matching line, or None when fewer than
        ``nth`` lines match.
    """
    remaining = nth
    for index, text in enumerate(lines):
        if substring not in text:
            continue
        # One more match consumed; stop as soon as the requested one is hit.
        remaining -= 1
        if remaining == 0:
            return index
    return None
def find_next_line_number_containing(lines, substring, start_line):
    """Return the index of the first line at or after ``start_line`` containing ``substring``.

    Args:
        lines: Sequence of strings to scan.
        substring: Text that must occur somewhere in the line.
        start_line: Index at which the scan begins (that line is included).

    Returns:
        The index into ``lines`` of the first match, or None if no line from
        ``start_line`` onward contains ``substring``.
    """
    hits = (
        start_line + offset
        for offset, text in enumerate(lines[start_line:])
        if substring in text
    )
    return next(hits, None)
def _normalize_title(raw):
    """Collapse every run of whitespace (line breaks included) into one space."""
    return " ".join(raw.split())


def find_article_title(text, url):
    """Extract a single-line article title from a fetched HTML page.

    Candidates are tried in this order, and the first one that is non-empty
    after whitespace normalisation wins:

    1. An ``og:title`` meta tag, e.g. ``<meta property="og:title" content="...">``.
    2. When the page has several header tags: the headers nested inside an
       ``<article>`` element, then the headers whose ``class`` list contains
       ``"title"``.
    3. The first header tag on the page.
    4. The page's ``<title>`` element.

    Every returned title is whitespace-normalised so that it always fits on
    one line; the flat-file database stores exactly one line per title.

    Args:
        text: The HTML document (as fetched from ``url``) to parse.
        url: The page's URL; only used to pick the header tag for
            eprint.iacr.org pages and for the error message.

    Returns:
        The chosen title as a single-line string.

    Raises:
        ValueError: If none of the candidates yields a non-empty title.
    """
    content = BeautifulSoup(text, "html.parser")

    # Look for meta tags like
    # <meta property="og:title" content="...">
    for meta in content.find_all("meta"):
        if meta.get("property") != "og:title":
            continue
        # A tag without a usable content attribute is skipped, not fatal.
        title = _normalize_title(meta.get("content") or "")
        if title:
            print(f"Choosing {title} from meta tags")
            return title

    # IACR has noscript nonsense with an h1 to ask you to turn on Javascript.
    # https://mathstodon.xyz/@j2kun/113297567947663128
    header_tag_to_use = "h3" if "eprint.iacr.org" in url else "h1"
    headers = content.find_all(header_tag_to_use)

    candidates = []
    if len(headers) > 1:
        print("Options for title:")
        for h in headers:
            print(h.text)
        # Prefer headers that live inside an <article>, then those whose
        # class list marks them as the title.
        candidates.extend(
            h.text
            for h in headers
            if any(parent.name == "article" for parent in h.parents)
        )
        candidates.extend(
            h.text for h in headers if "title" in (h.get("class") or [])
        )
    candidates.extend(h.text for h in headers[:1])

    # Last resort for pages without any usable header tag.
    page_title = content.find("title")
    if page_title is not None:
        candidates.append(page_title.text)

    for candidate in candidates:
        title = _normalize_title(candidate)
        if title:
            print(f"Choosing {title}")
            return title

    raise ValueError(f"Could not find a title for {url}")
def get_page_text(url):
    """Fetch ``url`` and return the raw response body as bytes.

    The request carries Firefox-style ``User-Agent``, ``Accept`` and
    ``Accept-Language`` headers instead of urllib's defaults (presumably
    because some sites refuse the default urllib client; confirm before
    changing them).

    Args:
        url: The URL to fetch.

    Returns:
        The undecoded response body.

    Raises:
        urllib.error.HTTPError: Propagated from ``urlopen`` when the server
            answers with an error status.
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            cannot be completed.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
    }
    req = urllib.request.Request(url, headers=headers)
    # Use the response as a context manager so the connection is always
    # closed, even if read() raises.
    with urllib.request.urlopen(req) as response:
        return response.read()
def _strip_query(url):
    """Return ``url`` without its params, query string, or fragment."""
    parts = urllib.parse.urlparse(url)
    return parts._replace(params="", query="", fragment="").geturl()


def _read_recent_entries(database_path, limit=10):
    """Return up to ``limit`` ``(url, title)`` pairs from the database, newest first."""
    with open(database_path, "r") as f:
        lines = f.readlines()
    records = []
    for group in more_itertools.split_at(lines, lambda line: not line.strip()):
        if not group:
            # Produced by consecutive blank lines or a trailing separator.
            continue
        if len(group) < 2:
            print(f"Skipping malformed database entry: {group[0].strip()}")
            continue
        records.append((group[0].strip(), group[1].strip()))
    # The file is append-only, so the newest entries are at the end.
    return records[-limit:][::-1]


def _render_blogroll_page(blogroll_page_path, entries):
    """Rewrite everything after the third ``---`` line of the page with ``entries``."""
    with open(blogroll_page_path, "r") as f:
        blogroll_lines = f.readlines()
    line_num = find_line_number_containing(blogroll_lines, "---", nth=3)
    if line_num is None:
        raise ValueError("Could not find line number to insert blogroll entry")

    # Keep everything up to and including the marker line, then regenerate
    # the list below it from scratch.
    new_lines = blogroll_lines[: line_num + 1]
    new_lines.append("\n")
    for entry_url, title in entries:
        domain = urllib.parse.urlparse(entry_url).netloc
        # NOTE(review): a title containing a double quote would break the
        # shortcode's quoted parameter; confirm how Hugo expects it escaped.
        new_lines.append(
            '- {{< blogroll url="%s" title="%s" domain="%s" >}}\n'
            % (entry_url, title, domain)
        )
    with open(blogroll_page_path, "w") as f:
        f.write("".join(new_lines))


def _render_sidebar(sidebar_path, entries):
    """Replace the sidebar's blogroll list with a ``<ul>`` built from ``entries``."""
    with open(sidebar_path, "r") as f:
        sidebar_lines = f.readlines()
    start_line_num = find_line_number_containing(sidebar_lines, '<a href="/blogroll"')
    if start_line_num is None:
        raise ValueError("Could not find line number to insert blogroll for sidebar")
    end_line_num = find_next_line_number_containing(
        sidebar_lines, "</div>", start_line_num
    )
    if end_line_num is None:
        raise ValueError(
            "Could not find ending line number to insert blogroll for sidebar"
        )

    # Everything between the blogroll link and the next </div> is replaced.
    new_sidebar = sidebar_lines[: start_line_num + 1]
    new_sidebar.append("<ul>\n")
    for entry_url, title in entries:
        domain = urllib.parse.urlparse(entry_url).netloc
        # Titles and URLs come from arbitrary web pages, so escape them
        # before embedding them in HTML.
        new_sidebar.append(
            ' <li><a href="{url}">{title}</a> ({domain})</li>\n'.format(
                url=html.escape(entry_url),
                title=html.escape(title),
                domain=html.escape(domain),
            ),
        )
    new_sidebar.append("</ul>\n")
    new_sidebar.extend(sidebar_lines[end_line_num:])
    with open(sidebar_path, "w") as f:
        f.write("".join(new_sidebar))


def add_url(url: str = None, render_only: bool = False):
    """Add ``url`` to the blogroll database and re-render the blogroll outputs.

    Unless ``render_only`` is set, the page title is fetched and the
    ``url``/title pair is appended to ``scripts/blogroll.txt`` (skipped when
    the URL is already stored). The query-less form of the URL is tried
    first; if the server rejects it with an HTTP error, the URL is fetched
    and stored as given.

    Afterwards ``content/blogroll/_index.md`` is regenerated with the ten
    newest entries and ``layouts/partials/sidebar.html`` with the three
    newest. All paths are resolved against ``utils.get_git_root()``.

    Args:
        url: The article URL to add. Required unless ``render_only`` is set.
        render_only: Skip fetching and appending; only regenerate the
            blogroll page and sidebar from the existing database.

    Raises:
        ValueError: If the insertion markers cannot be found in the blogroll
            page or the sidebar.
    """
    git_root = utils.get_git_root()
    database_path = git_root / "scripts" / "blogroll.txt"
    blogroll_page_path = git_root / "content" / "blogroll" / "_index.md"
    sidebar_path = git_root / "layouts" / "partials" / "sidebar.html"

    if not render_only:
        if url is None:
            print("Please provide a URL to add to the blogroll or use --render_only.")
            return

        # strip trailing query params
        stripped_url = _strip_query(url)
        print(f"Fetching title for {url}")
        try:
            text = get_page_text(stripped_url)
        except urllib.error.HTTPError as e:
            print(
                f"Failed to fetch stripped {stripped_url}: {e}\nFalling back to {url}"
            )
            text = get_page_text(url)
        else:
            url = stripped_url

        # The database stores exactly one line per title, so make sure no
        # line break can leak into it.
        title = " ".join(find_article_title(text, stripped_url).split())
        print(f'Adding {url} ("{title}") to {database_path}')
        if not is_already_in_database(database_path, url):
            with open(database_path, "a") as f:
                f.write(f"{url}\n{title}\n\n")

    # repopulate completely with the last ten entries of the database
    entries = _read_recent_entries(database_path, limit=10)

    print(f"Updating {blogroll_page_path}")
    _render_blogroll_page(blogroll_page_path, entries)

    print(f"Updating {sidebar_path}")
    _render_sidebar(sidebar_path, entries[:3])
if __name__ == "__main__":
    # Hand add_url to python-fire so it can be invoked from the command line;
    # the printed usage hint refers to a --render_only flag.
    fire.Fire(add_url)
# (GitHub page footer: sign in on GitHub to comment on this gist.)