@j2kun · Created October 30, 2024 22:06
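
A script that queries the Hacker News Algolia search API for recent stories linking to a given domain and sends a Webmention for each match it finds.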
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import argparse
import datetime
import pprint
import requests
import sys

from scripts.webmention.utils import send_webmention

def get_post_text(post):
    """Return the HTML body of a story or comment, or the empty string."""
    if post.get("story_text"):
        return post["story_text"]
    elif post.get("comment_text"):
        return post["comment_text"]
    else:
        return ""

def main(domain, since_days=7):
    # https://hn.algolia.com/?dateRange=pastWeek&page=0&prefix=false&query=jeremykun.com&sort=byDate&type=story
    search_url = (
        "https://hn.algolia.com/api/v1/search"
        f"?query={domain}&tags=story&hitsPerPage=20"
        f"&numericFilters=created_at_i%3E{int(datetime.datetime.now().timestamp()) - since_days * 24 * 60 * 60}"
    )
    try:
        r = requests.get(search_url)
    except requests.exceptions.RequestException as e:
        print(e)
        sys.exit(1)

    response = r.json()
    pprint.pp(response)
    num_hits = response["nbHits"]
    num_pages = response["nbPages"]
    print(f"Found {num_hits} posts across {num_pages} paginated search pages.")

    for page in range(num_pages):
        print(f"Querying page {page}")
        try:
            r = requests.get(f"{search_url}&page={page}")
        except requests.exceptions.RequestException as e:
            print(e)
            sys.exit(1)

        response = r.json()
        hn_posts = response["hits"]
        for post in hn_posts:
            created_at = (
                datetime.datetime.strptime(post["created_at"], "%Y-%m-%dT%H:%M:%SZ")
                if "created_at" in post
                else datetime.datetime.now()
            )
            now = datetime.datetime.now()
            if (now - created_at).days > since_days:
                # we already manually handled this webmention with the initial
                # script run
                print(
                    f"Skipping post because its publication date ({created_at}) "
                    f"is older than the threshold of {since_days} days since "
                    f"today ({now})."
                )
                continue

            post_url = "https://news.ycombinator.com/item?id=" + str(post["objectID"])
            post_http_url = post.get("url")
            print(f"Post URL: {post_http_url}")

            # use 'domain in' because it may be www.jeremykun.com or jeremykun.com
            if post_http_url is not None and domain in urlparse(post_http_url).netloc:
                send_webmention(post_url, post_http_url)
                continue
            elif post_http_url is not None:
                # guard against text posts, which have no URL field
                # (urlparse(None) would raise a TypeError)
                parsed = urlparse(post_http_url).netloc
                print(f"doesn't match {domain}, netloc was: {parsed}")

            # text posts (Ask HN, etc.) carry their links in the post body,
            # so scan the HTML for anchors pointing at the domain
            story_text = get_post_text(post)
            content = BeautifulSoup(story_text, "html.parser")
            links = content.find_all("a")
            for link in links:
                if link.get("href") is None:
                    continue
                post_domain = urlparse(link.get("href")).netloc
                if post_domain == domain:
                    send_webmention(post_url, link.get("href"))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--domain")
    parser.add_argument("-s", "--since_days", type=int, default=7)
    args = parser.parse_args()
    main(args.domain, args.since_days)
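
Assuming the script is saved as hn_webmentions.py (the gist doesn't name the file), it would be invoked as, for example:

python hn_webmentions.py --domain jeremykun.com --since_days 7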
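
The send_webmention helper is imported from the author's repository (scripts/webmention/utils) and isn't included in the gist. A minimal sketch of what it might look like, assuming the standard Webmention discovery flow (fetch the target, find its rel="webmention" endpoint, POST source and target to it) and using only requests and BeautifulSoup:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def send_webmention(source, target):
    """Discover target's Webmention endpoint and notify it that source links to target."""
    r = requests.get(target)
    r.raise_for_status()

    # The endpoint may be advertised in the HTTP Link header...
    endpoint = None
    for link in requests.utils.parse_header_links(r.headers.get("Link", "")):
        if "webmention" in link.get("rel", "").split():
            endpoint = link["url"]
            break

    # ...or in an HTML <link>/<a> tag with rel="webmention".
    if endpoint is None:
        tag = BeautifulSoup(r.text, "html.parser").find(["link", "a"], rel="webmention")
        if tag is not None and tag.get("href") is not None:
            endpoint = tag["href"]

    if endpoint is None:
        print(f"No webmention endpoint found for {target}")
        return

    # The endpoint URL may be relative to the target page.
    endpoint = urljoin(target, endpoint)
    resp = requests.post(endpoint, data={"source": source, "target": target})
    print(f"Webmention to {endpoint}: HTTP {resp.status_code}")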