Created
October 30, 2024 22:06
-
-
Save j2kun/d5988e676997fb56be2a4eaed43acbe0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
from urllib.parse import urlparse | |
import argparse | |
import datetime | |
import pprint | |
import requests | |
import sys | |
from scripts.webmention.utils import send_webmention | |
def get_post_text(post):
    """Return the text of an HN search hit: its story text if present,
    otherwise its comment text, otherwise the empty string."""
    return post.get("story_text") or post.get("comment_text") or ""
def _fetch_json(url):
    """GET `url` and return the decoded JSON body.

    Prints the error and exits the process with status 1 on connection
    failures or HTTP error statuses (a 4xx/5xx error body would otherwise
    crash later when the expected keys are missing).
    """
    try:
        # timeout guards against the request hanging forever on a dead server
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        sys.exit(1)
    return r.json()


def main(domain, since_days=7):
    """Search Hacker News (via the Algolia API) for stories from the last
    `since_days` days that mention `domain`, and send a webmention for each
    link back to `domain`.

    Args:
        domain: bare domain to search for, e.g. "jeremykun.com".
        since_days: ignore posts older than this many days (default 7).

    Exits the process with status 1 on any network failure.
    """
    # https://hn.algolia.com/?dateRange=pastWeek&page=0&prefix=false&query=jeremykun.com&sort=byDate&type=story
    # HN/Algolia timestamps are UTC, so compute the cutoff in UTC as well.
    utc = datetime.timezone.utc
    cutoff = int(datetime.datetime.now(utc).timestamp()) - since_days * 24 * 60 * 60
    search_url = (
        "https://hn.algolia.com/api/v1/search"
        f"?query={domain}&tags=story&hitsPerPage=20"
        f"&numericFilters=created_at_i%3E{cutoff}"
    )
    response = _fetch_json(search_url)
    pprint.pp(response)
    num_hits = response["nbHits"]
    num_pages = response["nbPages"]
    print(f"Found {num_hits} posts across {num_pages} paginated search pages.")
    for page in range(num_pages):
        print(f"Querying page {page}")
        response = _fetch_json(f"{search_url}&page={page}")
        for post in response["hits"]:
            now = datetime.datetime.now(utc)
            if "created_at" in post:
                # "Z" suffix means UTC; mark the parsed datetime as such so
                # the comparison with `now` is not skewed by the local offset.
                created_at = datetime.datetime.strptime(
                    post["created_at"], "%Y-%m-%dT%H:%M:%SZ"
                ).replace(tzinfo=utc)
            else:
                created_at = now
            if (now - created_at).days > since_days:
                # we already manually handled this webmention with the initial
                # script run
                print(
                    f"Skipping post because its publication date ({created_at}) "
                    f"is older than the threshold of {since_days} days since "
                    f"today ({now})."
                )
                continue
            post_url = "https://news.ycombinator.com/item?id=" + str(post["objectID"])
            post_http_url = post.get("url")
            print(f"Post URL: {post_http_url}")
            # use 'domain in' because it may be www.jeremykun.com or jeremykun.com
            if post_http_url is not None and domain in urlparse(post_http_url).netloc:
                send_webmention(post_url, post_http_url)
                continue
            else:
                parsed = urlparse(post_http_url).netloc
                print(f"doesn't match {domain} netloc was: {parsed}")
            # the story URL didn't match; also scan any links embedded in the
            # post's own text (self-posts / comments) for links to `domain`.
            story_text = get_post_text(post)
            content = BeautifulSoup(story_text, "html.parser")
            for link in content.find_all("a"):
                href = link.get("href")
                if href is None:
                    continue
                if urlparse(href).netloc == domain:
                    send_webmention(post_url, href)
if __name__ == "__main__":
    # CLI entry point: -d/--domain is the site to search HN for;
    # -s/--since_days bounds how far back to look (default one week).
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-d", "--domain")
    arg_parser.add_argument("-s", "--since_days", type=int, default=7)
    cli_args = arg_parser.parse_args()
    main(cli_args.domain, cli_args.since_days)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment