Last active
November 29, 2022 08:29
-
-
Save drewdiver/d168527aec3775d80f3751a515139eff to your computer and use it in GitHub Desktop.
Generates an RSS feed from Paul Ford’s author page on Wired
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
#
# Scrapes Paul Ford's author page on Wired and generates an RSS feed.
# Copyright 2022, Drew Diver
#
import datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from dateutil import parser
import requests
import rfeed
# Fetch the author page. A timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops us from scraping an HTTP
# error page as if it were the real author page.
response = requests.get('https://www.wired.com/author/paul-ford/', timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
base_url = "https://www.wired.com"

# title of page; fall back to the author's name if the page has no <h1>
site_title = soup.h1.text.strip() if soup.h1 else "Paul Ford"

# get all the posts
# NOTE(review): these class names look build-generated and may change whenever
# the site is redeployed -- an empty feed usually means they need updating.
posts = soup.select(".SummaryItemWrapper-gcQMOo")

# feed items
items_ = []

# cycle through each element in the list of posts and append to feed
for post in posts:
    heading = post.h2
    anchor = post.select_one("a.SummaryItemHedLink-cgaOJy")
    href = anchor.get("href") if anchor is not None else None
    if heading is None or not href:
        # Without both a headline and a link there is nothing to publish.
        # Skipping also avoids reusing the previous post's URL.
        continue

    title = heading.text.strip()
    # urljoin handles both site-relative and already-absolute hrefs.
    link = urljoin(base_url, href)

    # timestamp is returned as string; parse it and keep minute precision
    # as a naive datetime. pubDate is optional, so it is left out when the
    # timestamp is missing or unparseable.
    pub_date = None
    time_tag = post.time
    if time_tag is not None:
        try:
            dt = parser.parse(time_tag.text)
        except (ValueError, OverflowError):
            dt = None
        if dt is not None:
            pub_date = datetime.datetime(
                dt.year, dt.month, dt.day, dt.hour, dt.minute
            )

    # grab the post image; only attach an enclosure when a source URL exists
    enclosure = None
    image = post.select_one(".ResponsiveImageContainer-dmlCKO")
    img_src = image.get("src") if image is not None else None
    if img_src:
        enclosure = rfeed.Enclosure(url=img_src, type='image/jpeg', length=0)

    # add item to array
    items_.append(
        rfeed.Item(
            title=title,
            link=link,
            author=site_title,
            guid=rfeed.Guid(link),
            pubDate=pub_date,
            enclosure=enclosure,
        )
    )

# generate the feed
feed = rfeed.Feed(
    title=site_title,
    description="Latest posts by Paul Ford",
    # RSS language codes are hyphenated ("en-us"), not underscore-separated.
    language="en-us",
    items=items_,
    link="https://www.wired.com/author/paul-ford/",
)

print(feed.rss())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment