Skip to content

Instantly share code, notes, and snippets.

@drewdiver
Last active November 29, 2022 08:29
Show Gist options
  • Save drewdiver/d168527aec3775d80f3751a515139eff to your computer and use it in GitHub Desktop.
Save drewdiver/d168527aec3775d80f3751a515139eff to your computer and use it in GitHub Desktop.
Generates an RSS feed from Paul Ford’s author page on Wired
#!/usr/bin/env python3
#
# Scrapes Paul Ford's author page on Wired and generates an RSS feed.
# Copyright 2022, Drew Diver
#
import datetime
from urllib.parse import urljoin

import requests
import rfeed
from bs4 import BeautifulSoup
from dateutil import parser
AUTHOR_URL = "https://www.wired.com/author/paul-ford/"
BASE_URL = "https://www.wired.com"
# Seconds to wait on Wired before giving up rather than hanging forever.
REQUEST_TIMEOUT = 30
# Feed/author title used when the page has no <h1> to read it from.
DEFAULT_TITLE = "Paul Ford"

# NOTE(review): the suffixes on these class names look build-generated, so
# the selectors probably need refreshing whenever Wired changes its markup.
POST_SELECTOR = ".SummaryItemWrapper-gcQMOo"
LINK_SELECTOR = "a.SummaryItemHedLink-cgaOJy"
IMAGE_SELECTOR = ".ResponsiveImageContainer-dmlCKO"


def last_attr(elements, name):
    """Return attribute *name* from the last element that has it, else None.

    *elements* is any iterable of objects with a ``.get(name)`` method.
    Starting from None on every call means a post without a match can never
    inherit the value found for a previous post.
    """
    value = None
    for element in elements:
        candidate = element.get(name)
        if candidate:
            value = candidate
    return value


def absolute_url(href):
    """Resolve *href* against the Wired site root.

    Root-relative paths get the site prefix; URLs that are already absolute
    (or protocol-relative) pass through without being double-prefixed.
    """
    return urljoin(BASE_URL, href)


def build_item(post, author):
    """Build an ``rfeed.Item`` from one post summary element.

    Returns None when the element has no headline, timestamp, or article
    link, since no meaningful feed entry can be produced from it. The
    enclosure is omitted when the post has no image. A timestamp that
    ``dateutil`` cannot parse raises its usual parse error.
    """
    heading = post.h2
    stamp = post.time
    href = last_attr(post.select(LINK_SELECTOR), "href")
    if heading is None or stamp is None or href is None:
        return None

    link = absolute_url(href)

    # The timestamp arrives as text; keep it at minute resolution as a
    # naive datetime.
    published = parser.parse(stamp.text)
    pub_date = datetime.datetime(
        published.year,
        published.month,
        published.day,
        published.hour,
        published.minute,
    )

    enclosure = None
    img_src = last_attr(post.select(IMAGE_SELECTOR), "src")
    if img_src is not None:
        enclosure = rfeed.Enclosure(
            url=absolute_url(img_src),
            type='image/jpeg',
            length=0,
        )

    return rfeed.Item(
        title=heading.text,
        link=link,
        author=author,
        guid=rfeed.Guid(link),
        pubDate=pub_date,
        enclosure=enclosure,
    )


def main():
    """Fetch the author page and print the generated RSS feed to stdout.

    Raises ``requests`` exceptions on network failure, timeout, or a
    non-success HTTP status, so an error page is never parsed as content.
    """
    response = requests.get(AUTHOR_URL, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # title of page
    site_title = soup.h1.text if soup.h1 is not None else DEFAULT_TITLE

    # one feed item per usable post on the page
    candidates = (
        build_item(post, site_title) for post in soup.select(POST_SELECTOR)
    )
    items_ = [item for item in candidates if item is not None]

    # generate the feed; RSS language codes are hyphenated ("en-us")
    feed = rfeed.Feed(
        title=site_title,
        description="Latest posts by Paul Ford",
        language="en-us",
        items=items_,
        link=AUTHOR_URL,
    )
    print(feed.rss())


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment