Created
August 31, 2015 22:19
-
-
Save cwharland/1bf6190b8ec539238e5b to your computer and use it in GitHub Desktop.
Studs Terkel Interviews
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import urllib2

import wget
from bs4 import BeautifulSoup as bs
# Download every Studs Terkel interview mp3 linked from the index page,
# naming each file after the interview title shown next to its link.

# Where do you want to download the files?
# '~' is only expanded by a shell, so expand it explicitly; otherwise the
# files would land under a literal directory named '~' in the working dir.
path = os.path.expanduser('~/studs_terkel/%s')

# Make sure the destination directory exists before anything is written to it.
download_dir = os.path.dirname(path)
if not os.path.isdir(download_dir):
    os.makedirs(download_dir)

# Page with all the mp3 links
url = 'http://conversations.studsterkel.org/htimes.php'
page = urllib2.urlopen(url)
try:
    # NOTE: no parser is named, so bs4 uses the best one installed locally.
    soup = bs(page)
finally:
    # Release the HTTP connection once the document has been parsed.
    page.close()

# URLs for each interview: every anchor whose href contains 'mp3'
mp3s = [x['href'] for x in soup.find_all('a', href=re.compile(r'mp3'))]

# Matching file names: the text of each white <font> element on the page
file_names = [x.text.strip() for x in soup.find_all('font', color='#FFFFFF')]

# Titles and links are paired purely by position, so the counts must agree.
# Raise explicitly rather than assert: asserts are stripped under `python -O`.
if len(file_names) != len(mp3s):
    raise RuntimeError(
        'Found %d interview titles but %d mp3 links; '
        'the page layout has probably changed.' % (len(file_names), len(mp3s))
    )

# Get to downloading
for f, m in zip(file_names, mp3s):
    # A path separator inside a title would be read as a sub-directory.
    safe_name = f.replace(os.sep, '-')
    formatted_name = path % (safe_name + '.mp3')
    wget.download(m, formatted_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment