Skip to content

Instantly share code, notes, and snippets.

@simongarisch
Created December 8, 2019 18:05
Show Gist options
  • Save simongarisch/5f03763fff2980d8d5a6e8e5b7220679 to your computer and use it in GitHub Desktop.
Save simongarisch/5f03763fff2980d8d5a6e8e5b7220679 to your computer and use it in GitHub Desktop.
Scrape PDF files from free-programming-books
import os
import warnings
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
# Rendered GitHub page of the free-programming-books list; get_pdf_urls()
# fetches it and collects every link whose address ends in ".pdf".
FREE_EBOOKS_URL = "https://github.com/EbookFoundation/free-programming-books/blob/master/free-programming-books.md" # noqa: E501
# Absolute path of the directory that contains this script; download_file()
# looks here to decide whether a book has already been downloaded.
SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))
def get_pdf_urls():
    """Return the addresses of all PDF links on the free-ebooks page.

    The page at ``FREE_EBOOKS_URL`` is fetched and parsed, and the ``href``
    of every ``<a>`` element that ends in ``.pdf`` is collected in document
    order.

    Returns:
        A list of link address strings, each ending in ``.pdf``.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    page = requests.get(FREE_EBOOKS_URL, timeout=30)
    # Name the parser explicitly: without it BeautifulSoup emits a warning and
    # picks whichever parser happens to be installed.
    soup = BeautifulSoup(page.text, "html.parser")
    # href=True skips anchors that have no href attribute at all; calling
    # .endswith() on the None that link.get("href") returns would crash.
    return [
        link["href"]
        for link in soup.find_all("a", href=True)
        if link["href"].endswith(".pdf")
    ]
def validate_pdf_url(pdf_url):
    """Check that *pdf_url* is a string ending in ``.pdf``.

    Returns ``None`` when the value is acceptable.

    Raises:
        TypeError: if *pdf_url* is not a ``str``.
        ValueError: if *pdf_url* is a string that does not end in ``.pdf``.
    """
    if isinstance(pdf_url, str):
        if pdf_url.endswith(".pdf"):
            return
        raise ValueError("Expected link to a pdf file.")
    raise TypeError("Expected a url string.")
def get_book_name(pdf_url):
    """Return the file name portion of *pdf_url*.

    The URL is validated first, then everything after the final ``/`` is
    returned (the whole string when it contains no ``/``).

    Raises:
        TypeError: if *pdf_url* is not a string.
        ValueError: if *pdf_url* does not end in ``.pdf``.
    """
    validate_pdf_url(pdf_url)
    _, _, file_name = pdf_url.rpartition("/")
    return file_name
def download_file(pdf_url):
    """Download the PDF at *pdf_url* into the directory of this script.

    The file is saved under ``SCRIPT_PATH`` using the last path segment of
    the URL as its name. Nothing is fetched when a file with that name
    already exists there. Any failure while downloading or saving is
    reported through ``warnings.warn`` rather than raised, so a batch of
    downloads keeps going after a bad link.

    Raises:
        TypeError: if *pdf_url* is not a string.
        ValueError: if *pdf_url* does not end in ``.pdf``.
    """
    validate_pdf_url(pdf_url)
    book_path = os.path.join(SCRIPT_PATH, get_book_name(pdf_url))
    if os.path.isfile(book_path):
        return

    try:
        # Read the whole body before touching the disk: opening the target
        # first would leave an empty file behind on a failed download, and
        # the next run would treat that file as already downloaded.
        with urlopen(pdf_url, timeout=60) as response:
            content = response.read()
        # Write to the same path the existence check uses, not to a name
        # relative to the current working directory.
        with open(book_path, "wb") as target_pdf:
            target_pdf.write(content)
    except Exception as e:
        # Deliberately broad: a single broken link must not stop the batch.
        warnings.warn(
            "Unable to download '{}': Error {}".format(
                pdf_url, str(e)
            )
        )
def download_free_ebooks():
    """Download every PDF linked from the free-ebooks page.

    Each address returned by ``get_pdf_urls()`` is announced on stdout and
    then handed to ``download_file()``.
    """
    for url in get_pdf_urls():
        announcement = "downloading '{}'".format(url)
        print(announcement)
        download_file(url)
def main():
    """Script entry point: download all free ebooks."""
    download_free_ebooks()


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment