Created
December 8, 2019 18:05
-
-
Save simongarisch/5f03763fff2980d8d5a6e8e5b7220679 to your computer and use it in GitHub Desktop.
Scrape PDF files from free-programming-books
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import warnings | |
from urllib.request import urlopen | |
from bs4 import BeautifulSoup | |
import requests | |
# GitHub page for the free-programming-books index that is scanned for PDF links.
FREE_EBOOKS_URL = (
    "https://github.com/EbookFoundation/free-programming-books"
    "/blob/master/free-programming-books.md"
)

# Absolute path of the directory that contains this script.
SCRIPT_PATH = os.path.split(os.path.abspath(__file__))[0]
def get_pdf_urls(timeout=30):
    """Return the ``.pdf`` link targets found on the free-ebooks index page.

    Fetches ``FREE_EBOOKS_URL``, parses the HTML and collects the ``href``
    of every ``<a>`` element whose target ends with ``.pdf``.

    Args:
        timeout: Seconds to wait for the index page before giving up.

    Returns:
        A list of href strings, in document order.

    Raises:
        requests.RequestException: If the page cannot be fetched or the
            server answers with an HTTP error status.
    """
    page = requests.get(FREE_EBOOKS_URL, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an
    # empty result.
    page.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(page.text, "html.parser")

    # href=True skips anchors without an href attribute; calling
    # .endswith() on their missing (None) href would raise AttributeError.
    return [
        anchor["href"]
        for anchor in soup.find_all("a", href=True)
        if anchor["href"].endswith(".pdf")
    ]
def validate_pdf_url(pdf_url):
    """Check that *pdf_url* is a string ending in ``.pdf``.

    Returns ``None`` when the value is acceptable.

    Raises:
        TypeError: If *pdf_url* is not a ``str``.
        ValueError: If *pdf_url* does not end with ``.pdf``.
    """
    if isinstance(pdf_url, str):
        if pdf_url.endswith(".pdf"):
            return
        raise ValueError("Expected link to a pdf file.")
    raise TypeError("Expected a url string.")
def get_book_name(pdf_url):
    """Return the file-name part of *pdf_url* (the text after the last ``/``).

    The URL is validated first, so the ``TypeError`` / ``ValueError`` raised
    by ``validate_pdf_url`` propagate to the caller.
    """
    validate_pdf_url(pdf_url)
    # rpartition yields the whole string as the tail when there is no "/",
    # which matches split("/")[-1].
    _head, _sep, file_name = pdf_url.rpartition("/")
    return file_name
def download_file(pdf_url, timeout=30):
    """Download the PDF at *pdf_url* into the directory of this script.

    The target file name is the last path segment of the URL. If a file with
    that name already exists in ``SCRIPT_PATH`` the download is skipped.
    Download failures are reported with ``warnings.warn`` rather than raised,
    so one bad link does not abort a batch run.

    Args:
        pdf_url: URL string ending in ``.pdf``.
        timeout: Seconds to wait on the connection before giving up.

    Raises:
        TypeError: If *pdf_url* is not a string.
        ValueError: If *pdf_url* does not end with ``.pdf``.
    """
    # get_book_name validates the URL itself, so no separate check is needed.
    book_name = get_book_name(pdf_url)
    book_path = os.path.join(SCRIPT_PATH, book_name)
    if os.path.isfile(book_path):
        return

    try:
        # Read the whole body before creating the file: a failed or
        # interrupted transfer must not leave an empty file behind that a
        # later run would mistake for a finished download.
        with urlopen(pdf_url, timeout=timeout) as response:
            payload = response.read()
        # Write to book_path (not the bare name) so the file lands where
        # the "already downloaded" check above looks for it, regardless of
        # the current working directory.
        with open(book_path, "wb") as target_pdf:
            target_pdf.write(payload)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and move on.
        warnings.warn(
            "Unable to download '{}': Error {}".format(
                pdf_url, str(e)
            )
        )
def download_free_ebooks():
    """Download every PDF linked from the free-ebooks index page.

    Each URL returned by ``get_pdf_urls`` is announced on stdout and then
    handed to ``download_file``.
    """
    for url in get_pdf_urls():
        print("downloading '{}'".format(url))
        download_file(url)
def main():
    """Script entry point: download all free ebooks."""
    download_free_ebooks()


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment