Last active
April 29, 2023 14:33
-
-
Save Tokariew/b9ad5501b6a11ac3619ec045e4202d6f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/python3 | |
from concurrent.futures import ThreadPoolExecutor | |
from pathlib import Path | |
import requests | |
from bs4 import BeautifulSoup | |
from slugify import slugify as restring | |
from termcolor import colored | |
# Script to mirror books from the wolnelektury.pl catalogue.
base_url = 'https://wolnelektury.pl'

# Fetch the catalogue page once and collect one <a> tag per book;
# each link's href is the path to that book's detail page.
_catalog_soup = BeautifulSoup(
    requests.get(f'{base_url}/katalog').content, 'lxml'
)
books_links = _catalog_soup.find(
    "div", {"class": "plain-list"}
).find_all('a', href=True)
def get_book(book):
    """Download every available media format of a single book.

    ``book`` is an ``<a>`` tag taken from the catalogue listing; its
    ``href`` points at the book's detail page.  Files are stored under
    ``<author-slug>/<title-slug>/`` relative to the working directory.
    Returns None; progress and errors are reported on stdout.
    """
    link = f'{base_url}{book["href"]}'
    book_page = requests.get(link)
    if book_page.status_code != 200:
        print(colored(f'Error when getting: {link}', 'red'))
        return
    book_soup = BeautifulSoup(book_page.content, 'lxml')
    header = book_soup.find("div", {"class": "l-header__content"})
    try:
        author = restring(header.find("p").text.rstrip())
    except (TypeError, AttributeError):
        # Some pages have no author paragraph in the header
        # (AttributeError added: header or <p> may be missing entirely).
        author = 'NoAuthor'
    title = restring(header.find("h1").string)
    down = book_soup.find("div", {'class': 'c-media__popup__box__items'})
    try:
        links = down.find_all('a', href=True)
    except AttributeError:
        # Page has no download box at all.
        print(colored(f'Error when getting: {link}', 'red'))
        return
    Path(author).mkdir(exist_ok=True)
    path = Path(author) / title
    if not path.exists():
        print(colored(f'New book in {path}', 'green'))
    path.mkdir(exist_ok=True)
    for anchor in links:
        href = anchor['href']
        # Only fetch real media files; skip the daisy/audio bundles.
        # BUG FIX: the original condition ended in `pass`, so this
        # filter had no effect and every link was downloaded.
        if '/media/' not in href or any(
            elem in href for elem in ('daisy.zip', 'audio.epub')
        ):
            continue
        name = restring(href[href.rfind('/') + 1:])
        # restring() slugifies the extension dot away; put it back.
        name = '.'.join(name.rsplit('-', 1))
        f = path / name
        with requests.get(f"{base_url}{href}", stream=True) as r:
            # BUG FIX: header may be absent -> int(None) raised
            # TypeError; fall back to 0 so the size check simply fails
            # and the file is (re)downloaded.
            total_length = int(r.headers.get('content-length') or 0)
            if f.exists() and total_length == f.stat().st_size:
                # Already fully downloaded.  BUG FIX: original used
                # `return`, which aborted the book's remaining formats.
                continue
            print(f)
            with open(f, 'wb') as file:
                for chunk in r.iter_content(chunk_size=1048576):
                    file.write(chunk)
# Fan the per-book downloads out across many threads; the work is
# network-bound, so a large worker count mostly overlaps I/O waits.
with ThreadPoolExecutor(max_workers=72) as executor:
    # map() is lazy — materialize it so every submitted job runs to
    # completion (and any worker exception is re-raised here).
    list(executor.map(get_book, books_links))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment