Last active
April 29, 2023 14:33
-
-
Save Tokariew/b9ad5501b6a11ac3619ec045e4202d6f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/python3 | |
from concurrent.futures import ThreadPoolExecutor | |
from pathlib import Path | |
import requests | |
from bs4 import BeautifulSoup | |
from slugify import slugify as restring | |
from termcolor import colored | |
# Script to mirror books from the wolnelektury.pl catalogue.
base_url = 'https://wolnelektury.pl'

# Fetch the catalogue page once and collect one <a> tag per book;
# each link's href is the path to that book's detail page.
_catalog_soup = BeautifulSoup(
    requests.get(f'{base_url}/katalog').content, 'lxml'
)
books_links = _catalog_soup.find(
    "div", {"class": "plain-list"}
).find_all('a', href=True)
def get_book(book):
    """Download every available media format of a single book.

    ``book`` is an ``<a>`` tag taken from the catalogue listing; its
    ``href`` points at the book's detail page.  Files are stored under
    ``<author-slug>/<title-slug>/`` relative to the working directory.
    Returns None; progress and errors are reported on stdout.
    """
    link = f'{base_url}{book["href"]}'
    book_page = requests.get(link)
    if book_page.status_code != 200:
        print(colored(f'Error when getting: {link}', 'red'))
        return
    book_soup = BeautifulSoup(book_page.content, 'lxml')
    header = book_soup.find("div", {"class": "l-header__content"})
    try:
        author = restring(header.find("p").text.rstrip())
    except (TypeError, AttributeError):
        # Some pages have no author paragraph in the header
        # (AttributeError added: header or <p> may be missing entirely).
        author = 'NoAuthor'
    title = restring(header.find("h1").string)
    down = book_soup.find("div", {'class': 'c-media__popup__box__items'})
    try:
        links = down.find_all('a', href=True)
    except AttributeError:
        # Page has no download box at all.
        print(colored(f'Error when getting: {link}', 'red'))
        return
    Path(author).mkdir(exist_ok=True)
    path = Path(author) / title
    if not path.exists():
        print(colored(f'New book in {path}', 'green'))
    path.mkdir(exist_ok=True)
    for anchor in links:
        href = anchor['href']
        # Only fetch real media files; skip the daisy/audio bundles.
        # BUG FIX: the original condition ended in `pass`, so this
        # filter had no effect and every link was downloaded.
        if '/media/' not in href or any(
            elem in href for elem in ('daisy.zip', 'audio.epub')
        ):
            continue
        name = restring(href[href.rfind('/') + 1:])
        # restring() slugifies the extension dot away; put it back.
        name = '.'.join(name.rsplit('-', 1))
        f = path / name
        with requests.get(f"{base_url}{href}", stream=True) as r:
            # BUG FIX: header may be absent -> int(None) raised
            # TypeError; fall back to 0 so the size check simply fails
            # and the file is (re)downloaded.
            total_length = int(r.headers.get('content-length') or 0)
            if f.exists() and total_length == f.stat().st_size:
                # Already fully downloaded.  BUG FIX: original used
                # `return`, which aborted the book's remaining formats.
                continue
            print(f)
            with open(f, 'wb') as file:
                for chunk in r.iter_content(chunk_size=1048576):
                    file.write(chunk)
# Fan the per-book downloads out across many threads; the work is
# network-bound, so a large worker count mostly overlaps I/O waits.
with ThreadPoolExecutor(max_workers=72) as executor:
    # map() is lazy — materialize it so every submitted job runs to
    # completion (and any worker exception is re-raised here).
    list(executor.map(get_book, books_links))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment