Created August 1, 2021 09:58
-
-
Save dusekdan/0f828a5736fbb0d85a8f5cdc4b02ec9f to your computer and use it in GitHub Desktop.
A small script that scrapes quotes from azquotes.com.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import logging as LOG | |
import requests | |
from bs4 import BeautifulSoup | |
# Show INFO-level progress messages emitted through the LOG alias.
LOG.basicConfig(level=LOG.INFO)

# Listing URL prefix; the page number is appended directly after "p=".
QUOTES_BASE_URL = 'https://www.azquotes.com/top_quotes.html?p='

# File the collected quotes are written to as JSON.
OUTPUT_FILE = 'quotes-better.json'

# Upper bound for range(): pages 1 through 10 are requested.
MAX_PAGE_NUMBER_EXCLUSIVE = 11
def main():
    """Scrape every configured listing page and save the quotes as JSON.

    Pages 1 up to (but not including) MAX_PAGE_NUMBER_EXCLUSIVE are fetched
    in order through get_quotes_from_page(); the combined list is then
    written to OUTPUT_FILE.
    """
    LOG.info('Started')

    quotes = []
    for page in range(1, MAX_PAGE_NUMBER_EXCLUSIVE):
        page_quotes = get_quotes_from_page(f"{QUOTES_BASE_URL}{page}")
        quotes.extend(page_quotes)
    LOG.info(f"Quotes scraped: {len(quotes)}")

    with open(OUTPUT_FILE, "w") as output_handle:
        LOG.info(f"Writing {len(quotes)} quotes to file...")
        json.dump(quotes, output_handle)

    LOG.info("Job's finished")
def get_quotes_from_page(url):
    """Scrape all quotes from a single listing page.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of single-entry dicts, one per quote, shaped as
        ``{"quote-<id>": {"content": <text>, "author": <name>}}`` where
        ``<id>`` is whatever follows ``/quote/`` in the quote link's href.
        ``author`` is ``None`` when the list item carries no author link.
        The list is empty when the page has no ``ul.list-quotes`` element.

    Raises:
        requests.RequestException: If the download fails, times out, or
            the server responds with an HTTP error status.
    """
    LOG.info(f"Retrieving quotes from {url}")
    quotes = []

    # A timeout keeps the scrape from hanging forever on a stalled
    # connection; raise_for_status() stops us from parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    quotes_container = soup.find("ul", class_="list-quotes")
    if quotes_container is None:
        LOG.warning(f"No quote list found at {url}")
        return quotes

    for li in quotes_container.find_all("li"):
        quote_info = li.find('a', {'class': 'title'})
        # Skip list items that are not quotes (no title link, or a link
        # that does not point at a /quote/ page).
        if quote_info is None or '/quote/' not in quote_info.get('href', ''):
            continue

        # Look the author up inside this list item only; searching the
        # whole document would give every quote the page's first author.
        author_links = li.select('.author > a')
        author = author_links[0].text if author_links else None

        quotes.append({
            f"quote-{quote_info['href'].split('/quote/')[1]}": {
                "content": quote_info.text,
                "author": author,
            }
        })

    LOG.info(f"{len(quotes)} quotes retrieved")
    return quotes
if __name__ == '__main__':
    # Run the scrape only when executed as a script, not when imported.
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment