Last active
August 5, 2024 22:40
-
-
Save LCPallares/2bd85d5da472655be267cc2956936f54 to your computer and use it in GitHub Desktop.
analyst_amazon_bestsellers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import csv | |
from datetime import datetime | |
def _node_text(node, default=""):
    """Return the raw text of a BeautifulSoup node, or *default* if the lookup found nothing."""
    return node.text if node is not None else default


def _to_float(raw, default=0.0):
    """Convert scraped numeric text to a float, falling back to *default* on unparsable input."""
    try:
        # Drop thousands separators such as "1,299.00" before converting.
        return float(raw.replace(",", ""))
    except ValueError:
        return default


def _parse_book(item, date_scraped):
    """Build one book record (a dict) from a single best-seller grid item.

    Missing elements fall back to an empty string / 0 / 0.0 so that one
    unexpected card does not abort the whole scrape.
    """
    rank = _node_text(item.find("span", class_="zg-bdg-text")).strip().replace("#", "")

    # The title is read from the second "a-link-normal" anchor of the card.
    links = item.find_all("a", class_="a-link-normal")
    title = links[1].text.strip() if len(links) > 1 else ""

    author = _node_text(item.find("div", class_="a-row a-size-small")).strip()

    # Keep only the first space-separated token of the rating text.
    score_text = _node_text(item.find("span", class_="a-icon-alt")).split(" ")[0]

    # The first character is dropped; presumably it is the currency symbol.
    price_text = _node_text(item.find("span", class_="_cDEzb_p13n-sc-price_3mJ9Z"))[1:]

    type_cover = _node_text(
        item.find("span", class_="a-size-small a-color-secondary a-text-normal")
    )

    # NOTE(review): this selector takes the first <span> carrying the
    # "a-size-small" class, which may not be the review counter on every
    # card - verify against the live markup. Non-numeric text yields 0.
    reviews_text = _node_text(item.find("span", class_="a-size-small"))
    reviews_text = reviews_text.replace(",", "").strip()
    numbers_reviews = int(reviews_text) if reviews_text.isdigit() else 0

    return {
        "rank": rank,
        "title": title,
        "author": author,
        "price": _to_float(price_text),
        "score": _to_float(score_text),
        "type_cover": type_cover,
        "numbers_reviews": numbers_reviews,
        "date_scraped": date_scraped,
    }


def scrape_amazon_bestsellers(html=None):
    """Scrape the Amazon best-sellers books page into a list of dicts.

    Args:
        html: Optional markup (str or bytes) to parse instead of downloading
            the page, e.g. a previously saved copy for offline debugging.
            When omitted, the page is fetched over HTTP.

    Returns:
        A list with one dict per ``div#gridItemRoot`` element, with the keys
        ``rank``, ``title``, ``author``, ``price``, ``score``, ``type_cover``,
        ``numbers_reviews`` and ``date_scraped`` (``YYYY-MM-DD``). The list is
        empty when no such element is found.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    if html is None:
        url = "https://www.amazon.com/best-sellers-books-Amazon/zgbs/books/"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # A timeout prevents the script from hanging forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=30)
        # Fail loudly on 4xx/5xx instead of silently parsing an error page.
        response.raise_for_status()
        html = response.content

    soup = BeautifulSoup(html, 'html.parser')

    # Computed once so every row of a single run carries the same date.
    date_scraped = datetime.now().strftime("%Y-%m-%d")

    books = [
        _parse_book(item, date_scraped)
        for item in soup.find_all("div", id="gridItemRoot")
    ]
    print(books)
    return books
def save_to_csv(books, filename):
    """Write book records to *filename* as a UTF-8 encoded CSV file.

    Args:
        books: Sequence of dicts. The keys of the first dict become the CSV
            header and column order.
        filename: Path of the file to create or overwrite.

    An empty *books* sequence produces an empty file instead of raising
    ``IndexError``, because there is no first record to take the header from.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        if not books:
            return
        fieldnames = list(books[0].keys())
        dict_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(books)
if __name__ == "__main__":
    # Script entry point: scrape the listing, persist it, then report the count.
    bestsellers = scrape_amazon_bestsellers()
    save_to_csv(books=bestsellers, filename="amazon_bestsellers.csv")
    print(f"Scraped {len(bestsellers)} books and saved to amazon_bestsellers.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment