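"""Build a Markdown reading list from Amazon book search results.

Drives Firefox through Selenium: searches Amazon's book category for a topic,
collects product-page links across the result pages, then walks each chosen
book interactively (title, authors, year, description) and prints one
Markdown bullet per book at the end.
"""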
import time

import blessings
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

# The original gist also imported sys, google, random, bs4.BeautifulSoup,
# urllib.request, http.cookiejar, subprocess, and operator.itemgetter,
# but none of them are used below.

browser = webdriver.Firefox()
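# Note: the find_element_by_* / find_elements_by_* helpers below are the
# Selenium 3 API this gist was written against. On Selenium 4+ the equivalent
# call is find_element(By.PARTIAL_LINK_TEXT, "..."), with
# "from selenium.webdriver.common.by import By".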
# import Resource
topic = input("Topic: ").replace(" ", "+")
t = blessings.Terminal()

# Amazon book search (stripbooks alias); the topic becomes the keyword query.
amz_base = "https://www.amazon.com/s/?url=search-alias%3Dstripbooks&field-keywords="
browser.get(amz_base + topic)

# Pressing Enter keeps the default and filters the results to Paperback.
change_default_format = input("Default format is paperback. Change? ")
if not change_default_format:
    browser.find_element_by_partial_link_text("Paperback").click()

pages_to_search = int(input("Pages to search? (About 10-12 books per page) "))
amz_links = set()
for i in range(pages_to_search):
    # Each search result title links to its product detail page.
    titles = browser.find_elements_by_class_name("s-access-detail-page")
    for title in titles:
        # Drop the "/ref=..." tracking suffix so duplicate URLs collapse in the set.
        title_url = title.get_attribute('href').split('/ref', 1)[0]
        print(title_url)
        amz_links.add(title_url)
    time.sleep(5)
    # Scroll to the bottom so the pagination link is in view, then go to the next page.
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        browser.find_element_by_partial_link_text('Next Page').click()
    except NoSuchElementException:
        # No "Next Page" link: this was the last page of results.
        break

print("Total book links: " + str(len(amz_links)))
books = list()
time.sleep(5)

count = 0
for book_url in amz_links:
    count += 1
    print(str(count) + " of " + str(len(amz_links)))
    book = dict()
    print(book_url)
    browser.get(book_url)
    #book['title'] = browser.find_element_by_id('productTitle').text
    #print("Title: " + book['title'])

    # Type anything to keep the book; press Enter to skip it.
    use = input("Use book? ")
    if not use:
        continue
    book['url'] = book_url

    title = browser.find_element_by_id('productTitle').text
    print("TITLE: " + title)
    # Optionally cut the title at a substring (e.g. a subtitle separator).
    truncate_at = input("TRUNCATE: ")
    if not truncate_at:
        book['title'] = title
    else:
        book['title'] = title.split(truncate_at, 1)[0]
    print("TITLE TO USE: " + book['title'])
    # Collect author names, skipping contributors credited with roles other than "(Author)".
    authors = browser.find_elements_by_class_name('author')
    author_names = list()
    for author in authors:
        name = author.text
        if "(Author)" not in name:
            continue
        name = name.split(' (Au', 1)[0]
        author_names.append(name)

    num_of_authors = len(author_names)
    author_string = ""
    if num_of_authors == 1:
        author_string = author_names[0]
    else:
        # With multiple authors, keep last names only, with a chance to correct each one.
        for i in range(len(author_names)):
            last_name = author_names[i].split(' ', -1)[-1]
            print("Original: " + author_names[i] + "\n"
                  "Last Name: " + last_name)
            fix_last_name = input("Fix last name: ")
            if not fix_last_name:
                author_names[i] = last_name
            else:
                author_names[i] = fix_last_name
        if num_of_authors == 2:
            author_string = author_names[0] + " and " + author_names[1]
        if num_of_authors == 3:
            author_string = author_names[0] + ", " + author_names[1] + ", and " + author_names[2]
        if num_of_authors > 3:
            author_string = author_names[0] + " et al."
    print(author_string)
    book['author'] = author_string
    # Scroll the Product Details section into view so the year can be read off the page.
    details_element = browser.find_element_by_xpath("//*[contains(text(), 'Product Details')]")
    browser.execute_script("return arguments[0].scrollIntoView();", details_element)
    book['year'] = input("Publication year: ")
    book['description'] = input("Description: ")

    # Markdown list item: " - [_Title_](url) (year), by Author, description"
    book_string = " - [_{title}_]({url}) ({year}), by {author}, {description}\n".format(**book)
    print(t.yellow(book_string))
    book['string'] = book_string
    books.append(book)

for book in books:
    print(book['string'])
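# Optional (not part of the original gist): a minimal sketch for saving the
# generated list to a Markdown file instead of only printing it, assuming a
# "reading-list.md" filename is acceptable:
#
#   with open("reading-list.md", "w") as outfile:
#       for book in books:
#           outfile.write(book['string'])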