Created
July 11, 2023 12:02
-
-
Save smileart/9de3a68287e9d08da7b38f6c1df2d0a5 to your computer and use it in GitHub Desktop.
A script to scrape Digital Content Lists from Amazon
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// NOTE: This script is meant to be pasted into the browser DevTools console;
// it relies on the console-only helpers `copy`, `$` and `$$`.
// NOTE: The helpers are captured up front because, after a new page has been
// loaded, `$`, `$$`, etc. are undefined for the still-running script.
const selector = $;
const selectorAll = $$;
const clipboardCopy = copy;

// Accumulates the formatted "- 'title' by author" lines gathered from every page.
const allBooksList = [];
/**
 * Collects the books listed on the currently displayed page.
 *
 * Titles are read from `.digital_entity_title` elements and authors from
 * `div[id^="content-author"]` elements; the two lists are paired by index.
 * Each entry is also logged to the console as it is collected.
 *
 * @returns {string[]} One `- 'title' by author` line per title found.
 */
function scrapBooks() {
  const titles = selectorAll('.digital_entity_title').map((el) => el.innerText);
  const authors = selectorAll('div[id^="content-author"]').map((el) => el.innerText);
  // NOTE: for Audible use this instead...
  // const authors = selectorAll('.digital_entity_details > .information_row').map((a)=>{ return a.innerText }).filter((e) => { return e != '' })

  const books = [];
  for (const [i, title] of titles.entries()) {
    // Fewer author nodes than titles would otherwise be rendered as "by undefined".
    const author = authors[i] ?? 'Unknown author';
    console.log(`'${title}' by ${author}`);
    books.push(`- '${title}' by ${author}`);
  }
  return books;
}
/**
 * Returns a promise that resolves once the given delay has elapsed.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to `setTimeout`.
 * @returns {Promise<void>} Resolves (with no value) when the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
// Drive the pagination: open every page in turn, scrape it, and finally copy
// the combined list to the clipboard.
// The saved `selectorAll` helper is used instead of `$$` for consistency with
// the rest of the script.
const pageLinks = selectorAll('a[id^="page-"]');
// The last pagination link is expected to carry the highest page number.
const lastPageNumber = Number(pageLinks.at(-1)?.textContent);
const hasValidPageCount = Number.isInteger(lastPageNumber) && lastPageNumber >= 1;
if (pageLinks.length > 0 && !hasValidPageCount) {
  console.warn('Could not read the page count from the pagination links; scraping a single page only');
}
// Without usable pagination, fall back to a single pass over the current page.
const maxPage = hasValidPageCount ? lastPageNumber : 1;
let currentPage = 0;
while (currentPage < maxPage) {
  currentPage += 1;
  const pageLink = selector(`#page-${currentPage}`);
  if (pageLink) {
    pageLink.click();
    // Fixed pause to give the newly requested page time to render.
    await sleep(2000);
  } else if (pageLinks.length > 0) {
    // Pagination exists but this link is missing: stop rather than
    // re-scrape the page that is already displayed.
    console.warn(`Link #page-${currentPage} not found, stopping early`);
    break;
  }
  console.log(`Handling page #${currentPage}`);
  allBooksList.push(...scrapBooks());
  console.log(`Scrapped ${allBooksList.length} books so far...`);
}
clipboardCopy(allBooksList.join("\n"));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment