This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import queue | |
from threading import Thread | |
# Crawl configuration and shared state (module-level).
starting_url = 'https://scrapeme.live/shop/page/1/'
visited = set()  # URLs that have already been crawled
max_visits = 100  # careful, it will crawl all the pages
num_workers = 5
data = []  # extracted records are appended here
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
data = []


def extract_content(soup):
    """Append one record per product in *soup* to the module-level ``data`` list.

    Every element matched by the CSS selector ``.product`` contributes a dict
    with three keys:

    * ``id``    - the ``data-product_id`` attribute of the first ``<a>`` that
      carries that attribute,
    * ``name``  - the text of the first ``<h2>``,
    * ``price`` - the text of the first element with class ``amount``.

    Args:
        soup: A parsed document exposing ``select`` (presumably a
            ``BeautifulSoup`` object - confirm against the caller).

    Returns:
        None. Results are accumulated in ``data`` as a side effect.
    """
    for product in soup.select('.product'):
        data.append({
            'id': product.find('a', attrs={'data-product_id': True})['data-product_id'],
            'name': product.find('h2').text,
            'price': product.find(class_='amount').text,
        })
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Proxy endpoints keyed by URL scheme.
proxies = {
    'http': 'http://190.64.18.177:80',
    'https': 'http://49.12.2.178:3128',
}
headers = { | |
'authority': 'httpbin.org', | |
'cache-control': 'max-age=0', | |
'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"', | |
'sec-ch-ua-mobile': '?0', |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def queue_worker(i, q):
    """Consume URLs from *q* forever, crawling each one that is still eligible.

    A URL is crawled only while fewer than ``max_visits`` URLs have been
    visited and the URL itself is not already in ``visited``.

    Args:
        i: Worker index; not used inside the loop.
        q: Queue of URLs to process; must provide ``get`` and ``task_done``.

    This function never returns.
    """
    while True:
        url = q.get()
        try:
            if len(visited) < max_visits and url not in visited:
                crawl(url)
        finally:
            # Always acknowledge the item, even when crawl() raises, so that
            # a caller blocked in q.join() is not left waiting forever.
            q.task_done()
q = queue.Queue() | |
num_workers = 4 | |
for i in range(num_workers): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from threading import Thread | |
def queue_worker(i, q):
    """Print every item taken from *q*, acknowledging each one; never returns.

    Args:
        i: Worker index; not used inside the loop.
        q: Queue to consume from.
    """
    while True:
        url = q.get()  # Get an item from the queue, blocks until one is available
        print('to process:', url)
        q.task_done()  # Notifies the queue that the item has been processed


q = queue.Queue()
# Daemon thread: it will not keep the interpreter alive once the main thread exits.
Thread(target=queue_worker, args=(0, q), daemon=True).start()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import queue | |
q = queue.Queue() | |
q.put('https://scrapeme.live/shop/page/1/') | |
def crawl(url): | |
# ... | |
links = extract_links(soup) | |
for link in links: | |
if link not in visited: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def crawl(url):
    """Fetch and process a single page, recording the links it contains.

    Does nothing when *url* is falsy or already present in ``visited``.
    Otherwise the URL is added to ``visited`` before it is fetched, the HTML
    returned by ``get_html`` is parsed, ``extract_content`` is run on the
    parsed document, and the result of ``extract_links`` is merged into
    ``to_visit``.

    Args:
        url: Address of the page to crawl.
    """
    if not url or url in visited:
        return

    print('Crawl: ', url)
    visited.add(url)

    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    extract_content(soup)

    links = extract_links(soup)
    to_visit.update(links)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
visited = set() | |
to_visit = set() | |
max_visits = 3 | |
def crawl(url): | |
print('Crawl: ', url) | |
response = requests.get(url) | |
soup = BeautifulSoup(response.content, 'html.parser') | |
visited.add(url) | |
for a in soup.select('a.page-numbers'): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
# Collect the href of every pagination link on the first shop page.
to_visit = set()

response = requests.get('https://scrapeme.live/shop/page/1/')
soup = BeautifulSoup(response.content, 'html.parser')

for a in soup.select('a.page-numbers'):
    to_visit.add(a.get('href'))

print(to_visit)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[ | |
{ | |
"date": "08/2020", | |
"fuel": "Gasoline", | |
"gear": "Manual", | |
"link": "/offers/mazda-cx-3-skyactiv-g-121-fwd-6gs-al-edition100-gasoline-0fcc31ce-0548-4ee2-b3c2-a4aba38fe9db", | |
"makemodel": "Mazda CX-3", | |
"mileage": 546, | |
"offerType": "Used", | |
"power": "89 kW (121 hp)", |