import requests
from bs4 import BeautifulSoup
import queue
from threading import Thread

starting_url = 'https://scrapeme.live/shop/page/1/'
visited = set()
max_visits = 100  # careful, it will crawl all the pages
num_workers = 5
data = []

def extract_content(soup):
    for product in soup.select('.product'):
        data.append({
            'id': product.find('a', attrs={'data-product_id': True})['data-product_id'],
            'name': product.find('h2').text,
            'price': product.find(class_='amount').text
        })
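
As a quick check, the extractor can be exercised against a single page before it is wired into the crawler. This is a minimal sketch that reuses the selectors above; the only assumptions are that the starting page defined earlier is fetched directly and that printing a slice of data is enough to inspect the result.

# Sketch: run extract_content on one page and inspect the scraped items.
response = requests.get(starting_url)
soup = BeautifulSoup(response.content, 'html.parser')
extract_content(soup)
print(data[:3])  # first few products as id/name/price dicts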
proxies = {
    'http': 'http://190.64.18.177:80',
    'https': 'http://49.12.2.178:3128',
}
headers = {
    'authority': 'httpbin.org',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
    'sec-ch-ua-mobile': '?0',
    # ...
}
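
The crawl function further down calls a get_html helper that is not shown in these snippets. A minimal sketch, assuming it simply wraps requests.get with the proxies and headers defined above and returns the raw body (the error handling is illustrative, not taken from the original):

def get_html(url):
    # Sketch of the helper used by crawl(): plain requests.get with the
    # proxies and headers defined above; returns empty content on failure.
    try:
        response = requests.get(url, headers=headers, proxies=proxies)
        return response.content
    except Exception as e:
        print(e)
        return ''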
def queue_worker(i, q):
    while True:
        url = q.get()
        if (len(visited) < max_visits and url not in visited):
            crawl(url)
        q.task_done()

q = queue.Queue()
num_workers = 4
for i in range(num_workers):
    Thread(target=queue_worker, args=(i, q), daemon=True).start()
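
With the workers started, the queue still has to be seeded and the main thread has to wait for it to drain. A minimal sketch of that last step, assuming the starting_url defined earlier is the only seed:

q.put(starting_url)  # seed the queue with the first page
q.join()             # block until every queued URL has been processed and marked done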
from threading import Thread
import queue

def queue_worker(i, q):
    while True:
        url = q.get()  # Get an item from the queue, blocks until one is available
        print('to process:', url)
        q.task_done()  # Notifies the queue that the item has been processed

q = queue.Queue()
Thread(target=queue_worker, args=(0, q), daemon=True).start()
import queue

q = queue.Queue()
q.put('https://scrapeme.live/shop/page/1/')

def crawl(url):
    # ...
    links = extract_links(soup)
    for link in links:
        if link not in visited:
            q.put(link)
def crawl(url):
    if not url or url in visited:
        return
    print('Crawl: ', url)
    visited.add(url)
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    extract_content(soup)
    links = extract_links(soup)
    to_visit.update(links)
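
crawl also relies on an extract_links helper that does not appear in these snippets. A minimal sketch, assuming it collects the pagination links the same way the a.page-numbers loops below do, skipping URLs that were already visited:

def extract_links(soup):
    # Sketch: gather pagination hrefs, skipping anything already visited.
    return [a.get('href') for a in soup.select('a.page-numbers')
            if a.get('href') not in visited]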
visited = set()
to_visit = set()
max_visits = 3

def crawl(url):
    print('Crawl: ', url)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    visited.add(url)
    for a in soup.select('a.page-numbers'):
        to_visit.add(a.get('href'))
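
Nothing calls this crawl yet. A short sketch of a sequential driver, assuming it simply pops pending URLs until the max_visits cap is reached; the skip-if-visited check is an assumption, mirroring the threaded worker shown earlier:

# Sketch: simple sequential driver using the sets defined above.
to_visit.add('https://scrapeme.live/shop/page/1/')
while to_visit and len(visited) < max_visits:
    url = to_visit.pop()
    if url not in visited:
        crawl(url)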
import requests
from bs4 import BeautifulSoup

to_visit = set()
response = requests.get('https://scrapeme.live/shop/page/1/')
soup = BeautifulSoup(response.content, 'html.parser')
for a in soup.select('a.page-numbers'):
    to_visit.add(a.get('href'))
print(to_visit)
[
    {
        "date": "08/2020",
        "fuel": "Gasoline",
        "gear": "Manual",
        "link": "/offers/mazda-cx-3-skyactiv-g-121-fwd-6gs-al-edition100-gasoline-0fcc31ce-0548-4ee2-b3c2-a4aba38fe9db",
        "makemodel": "Mazda CX-3",
        "mileage": 546,
        "offerType": "Used",
        "power": "89 kW (121 hp)",