Skip to content

Instantly share code, notes, and snippets.

@AnderRV
Last active August 11, 2021 15:09
Show Gist options
  • Save AnderRV/3d29b7c362ba80d2de67e56b87d8292a to your computer and use it in GitHub Desktop.
Save AnderRV/3d29b7c362ba80d2de67e56b87d8292a to your computer and use it in GitHub Desktop.
# Shared crawl state, read and updated by crawl():
#   visited  - URLs that have already been fetched
#   to_visit - every pagination link discovered so far
visited, to_visit = set(), set()
# Links are only followed while fewer than this many pages have been visited.
max_visits = 3
def crawl(url):
    """Fetch ``url`` and recursively follow its pagination links.

    The page is downloaded and parsed, ``url`` is recorded in the
    module-level ``visited`` set, and every ``a.page-numbers`` link found on
    the page is added to the module-level ``to_visit`` set. Links that have
    not been visited yet are crawled recursively for as long as fewer than
    ``max_visits`` pages have been visited.

    Args:
        url: Absolute URL of the page to fetch.

    Raises:
        requests.RequestException: Propagated unchanged from
            ``requests.get`` (e.g. connection failure or timeout).
    """
    # Function-scope import so the block does not depend on the file header.
    from urllib.parse import urljoin

    print('Crawl: ', url)
    # Without a timeout, requests waits indefinitely on an unresponsive host.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Mark the page as visited only after it was fetched and parsed.
    visited.add(url)

    for anchor in soup.select('a.page-numbers'):
        href = anchor.get('href')
        if not href:
            # Anchor without a usable href: nothing to queue or fetch.
            continue
        # Resolve relative hrefs against the current page; absolute URLs
        # are returned as they are.
        link = urljoin(url, href)
        to_visit.add(link)
        # The limit is re-evaluated for every link because the recursive
        # calls below grow ``visited`` while this loop is still running.
        if link not in visited and len(visited) < max_visits:
            crawl(link)
# Start crawling at the first page of the shop listing.
crawl('https://scrapeme.live/shop/page/1/')

# Print both sets, one per line:
#   visited  -> {'.../3/', '.../1/', '.../2/'}
#   to_visit -> { ... new ones added, such as pages 5 and 6 ... }
print(visited, to_visit, sep='\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment