Recursive website content scrape
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def is_valid(url, base_url):
    # Only follow links on the same host as the starting URL
    parsed = urlparse(url)
    return bool(parsed.netloc) and parsed.netloc == urlparse(base_url).netloc

def is_binary(url):
    # List of binary file extensions (kept for reference; the hack below is used instead)
    binary_extensions = ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.msi', '.png',
                         '.dmg', '.zip', '.jpg', '.tif', '.mp3', '.avi', '.mp4']
    # return any(url.endswith(ext) for ext in binary_extensions)
    # Hack: treat anything that does not end in ".html" as binary so that only
    # proper HTML pages are fetched
    return not url.endswith(".html")

def get_all_links(url, base_url):
    # Yield every same-site, non-binary link found on the given page
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    for a_tag in soup.find_all('a', href=True):
        href = urljoin(url, a_tag.attrs['href'])  # resolve relative links
        if is_valid(href, base_url) and not is_binary(href):
            yield href

def get_page_text(url):
    # Skip binary file types
    if is_binary(url):
        print(f"Skipping binary file: {url}")
        return ""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text()

def crawl(url, base_url, output_file):
    visited = set()
    with open(output_file, 'w', encoding='utf-8') as out_file:  # specify utf-8 encoding here
        def _crawl(url):
            if url in visited:
                return
            visited.add(url)
            print(f"Visiting: {url}")
            try:
                page_text = get_page_text(url)
                if page_text:  # Only write if page_text is not empty
                    out_file.write(f"\n\n---- {url} ----\n\n")
                    out_file.write(page_text)
            except Exception as e:
                print(f"Error visiting {url}: {e}")
            # Recurse into every link on the page, ignoring per-link failures
            for link in get_all_links(url, base_url):
                try:
                    _crawl(link)
                except Exception:
                    pass
        try:
            _crawl(url)
        except Exception:
            pass

# Usage
base_url = 'https://www.fredhutch.org/'
output_file = 'output.txt'
crawl(base_url, base_url, output_file)
## How to clean up afterwards:
# grep -v '^[[:space:]]*$' output.txt > output-b.txt    # drop blank lines
# awk '!seen[$0]++' output-b.txt > output.txt           # drop duplicate lines, keeping first occurrence
# rm output-b.txt
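
# The shell pipeline above can also be done in Python. The helper below is a
# sketch and not part of the original gist: clean_output() is a hypothetical
# name, and it assumes the crawl has already written output.txt. It mirrors the
# grep/awk steps by dropping blank lines and keeping only the first occurrence
# of each line.
def clean_output(path='output.txt'):
    seen = set()
    cleaned = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # drop blank lines (grep -v '^[[:space:]]*$')
            if line in seen:
                continue  # drop duplicate lines (awk '!seen[$0]++')
            seen.add(line)
            cleaned.append(line)
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(cleaned)

# Uncomment to run the cleanup instead of the shell commands:
# clean_output()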