Recursive website content scrape
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def is_valid(url, base_url):
    # Only follow links on the same host as the starting URL
    parsed = urlparse(url)
    return bool(parsed.netloc) and parsed.netloc == urlparse(base_url).netloc

def is_binary(url):
    # List of binary file extensions (kept for reference; the hack below is used instead)
    binary_extensions = ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.msi', '.png',
                         '.dmg', '.zip', '.jpg', '.tif', '.mp3', '.avi', '.mp4']
    # return any(url.endswith(ext) for ext in binary_extensions)
    # Hack: treat anything that does not end in ".html" as binary so that only
    # proper HTML pages are fetched
    return not url.endswith(".html")

def get_all_links(url, base_url):
    # Yield every same-site, non-binary link found on the given page
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    for a_tag in soup.find_all('a', href=True):
        href = urljoin(url, a_tag.attrs['href'])  # resolve relative links
        if is_valid(href, base_url) and not is_binary(href):
            yield href

def get_page_text(url):
    # Skip binary file types
    if is_binary(url):
        print(f"Skipping binary file: {url}")
        return ""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text()

def crawl(url, base_url, output_file):
    visited = set()
    with open(output_file, 'w', encoding='utf-8') as out_file:  # specify utf-8 encoding here
        def _crawl(url):
            if url in visited:
                return
            visited.add(url)
            print(f"Visiting: {url}")
            try:
                page_text = get_page_text(url)
                if page_text:  # Only write if page_text is not empty
                    out_file.write(f"\n\n---- {url} ----\n\n")
                    out_file.write(page_text)
            except Exception as e:
                print(f"Error visiting {url}: {e}")
            # Recurse into every link on the page, ignoring per-link failures
            for link in get_all_links(url, base_url):
                try:
                    _crawl(link)
                except Exception:
                    pass
        try:
            _crawl(url)
        except Exception:
            pass

# Usage
base_url = 'https://www.fredhutch.org/'
output_file = 'output.txt'
crawl(base_url, base_url, output_file)
## How to clean up afterwards:
# grep -v '^[[:space:]]*$' output.txt > output-b.txt    # drop blank lines
# awk '!seen[$0]++' output-b.txt > output.txt           # drop duplicate lines, keeping first occurrence
# rm output-b.txt
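
# The shell pipeline above can also be done in Python. The helper below is a
# sketch and not part of the original gist: clean_output() is a hypothetical
# name, and it assumes the crawl has already written output.txt. It mirrors the
# grep/awk steps by dropping blank lines and keeping only the first occurrence
# of each line.
def clean_output(path='output.txt'):
    seen = set()
    cleaned = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # drop blank lines (grep -v '^[[:space:]]*$')
            if line in seen:
                continue  # drop duplicate lines (awk '!seen[$0]++')
            seen.add(line)
            cleaned.append(line)
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(cleaned)

# Uncomment to run the cleanup instead of the shell commands:
# clean_output()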