Find dead links (404) in a repo.
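Usage (a minimal sketch; the filename find_dead_links.py is illustrative, and the dependencies are taken from the script's third-party imports):

pip install requests tqdm colorama
python3 find_dead_links.py /path/to/repo --threads 20 --timeout 5 -o dead_links.txt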
#!/usr/bin/env python3
import os
import re
import requests
import threading
from queue import Queue
import argparse
import subprocess
from tqdm import tqdm
from colorama import init, Fore, Style

# Initialize colorama
init(autoreset=True)

# Regular expression to find HTTPS links, excluding whitespace and common
# delimiters (trailing punctuation is stripped separately below)
URL_REGEX = re.compile(r'https://[^\s\'\"<>()\[\]`]+')

def find_links_in_file(filepath):
    """Extracts HTTPS links from a file."""
    try:
        with open(filepath, 'r', errors='ignore') as f:
            content = f.read()
    except OSError:
        # Unreadable file (e.g. broken symlink, permission denied): skip it
        return []
    urls = URL_REGEX.findall(content)
    # Strip trailing characters that are not part of the URL
    cleaned_urls = []
    for url in urls:
        url = re.sub(r'[\s\'\"<>()\[\]`,.;:]*$', '', url)
        cleaned_urls.append(url)
    return cleaned_urls

def extract_error_message(exception):
    """Simplify the exception message for better readability."""
    if isinstance(exception, requests.exceptions.ConnectionError):
        return "Connection Error"
    elif isinstance(exception, requests.exceptions.Timeout):
        return "Timeout"
    elif isinstance(exception, requests.exceptions.TooManyRedirects):
        return "Too Many Redirects"
    else:
        return "Request Exception"

def worker(url_queue, checked_urls, dead_links_404, error_counts, lock, timeout, progress_bar, headers):
    """Thread worker function to check URLs."""
    while True:
        url = url_queue.get()
        if url is None:
            # Sentinel: no more work for this thread
            break
        with lock:
            if url in checked_urls:
                url_queue.task_done()
                progress_bar.update(1)
                continue
            checked_urls.add(url)
        try:
            # Use GET with stream=True so the response body is not downloaded
            response = requests.get(
                url, timeout=timeout, allow_redirects=True, stream=True, headers=headers)
            status_code = response.status_code
            response.close()  # release the pooled connection
            if status_code == 404:
                with lock:
                    dead_links_404.append(url)
            elif status_code >= 400:
                with lock:
                    # Increment the count for this status code
                    key = f"HTTP {status_code}"
                    error_counts[key] = error_counts.get(key, 0) + 1
            # Otherwise the URL is considered alive
        except requests.RequestException as e:
            error_msg = extract_error_message(e)
            with lock:
                error_counts[error_msg] = error_counts.get(error_msg, 0) + 1
        finally:
            url_queue.task_done()
            progress_bar.update(1)

def get_git_tracked_files(root_dir):
    """Gets the files tracked by Git in the given directory, excluding .sol files."""
    try:
        output = subprocess.check_output(['git', 'ls-files'], cwd=root_dir)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Error: The specified directory is not a Git repository or Git is not installed.")
        return []
    files = output.decode().splitlines()
    # Exclude .sol files
    files = [f for f in files if not f.endswith('.sol')]
    return [os.path.join(root_dir, f) for f in files]

def main():
    parser = argparse.ArgumentParser(
        description='Discover dead HTTPS links in a repository.')
    parser.add_argument('directory', nargs='?',
                        default='.', help='Directory to scan')
    parser.add_argument('-t', '--threads', type=int,
                        default=10, help='Number of worker threads')
    parser.add_argument(
        '-o', '--output', help='Output file to write dead links')
    parser.add_argument('--timeout', type=int, default=10,
                        help='Timeout for HTTP requests in seconds')
    args = parser.parse_args()

    root_dir = os.path.abspath(args.directory)
    num_workers = args.threads
    timeout = args.timeout

    # Custom headers (including User-Agent)
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; LinkChecker/1.0; +https://example.com/linkchecker)'
    }

    # Collect all Git-tracked files in the directory, excluding .sol files
    files_to_scan = get_git_tracked_files(root_dir)
    if not files_to_scan:
        print("No files to scan. Exiting.")
        return
    total_files = len(files_to_scan)
    print(f"Total files to scan: {total_files}")

    # Use a tqdm progress bar for file scanning
    urls_found = []
    for filepath in tqdm(files_to_scan, desc="Scanning files", unit="file"):
        urls_found.extend(find_links_in_file(filepath))
    # De-duplicate while preserving order, so the same URL is not checked
    # again in a later batch
    urls_found = list(dict.fromkeys(urls_found))
    total_urls = len(urls_found)
    print(f"Finished scanning files. Total unique URLs found: {total_urls}")
    if total_urls == 0:
        print("No URLs found to check. Exiting.")
        return

    # Split URLs into batches of 100
    batch_size = 100
    url_batches = [urls_found[i:i + batch_size]
                   for i in range(0, total_urls, batch_size)]
    all_dead_links = []  # Collect all dead links (HTTP 404) across batches
    for batch_num, url_batch in enumerate(url_batches, start=1):
        print(f"\nProcessing batch {batch_num}/{len(url_batches)} with {len(url_batch)} URLs.")

        # Queue of URLs to check in this batch
        url_queue = Queue()
        for url in url_batch:
            url_queue.put(url)

        # Shared data structures for this batch
        dead_links_404 = []  # URLs that returned HTTP 404
        error_counts = {}    # Counts of other errors
        checked_urls = set()
        lock = threading.Lock()

        # Create a tqdm progress bar for URL checking
        progress_bar = tqdm(total=len(url_batch),
                            desc=f"Checking URLs in batch {batch_num}", unit="url")

        # Start worker threads
        threads = []
        for _ in range(num_workers):
            t = threading.Thread(target=worker, args=(
                url_queue, checked_urls, dead_links_404, error_counts,
                lock, timeout, progress_bar, headers))
            t.start()
            threads.append(t)

        # Wait until all URLs in the batch are processed
        url_queue.join()

        # Stop workers
        for _ in range(num_workers):
            url_queue.put(None)
        for t in threads:
            t.join()
        progress_bar.close()

        # Output dead links (404 errors) for this batch
        if dead_links_404:
            print(f"{Fore.RED}Dead links (HTTP 404) found in batch {batch_num}:{Style.RESET_ALL}")
            for url in dead_links_404:
                print(f"{Fore.RED}- {url}{Style.RESET_ALL}")
            all_dead_links.extend(dead_links_404)
        else:
            print(f"{Fore.GREEN}No dead links (HTTP 404) found in batch {batch_num}.{Style.RESET_ALL}")

        # Display a summary of other errors
        if error_counts:
            print(f"{Fore.YELLOW}\nOther errors encountered in batch {batch_num}:{Style.RESET_ALL}")
            for error, count in error_counts.items():
                print(f"{Fore.YELLOW}- {error}: {count} occurrences{Style.RESET_ALL}")
        # Ask the user whether to continue, unless this was the last batch
        if batch_num < len(url_batches):
            while True:
                user_input = input(
                    "Press Enter to continue to the next batch, or type 'q' to quit: ").strip().lower()
                if user_input == '':
                    break
                elif user_input == 'q':
                    print("Exiting.")
                    if all_dead_links and args.output:
                        # Write the dead links found so far to the output file
                        with open(args.output, 'w') as f:
                            for url in all_dead_links:
                                f.write(f"{url}\n")
                    return
                else:
                    print("Invalid input. Please press Enter to continue or 'q' to quit.")
    # After all batches are processed
    if all_dead_links:
        print(f"\n{Fore.RED}Summary of all dead links (HTTP 404) found:{Style.RESET_ALL}")
        for url in all_dead_links:
            print(f"{Fore.RED}- {url}{Style.RESET_ALL}")
        if args.output:
            with open(args.output, 'w') as f:
                for url in all_dead_links:
                    f.write(f"{url}\n")
    else:
        print(f"\n{Fore.GREEN}No dead links (HTTP 404) found in any batch.{Style.RESET_ALL}")


if __name__ == "__main__":
    main()