Find dead links (404) in a repo.
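Usage (a minimal sketch; the filename find_dead_links.py is illustrative, and the dependencies are taken from the script's third-party imports):

pip install requests tqdm colorama
python3 find_dead_links.py /path/to/repo --threads 20 --timeout 5 -o dead_links.txt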
#!/usr/bin/env python3
import os
import re
import requests
import threading
from queue import Queue
import argparse
import subprocess
from tqdm import tqdm
from colorama import init, Fore, Style

# Initialize colorama
init(autoreset=True)

# Regular expression to find HTTPS links, excluding whitespace and common
# delimiters (trailing punctuation is stripped separately below)
URL_REGEX = re.compile(r'https://[^\s\'\"<>()\[\]`]+')

def find_links_in_file(filepath):
    """Extracts HTTPS links from a file."""
    try:
        with open(filepath, 'r', errors='ignore') as f:
            content = f.read()
    except OSError:
        # Unreadable file (e.g. broken symlink, permission denied): skip it
        return []
    urls = URL_REGEX.findall(content)
    # Strip trailing characters that are not part of the URL
    cleaned_urls = []
    for url in urls:
        url = re.sub(r'[\s\'\"<>()\[\]`,.;:]*$', '', url)
        cleaned_urls.append(url)
    return cleaned_urls

def extract_error_message(exception):
    """Simplify the exception message for better readability."""
    if isinstance(exception, requests.exceptions.ConnectionError):
        return "Connection Error"
    elif isinstance(exception, requests.exceptions.Timeout):
        return "Timeout"
    elif isinstance(exception, requests.exceptions.TooManyRedirects):
        return "Too Many Redirects"
    else:
        return "Request Exception"

def worker(url_queue, checked_urls, dead_links_404, error_counts, lock, timeout, progress_bar, headers):
    """Thread worker function to check URLs."""
    while True:
        url = url_queue.get()
        if url is None:
            # Sentinel: no more work for this thread
            break
        with lock:
            if url in checked_urls:
                url_queue.task_done()
                progress_bar.update(1)
                continue
            checked_urls.add(url)
        try:
            # Use GET with stream=True so the response body is not downloaded
            response = requests.get(
                url, timeout=timeout, allow_redirects=True, stream=True, headers=headers)
            status_code = response.status_code
            response.close()  # release the pooled connection
            if status_code == 404:
                with lock:
                    dead_links_404.append(url)
            elif status_code >= 400:
                with lock:
                    # Increment the count for this status code
                    key = f"HTTP {status_code}"
                    error_counts[key] = error_counts.get(key, 0) + 1
            # Otherwise the URL is considered alive
        except requests.RequestException as e:
            error_msg = extract_error_message(e)
            with lock:
                error_counts[error_msg] = error_counts.get(error_msg, 0) + 1
        finally:
            url_queue.task_done()
            progress_bar.update(1)

def get_git_tracked_files(root_dir):
    """Gets the files tracked by Git in the given directory, excluding .sol files."""
    try:
        output = subprocess.check_output(['git', 'ls-files'], cwd=root_dir)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Error: The specified directory is not a Git repository or Git is not installed.")
        return []
    files = output.decode().splitlines()
    # Exclude .sol files
    files = [f for f in files if not f.endswith('.sol')]
    return [os.path.join(root_dir, f) for f in files]

def main():
    parser = argparse.ArgumentParser(
        description='Discover dead HTTPS links in a repository.')
    parser.add_argument('directory', nargs='?',
                        default='.', help='Directory to scan')
    parser.add_argument('-t', '--threads', type=int,
                        default=10, help='Number of worker threads')
    parser.add_argument(
        '-o', '--output', help='Output file to write dead links')
    parser.add_argument('--timeout', type=int, default=10,
                        help='Timeout for HTTP requests in seconds')
    args = parser.parse_args()

    root_dir = os.path.abspath(args.directory)
    num_workers = args.threads
    timeout = args.timeout

    # Custom headers (including User-Agent)
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; LinkChecker/1.0; +https://example.com/linkchecker)'
    }

    # Collect all Git-tracked files in the directory, excluding .sol files
    files_to_scan = get_git_tracked_files(root_dir)
    if not files_to_scan:
        print("No files to scan. Exiting.")
        return
    total_files = len(files_to_scan)
    print(f"Total files to scan: {total_files}")

    # Use a tqdm progress bar for file scanning
    urls_found = []
    for filepath in tqdm(files_to_scan, desc="Scanning files", unit="file"):
        urls_found.extend(find_links_in_file(filepath))
    # De-duplicate while preserving order, so the same URL is not checked
    # again in a later batch
    urls_found = list(dict.fromkeys(urls_found))
    total_urls = len(urls_found)
    print(f"Finished scanning files. Total unique URLs found: {total_urls}")
    if total_urls == 0:
        print("No URLs found to check. Exiting.")
        return

    # Split URLs into batches of 100
    batch_size = 100
    url_batches = [urls_found[i:i + batch_size]
                   for i in range(0, total_urls, batch_size)]
    all_dead_links = []  # Collect all dead links (HTTP 404) across batches
    for batch_num, url_batch in enumerate(url_batches, start=1):
        print(f"\nProcessing batch {batch_num}/{len(url_batches)} with {len(url_batch)} URLs.")

        # Queue of URLs to check in this batch
        url_queue = Queue()
        for url in url_batch:
            url_queue.put(url)

        # Shared data structures for this batch
        dead_links_404 = []  # URLs that returned HTTP 404
        error_counts = {}    # Counts of other errors
        checked_urls = set()
        lock = threading.Lock()

        # Create a tqdm progress bar for URL checking
        progress_bar = tqdm(total=len(url_batch),
                            desc=f"Checking URLs in batch {batch_num}", unit="url")

        # Start worker threads
        threads = []
        for _ in range(num_workers):
            t = threading.Thread(target=worker, args=(
                url_queue, checked_urls, dead_links_404, error_counts,
                lock, timeout, progress_bar, headers))
            t.start()
            threads.append(t)

        # Wait until all URLs in the batch are processed
        url_queue.join()

        # Stop workers
        for _ in range(num_workers):
            url_queue.put(None)
        for t in threads:
            t.join()
        progress_bar.close()

        # Output dead links (404 errors) for this batch
        if dead_links_404:
            print(f"{Fore.RED}Dead links (HTTP 404) found in batch {batch_num}:{Style.RESET_ALL}")
            for url in dead_links_404:
                print(f"{Fore.RED}- {url}{Style.RESET_ALL}")
            all_dead_links.extend(dead_links_404)
        else:
            print(f"{Fore.GREEN}No dead links (HTTP 404) found in batch {batch_num}.{Style.RESET_ALL}")

        # Display a summary of other errors
        if error_counts:
            print(f"{Fore.YELLOW}\nOther errors encountered in batch {batch_num}:{Style.RESET_ALL}")
            for error, count in error_counts.items():
                print(f"{Fore.YELLOW}- {error}: {count} occurrences{Style.RESET_ALL}")
        # Ask the user whether to continue, unless this was the last batch
        if batch_num < len(url_batches):
            while True:
                user_input = input(
                    "Press Enter to continue to the next batch, or type 'q' to quit: ").strip().lower()
                if user_input == '':
                    break
                elif user_input == 'q':
                    print("Exiting.")
                    if all_dead_links and args.output:
                        # Write the dead links found so far to the output file
                        with open(args.output, 'w') as f:
                            for url in all_dead_links:
                                f.write(f"{url}\n")
                    return
                else:
                    print("Invalid input. Please press Enter to continue or 'q' to quit.")
    # After all batches are processed
    if all_dead_links:
        print(f"\n{Fore.RED}Summary of all dead links (HTTP 404) found:{Style.RESET_ALL}")
        for url in all_dead_links:
            print(f"{Fore.RED}- {url}{Style.RESET_ALL}")
        if args.output:
            with open(args.output, 'w') as f:
                for url in all_dead_links:
                    f.write(f"{url}\n")
    else:
        print(f"\n{Fore.GREEN}No dead links (HTTP 404) found in any batch.{Style.RESET_ALL}")


if __name__ == "__main__":
    main()