kmcquade · November 16, 2024 20:48
diff --git a/merge_files.py b/merge_files.py
 """
 This script merges all files in a specified repository directory into a single text file.
 It includes features for filtering, excluding specific directories, and splitting the output
 into smaller files. By default, it excludes test-related directories unless specified otherwise.

 ### Man page

 usage: merge_files.py [-h] [-o OUTPUT] [--split SPLIT] [--filter-extensions FILTER_EXTENSIONS] [--no-exclude-tests] repo_path

 Merge all files in a repository into a single text file.

 positional arguments:
  repo_path             Path to the root of the repository.

 options:
  -h, --help            show this help message and exit
  -o OUTPUT, --output OUTPUT
                        Output file name (default: merged_repository.txt).
  --split SPLIT         Split the merged output into the specified number of files.
  --filter-extensions FILTER_EXTENSIONS
                        Comma-separated list of file extensions to include (e.g., '.py,.txt').
  --no-exclude-tests    Include files in directories containing 'test' or 'tests' in their names.

 ### Features:
 1. **Merge All Files**: Collects all files in the specified repository (excluding `.git` and test directories by default) into a single text file.
 2. **File Filtering**: Use --filter-extensions to include only files with specific extensions (e.g., .py, .txt).
 3. **Exclude Test Directories**: Automatically excludes directories containing "test" or "tests" in their names. This behavior can be disabled using the --no-exclude-tests flag.
 4. **Splitting Output**: Optionally split the merged output into smaller files using --split with the desired number of parts.
 5. **File Size Reporting**: Displays the size of the final merged file or each split file in MB.

 ### Usage Examples:
 1. **Merge all files in a repository**: Merge all files (excluding `.git` and test directories) into a single file named `merged_repository.txt`:
   python merge_files.py /path/to/repository

 2. **Merge files with specific extensions**: Merge only `.py` and `.txt` files:
   python merge_files.py /path/to/repository --filter-extensions .py,.txt

 3. **Merge and include test directories**: Include files in `test` or `tests` directories by using --no-exclude-tests:
   python merge_files.py /path/to/repository --no-exclude-tests

 4. **Split the merged output**: Merge all files and split the output into 5 parts:
   python merge_files.py /path/to/repository --split 5

 5. **Custom output file name**: Specify a custom output file name and include only `.md` files:
   python merge_files.py /path/to/repository -o output.txt --filter-extensions .md

 6. **Combine multiple options**: Merge `.py` files, include test directories, and split the output into 3 parts:
   python merge_files.py /path/to/repository --filter-extensions .py --no-exclude-tests --split 3

 ### Notes:
 - The script skips files in the `.git` directory by default.
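 - The merged output file is written to the path given by -o/--output (relative to the current working directory by default); split files are created next to it, named after it with an "_NN.txt" suffix.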
 - The --split option calculates the number of lines per file and divides the merged output evenly.
 """

 import os
 import argparse
 import math

 # Function to get a site tree, with optional filtering
 def get_site_tree(root_dir, extensions=None, exclude_tests=True):
    """Return the relative paths of the files under ``root_dir`` to merge.

    Directories named exactly ``.git`` are always skipped. When
    ``exclude_tests`` is true, directories whose own name contains "test"
    (case-insensitive) are skipped as well. Both checks look only at the
    names of directories *below* ``root_dir``, so the location of the
    repository on disk (e.g. ``/home/tester/repo``) and look-alike names
    such as ``.github`` never cause files to be dropped.

    Args:
        root_dir: Directory to walk.
        extensions: Optional iterable of suffixes (e.g. ``[".py", ".txt"]``).
            When given and non-empty, only paths ending in one of them are
            returned.
        exclude_tests: Skip test directories when true (the default).

    Returns:
        A list of paths relative to ``root_dir``. Within each directory,
        files come first (sorted), then sub-directories in sorted order.
    """
    # str.endswith accepts a tuple, so a single call checks every suffix.
    suffixes = tuple(extensions) if extensions else None

    def keep_dir(name):
        if name == ".git":
            return False
        return not (exclude_tests and "test" in name.lower())

    tree = []
    for root, dirs, files in os.walk(root_dir):
        # Prune in place so os.walk never descends into skipped directories;
        # sorting makes the output order reproducible across platforms.
        dirs[:] = sorted(d for d in dirs if keep_dir(d))
        for file in sorted(files):
            relative_path = os.path.relpath(os.path.join(root, file), root_dir)
            if suffixes and not relative_path.endswith(suffixes):
                continue
            tree.append(relative_path)
    return tree

 # Function to merge files
 def merge_files(repo_path, output_file, extensions=None, exclude_tests=True):
    """Merge the selected files under ``repo_path`` into ``output_file``.

    The output begins with a "Repository Tree" listing of every path that
    will be merged, followed by each file's contents introduced by a
    separator line and a ``# file name: <path>`` header. Progress is printed
    to stdout.

    Args:
        repo_path: Root directory of the repository to merge.
        output_file: Path of the merged text file to create or overwrite.
        extensions: Optional list of suffixes, passed to ``get_site_tree``
            to restrict which files are merged.
        exclude_tests: Passed to ``get_site_tree``; skip test directories
            when true (the default).
    """
    # The output file often lives inside the repository (e.g. when the script
    # is run from the repo root). Merging it into itself would read a file
    # that is being written, so drop it from the list before opening anything.
    output_real = os.path.realpath(output_file)
    tree = [
        path
        for path in get_site_tree(repo_path, extensions, exclude_tests)
        if os.path.realpath(os.path.join(repo_path, path)) != output_real
    ]
    total_files = len(tree)

    # Inputs are decoded as UTF-8 below, so write UTF-8 too; the platform
    # default encoding may be unable to represent characters that were read.
    with open(output_file, "w", encoding="utf-8") as outfile:
        # Write site tree at the top
        print("Writing repository tree...")
        outfile.write("Repository Tree:\n")
        outfile.writelines(f"- {path}\n" for path in tree)
        outfile.write("\n")

        # Write each file's contents
        print(f"Found {total_files} files. Merging contents...")
        for idx, path in enumerate(tree, start=1):
            file_path = os.path.join(repo_path, path)
            if not os.path.isfile(file_path):
                # Listed by the walk but not a regular file right now
                # (e.g. a dangling symlink); nothing to read.
                continue
            print(f"[{idx}/{total_files}] Processing: {path}")
            outfile.write("--------------------------------------------------------------------------------\n")
            outfile.write(f"# file name: {path}\n\n")
            # errors="ignore" drops undecodable bytes instead of aborting the
            # whole merge on a single non-UTF-8 file.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as infile:
                outfile.write(infile.read())
            outfile.write("\n")

    print(f"Merge complete! Output saved to {output_file}")

 # Function to split a file into chunks
 def split_into_chunks(output_file, prefix, num_parts):
    """Split ``output_file`` into ``num_parts`` files of near-equal line count.

    Part files are named ``{prefix}NN.txt`` with a two-digit, zero-padded
    index starting at ``00``; existing files with those names are replaced.
    Lines are distributed as evenly as possible: part sizes differ by at most
    one line, with the earlier parts receiving the extra lines. If the input
    has fewer lines than ``num_parts``, the trailing parts are created empty.

    Args:
        output_file: Path of the file to split.
        prefix: Path prefix for the part files.
        num_parts: Number of part files to create; must be at least 1.

    Returns:
        The list of part file names, in order.

    Raises:
        ValueError: If ``num_parts`` is less than 1.
    """
    if num_parts < 1:
        raise ValueError(f"num_parts must be at least 1, got {num_parts}")

    print(f"Splitting file {output_file} into {num_parts} parts with prefix {prefix}...")

    split_files = []
    for i in range(num_parts):
        split_file_name = f"{prefix}{str(i).zfill(2)}.txt"
        if os.path.exists(split_file_name):
            print(f"Split file {split_file_name} already exists. Removing and replacing it.")
            os.remove(split_file_name)

        split_files.append(split_file_name)

    # Binary mode copies the bytes verbatim, so splitting works regardless of
    # the encoding the merged file was written in.
    with open(output_file, "rb") as infile:
        lines = infile.readlines()

    # The first `extra` parts take one additional line so no part is starved.
    base, extra = divmod(len(lines), num_parts)
    start_idx = 0
    for i, split_file_name in enumerate(split_files):
        end_idx = start_idx + base + (1 if i < extra else 0)

        with open(split_file_name, "wb") as split_file:
            split_file.writelines(lines[start_idx:end_idx])
        if end_idx > start_idx:
            print(f"Created {split_file_name} with lines {start_idx + 1} to {end_idx}")
        else:
            print(f"Created {split_file_name} with no lines (input has fewer lines than parts)")
        start_idx = end_idx

    return split_files


 # Function to print file size in MB
 def print_file_size(file_path):
    """Print the size of ``file_path`` in MB (1024 * 1024 bytes), to two decimals."""
    bytes_per_mb = 1024 * 1024
    size_in_bytes = os.path.getsize(file_path)
    megabytes = size_in_bytes / bytes_per_mb
    print(f"File '{file_path}' size: {megabytes:.2f} MB")

 # Main function to handle arguments
 def main(argv=None):
    """Parse command-line arguments, merge the repository, optionally split.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.
    """
    parser = argparse.ArgumentParser(description="Merge all files in a repository into a single text file.")
    parser.add_argument("repo_path", help="Path to the root of the repository.")
    parser.add_argument("-o", "--output", default="merged_repository.txt", help="Output file name (default: merged_repository.txt).")
    parser.add_argument("--split", type=int, help="Split the merged output into the specified number of files.")
    parser.add_argument("--filter-extensions", help="Comma-separated list of file extensions to include (e.g., '.py,.txt').")
    parser.add_argument("--no-exclude-tests", action="store_true", help="Include files in directories containing 'test' or 'tests' in their names.")
    args = parser.parse_args(argv)

    # Reject zero/negative counts up front; otherwise 0 is silently treated
    # as "do not split" and a negative value produces no part files at all.
    if args.split is not None and args.split < 1:
        parser.error("--split must be a positive integer")

    if args.split:
        print(f"Will split file into parts: {args.split}")

    # Parse extensions if provided
    extensions = None
    if args.filter_extensions:
        # Drop empty entries (e.g. from a trailing comma): an empty suffix
        # matches every file name and would disable the filter entirely.
        stripped = (ext.strip() for ext in args.filter_extensions.split(","))
        extensions = [ext for ext in stripped if ext] or None

    # Merge files into a single output
    exclude_tests = not args.no_exclude_tests
    merge_files(args.repo_path, args.output, extensions, exclude_tests)

    # If --split is specified, split the merged file
    if args.split:
        # Part files sit next to the output file and share its base name.
        prefix = os.path.splitext(args.output)[0] + "_"
        print(f"Splitting output into {args.split} parts...")
        print(f"Prefix: {prefix}")
        split_files = split_into_chunks(args.output, prefix, args.split)
        for split_file in split_files:
            print_file_size(split_file)
    else:
        print_file_size(args.output)

 # Run the command-line interface only when this file is executed directly.
 if __name__ == "__main__":
    main()
	"""
	This script merges all files in a specified repository directory into a single text file.
	It includes features for filtering, excluding specific directories, and splitting the output
	into smaller files. By default, it excludes test-related directories unless specified otherwise.

	### Man page

	usage: merge_files.py [-h] [-o OUTPUT] [--split SPLIT] [--filter-extensions FILTER_EXTENSIONS] [--no-exclude-tests] repo_path

	Merge all files in a repository into a single text file.

	positional arguments:
	repo_path Path to the root of the repository.

	options:
	-h, --help show this help message and exit
	-o OUTPUT, --output OUTPUT
	Output file name (default: merged_repository.txt).
	--split SPLIT Split the merged output into the specified number of files.
	--filter-extensions FILTER_EXTENSIONS
	Comma-separated list of file extensions to include (e.g., '.py,.txt').
	--no-exclude-tests Include files in directories containing 'test' or 'tests' in their names.

	### Features:
	1. Merge All Files: Collects all files in the specified repository (excluding `.git` and test directories by default) into a single text file.
	2. File Filtering: Use --filter-extensions to include only files with specific extensions (e.g., .py, .txt).
	3. Exclude Test Directories: Automatically excludes directories containing "test" or "tests" in their names. This behavior can be disabled using the --no-exclude-tests flag.
	4. Splitting Output: Optionally split the merged output into smaller files using --split with the desired number of parts.
	5. File Size Reporting: Displays the size of the final merged file or each split file in MB.

	### Usage Examples:
	1. Merge all files in a repository: Merge all files (excluding `.git` and test directories) into a single file named `merged_repository.txt`:
	python merge_files.py /path/to/repository

	2. Merge files with specific extensions: Merge only `.py` and `.txt` files:
	python merge_files.py /path/to/repository --filter-extensions .py,.txt

	3. Merge and include test directories: Include files in `test` or `tests` directories by using --no-exclude-tests:
	python merge_files.py /path/to/repository --no-exclude-tests

	4. Split the merged output: Merge all files and split the output into 5 parts:
	python merge_files.py /path/to/repository --split 5

	5. Custom output file name: Specify a custom output file name and include only `.md` files:
	python merge_files.py /path/to/repository -o output.txt --filter-extensions .md

	6. Combine multiple options: Merge `.py` files, include test directories, and split the output into 3 parts:
	python merge_files.py /path/to/repository --filter-extensions .py --no-exclude-tests --split 3

	### Notes:
	- The script skips files in the `.git` directory by default.
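	- The merged output file is written to the path given by -o/--output (relative to the current working directory by default); split files are created next to it, named after it with an "_NN.txt" suffix.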
	- The --split option calculates the number of lines per file and divides the merged output evenly.
	"""

	import os
	import argparse
	import math

	# Function to get a site tree, with optional filtering
	def get_site_tree(root_dir, extensions=None, exclude_tests=True):
	    """Return the relative paths of the files under ``root_dir`` to merge.

	    Directories named exactly ``.git`` are always skipped. When
	    ``exclude_tests`` is true, directories whose own name contains "test"
	    (case-insensitive) are skipped as well. Both checks look only at the
	    names of directories *below* ``root_dir``, so the location of the
	    repository on disk (e.g. ``/home/tester/repo``) and look-alike names
	    such as ``.github`` never cause files to be dropped.

	    Args:
	        root_dir: Directory to walk.
	        extensions: Optional iterable of suffixes (e.g. ``[".py", ".txt"]``).
	            When given and non-empty, only paths ending in one of them are
	            returned.
	        exclude_tests: Skip test directories when true (the default).

	    Returns:
	        A list of paths relative to ``root_dir``. Within each directory,
	        files come first (sorted), then sub-directories in sorted order.
	    """
	    # str.endswith accepts a tuple, so a single call checks every suffix.
	    suffixes = tuple(extensions) if extensions else None

	    def keep_dir(name):
	        if name == ".git":
	            return False
	        return not (exclude_tests and "test" in name.lower())

	    tree = []
	    for root, dirs, files in os.walk(root_dir):
	        # Prune in place so os.walk never descends into skipped directories;
	        # sorting makes the output order reproducible across platforms.
	        dirs[:] = sorted(d for d in dirs if keep_dir(d))
	        for file in sorted(files):
	            relative_path = os.path.relpath(os.path.join(root, file), root_dir)
	            if suffixes and not relative_path.endswith(suffixes):
	                continue
	            tree.append(relative_path)
	    return tree

	# Function to merge files
	def merge_files(repo_path, output_file, extensions=None, exclude_tests=True):
	    """Merge the selected files under ``repo_path`` into ``output_file``.

	    The output begins with a "Repository Tree" listing of every path that
	    will be merged, followed by each file's contents introduced by a
	    separator line and a ``# file name: <path>`` header. Progress is printed
	    to stdout.

	    Args:
	        repo_path: Root directory of the repository to merge.
	        output_file: Path of the merged text file to create or overwrite.
	        extensions: Optional list of suffixes, passed to ``get_site_tree``
	            to restrict which files are merged.
	        exclude_tests: Passed to ``get_site_tree``; skip test directories
	            when true (the default).
	    """
	    # The output file often lives inside the repository (e.g. when the script
	    # is run from the repo root). Merging it into itself would read a file
	    # that is being written, so drop it from the list before opening anything.
	    output_real = os.path.realpath(output_file)
	    tree = [
	        path
	        for path in get_site_tree(repo_path, extensions, exclude_tests)
	        if os.path.realpath(os.path.join(repo_path, path)) != output_real
	    ]
	    total_files = len(tree)

	    # Inputs are decoded as UTF-8 below, so write UTF-8 too; the platform
	    # default encoding may be unable to represent characters that were read.
	    with open(output_file, "w", encoding="utf-8") as outfile:
	        # Write site tree at the top
	        print("Writing repository tree...")
	        outfile.write("Repository Tree:\n")
	        outfile.writelines(f"- {path}\n" for path in tree)
	        outfile.write("\n")

	        # Write each file's contents
	        print(f"Found {total_files} files. Merging contents...")
	        for idx, path in enumerate(tree, start=1):
	            file_path = os.path.join(repo_path, path)
	            if not os.path.isfile(file_path):
	                # Listed by the walk but not a regular file right now
	                # (e.g. a dangling symlink); nothing to read.
	                continue
	            print(f"[{idx}/{total_files}] Processing: {path}")
	            outfile.write("--------------------------------------------------------------------------------\n")
	            outfile.write(f"# file name: {path}\n\n")
	            # errors="ignore" drops undecodable bytes instead of aborting the
	            # whole merge on a single non-UTF-8 file.
	            with open(file_path, "r", encoding="utf-8", errors="ignore") as infile:
	                outfile.write(infile.read())
	            outfile.write("\n")

	    print(f"Merge complete! Output saved to {output_file}")

	# Function to split a file into chunks
	def split_into_chunks(output_file, prefix, num_parts):
	    """Split ``output_file`` into ``num_parts`` files of near-equal line count.

	    Part files are named ``{prefix}NN.txt`` with a two-digit, zero-padded
	    index starting at ``00``; existing files with those names are replaced.
	    Lines are distributed as evenly as possible: part sizes differ by at most
	    one line, with the earlier parts receiving the extra lines. If the input
	    has fewer lines than ``num_parts``, the trailing parts are created empty.

	    Args:
	        output_file: Path of the file to split.
	        prefix: Path prefix for the part files.
	        num_parts: Number of part files to create; must be at least 1.

	    Returns:
	        The list of part file names, in order.

	    Raises:
	        ValueError: If ``num_parts`` is less than 1.
	    """
	    if num_parts < 1:
	        raise ValueError(f"num_parts must be at least 1, got {num_parts}")

	    print(f"Splitting file {output_file} into {num_parts} parts with prefix {prefix}...")

	    split_files = []
	    for i in range(num_parts):
	        split_file_name = f"{prefix}{str(i).zfill(2)}.txt"
	        if os.path.exists(split_file_name):
	            print(f"Split file {split_file_name} already exists. Removing and replacing it.")
	            os.remove(split_file_name)

	        split_files.append(split_file_name)

	    # Binary mode copies the bytes verbatim, so splitting works regardless of
	    # the encoding the merged file was written in.
	    with open(output_file, "rb") as infile:
	        lines = infile.readlines()

	    # The first `extra` parts take one additional line so no part is starved.
	    base, extra = divmod(len(lines), num_parts)
	    start_idx = 0
	    for i, split_file_name in enumerate(split_files):
	        end_idx = start_idx + base + (1 if i < extra else 0)

	        with open(split_file_name, "wb") as split_file:
	            split_file.writelines(lines[start_idx:end_idx])
	        if end_idx > start_idx:
	            print(f"Created {split_file_name} with lines {start_idx + 1} to {end_idx}")
	        else:
	            print(f"Created {split_file_name} with no lines (input has fewer lines than parts)")
	        start_idx = end_idx

	    return split_files


	# Function to print file size in MB
	def print_file_size(file_path):
	    """Print the size of ``file_path`` in MB (1024 * 1024 bytes), to two decimals."""
	    file_size = os.path.getsize(file_path) / (1024 * 1024)  # Convert bytes to MB
	    print(f"File '{file_path}' size: {file_size:.2f} MB")

	# Main function to handle arguments
	def main(argv=None):
	    """Parse command-line arguments, merge the repository, optionally split.

	    Args:
	        argv: Argument list to parse. ``None`` (the default) makes argparse
	            read ``sys.argv[1:]``, which is what the script entry point uses.
	    """
	    parser = argparse.ArgumentParser(description="Merge all files in a repository into a single text file.")
	    parser.add_argument("repo_path", help="Path to the root of the repository.")
	    parser.add_argument("-o", "--output", default="merged_repository.txt", help="Output file name (default: merged_repository.txt).")
	    parser.add_argument("--split", type=int, help="Split the merged output into the specified number of files.")
	    parser.add_argument("--filter-extensions", help="Comma-separated list of file extensions to include (e.g., '.py,.txt').")
	    parser.add_argument("--no-exclude-tests", action="store_true", help="Include files in directories containing 'test' or 'tests' in their names.")
	    args = parser.parse_args(argv)

	    # Reject zero/negative counts up front; otherwise 0 is silently treated
	    # as "do not split" and a negative value produces no part files at all.
	    if args.split is not None and args.split < 1:
	        parser.error("--split must be a positive integer")

	    if args.split:
	        print(f"Will split file into parts: {args.split}")

	    # Parse extensions if provided
	    extensions = None
	    if args.filter_extensions:
	        # Drop empty entries (e.g. from a trailing comma): an empty suffix
	        # matches every file name and would disable the filter entirely.
	        stripped = (ext.strip() for ext in args.filter_extensions.split(","))
	        extensions = [ext for ext in stripped if ext] or None

	    # Merge files into a single output
	    exclude_tests = not args.no_exclude_tests
	    merge_files(args.repo_path, args.output, extensions, exclude_tests)

	    # If --split is specified, split the merged file
	    if args.split:
	        # Part files sit next to the output file and share its base name.
	        prefix = os.path.splitext(args.output)[0] + "_"
	        print(f"Splitting output into {args.split} parts...")
	        print(f"Prefix: {prefix}")
	        split_files = split_into_chunks(args.output, prefix, args.split)
	        for split_file in split_files:
	            print_file_size(split_file)
	    else:
	        print_file_size(args.output)

	# Run the command-line interface only when this file is executed directly.
	if __name__ == "__main__":
	    main()