# Source: GitHub Gist kmcquade/57bdf16483842f7b6ce430b4e3aea105 by @kmcquade
# (last active November 16, 2024).
# Description: Merge GitHub repository files for use with an LLM, optionally
# splitting the merged output into smaller chunks.
"""
This script merges all files in a specified repository directory into a single text file.
It includes features for filtering, excluding specific directories, and splitting the output
into smaller files. By default, it excludes test-related directories unless specified otherwise.
### Man page
usage: merge_files.py [-h] [-o OUTPUT] [--split SPLIT] [--filter-extensions FILTER_EXTENSIONS] [--no-exclude-tests] repo_path

Merge all files in a repository into a single text file.

positional arguments:
  repo_path             Path to the root of the repository.

options:
  -h, --help            show this help message and exit
  -o OUTPUT, --output OUTPUT
                        Output file name (default: merged_repository.txt).
  --split SPLIT         Split the merged output into the specified number of files.
  --filter-extensions FILTER_EXTENSIONS
                        Comma-separated list of file extensions to include (e.g., '.py,.txt').
  --no-exclude-tests    Include files in directories containing 'test' or 'tests' in their names.

### Features:
1. **Merge All Files**: Collects all files in the specified repository (excluding `.git` and test directories by default) into a single text file.
2. **File Filtering**: Use --filter-extensions to include only files with specific extensions (e.g., .py, .txt).
3. **Exclude Test Directories**: Automatically excludes directories containing "test" or "tests" in their names. This behavior can be disabled using the --no-exclude-tests flag.
4. **Splitting Output**: Optionally split the merged output into smaller files using --split with the desired number of parts.
5. **File Size Reporting**: Displays the size of the final merged file or each split file in MB.
### Usage Examples:
1. **Merge all files in a repository**: Merge all files (excluding `.git` and test directories) into a single file named `merged_repository.txt`:
python merge_files.py /path/to/repository
2. **Merge files with specific extensions**: Merge only `.py` and `.txt` files:
python merge_files.py /path/to/repository --filter-extensions .py,.txt
3. **Merge and include test directories**: Include files in `test` or `tests` directories by using --no-exclude-tests:
python merge_files.py /path/to/repository --no-exclude-tests
4. **Split the merged output**: Merge all files and split the output into 5 parts:
python merge_files.py /path/to/repository --split 5
5. **Custom output file name**: Specify a custom output file name and include only `.md` files:
python merge_files.py /path/to/repository -o output.txt --filter-extensions .md
6. **Combine multiple options**: Merge `.py` files, include test directories, and split the output into 3 parts:
python merge_files.py /path/to/repository --filter-extensions .py --no-exclude-tests --split 3
### Notes:
- The script always skips the `.git` directory; there is no option to include it.
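- The merged output file is written to the path given by -o/--output (a relative path is resolved against the current working directory). Split files are created next to it and named after it: `<output name without extension>_00.txt`, `<output name without extension>_01.txt`, and so on.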
- The --split option calculates the number of lines per file and divides the merged output evenly.
"""
import os
import argparse
import math
# Function to get a site tree, with optional filtering
def get_site_tree(root_dir, extensions=None, exclude_tests=True):
    """Return the relative paths of the files under ``root_dir`` to merge.

    Args:
        root_dir: Directory to walk.
        extensions: Optional iterable of path suffixes (e.g. ``[".py", ".txt"]``).
            When given and non-empty, only files whose relative path ends
            with one of them are kept.
        exclude_tests: When true, skip every directory whose name contains
            "test" (case-insensitive), together with everything below it.

    Returns:
        A list of paths relative to ``root_dir``. Directories and files are
        visited in sorted order, so the result is deterministic.
    """
    suffixes = tuple(extensions) if extensions else ()
    tree = []
    for root, dirs, files in os.walk(root_dir):
        # Prune in place so os.walk never descends into skipped directories.
        # Matching on directory *names* below root_dir (rather than on the
        # whole path string) keeps ".github" in, and stops a "test" or ".git"
        # that merely appears in a parent of root_dir from excluding the
        # entire repository.
        dirs[:] = sorted(
            name
            for name in dirs
            if name != ".git"
            and not (exclude_tests and "test" in name.lower())
        )
        for file in sorted(files):
            relative_path = os.path.relpath(os.path.join(root, file), root_dir)
            # str.endswith accepts a tuple, so one call covers every suffix.
            if suffixes and not relative_path.endswith(suffixes):
                continue
            tree.append(relative_path)
    return tree
# Function to merge files
def merge_files(repo_path, output_file, extensions=None, exclude_tests=True):
    """Merge the selected files of ``repo_path`` into ``output_file``.

    The output starts with a "Repository Tree" listing of every merged path,
    followed by each file's contents under a separator line and a
    ``# file name:`` header. Progress is printed to stdout.

    Args:
        repo_path: Root directory of the repository to merge.
        output_file: Path of the text file to create (overwritten if present).
        extensions: Optional list of path suffixes passed to ``get_site_tree``.
        exclude_tests: Passed to ``get_site_tree``; when true, test
            directories are left out.
    """
    # An output file left inside the repository by a previous run would
    # otherwise be listed and then read while it is being rewritten.
    output_real = os.path.realpath(output_file)
    tree = [
        path
        for path in get_site_tree(repo_path, extensions, exclude_tests)
        if os.path.realpath(os.path.join(repo_path, path)) != output_real
    ]
    total_files = len(tree)
    # Inputs are decoded as UTF-8, so the output must be UTF-8 as well;
    # relying on the locale encoding can raise UnicodeEncodeError on write.
    with open(output_file, "w", encoding="utf-8") as outfile:
        # Write site tree at the top
        print("Writing repository tree...")
        outfile.write("Repository Tree:\n")
        outfile.writelines(f"- {path}\n" for path in tree)
        outfile.write("\n")
        # Write each file's contents
        print(f"Found {total_files} files. Merging contents...")
        for idx, path in enumerate(tree, start=1):
            file_path = os.path.join(repo_path, path)
            if not os.path.isfile(file_path):  # e.g. a broken symlink
                continue
            print(f"[{idx}/{total_files}] Processing: {path}")
            outfile.write("--------------------------------------------------------------------------------\n")
            outfile.write(f"# file name: {path}\n\n")
            # errors="ignore" is deliberate best-effort: bytes that are not
            # valid UTF-8 (binary files) are dropped instead of aborting.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as infile:
                outfile.write(infile.read())
            outfile.write("\n")
    print(f"Merge complete! Output saved to {output_file}")
# Function to split a file into chunks
def split_into_chunks(output_file, prefix, num_parts):
    """Split ``output_file`` line-wise into ``num_parts`` files.

    The parts are named ``{prefix}00.txt``, ``{prefix}01.txt``, ... and hold
    consecutive runs of lines. Sizes differ by at most one line: when the
    line count is not divisible by ``num_parts`` the first parts get the
    extra lines.

    Args:
        output_file: Path of the file to split.
        prefix: Path prefix for the part files.
        num_parts: Number of parts to create; must be at least 1.

    Returns:
        The list of part file names, in order.

    Raises:
        ValueError: If ``num_parts`` is less than 1.
    """
    if num_parts < 1:
        raise ValueError(f"num_parts must be at least 1, got {num_parts}")
    print(f"Splitting file {output_file} into {num_parts} parts with prefix {prefix}...")
    # Binary mode: only line boundaries matter here, so the bytes are copied
    # verbatim and the split cannot fail on (or alter) the text encoding.
    with open(output_file, "rb") as infile:
        lines = infile.readlines()
    # Spread the remainder over the first parts. Rounding the chunk size up
    # instead would leave trailing parts empty (9 lines / 4 parts -> 3,3,3,0).
    base, extra = divmod(len(lines), num_parts)
    split_files = []
    start_idx = 0
    for i in range(num_parts):
        split_file_name = f"{prefix}{i:02d}.txt"
        if os.path.exists(split_file_name):
            print(f"Split file {split_file_name} already exists. Removing and replacing it.")
            os.remove(split_file_name)
        end_idx = start_idx + base + (1 if i < extra else 0)
        with open(split_file_name, "wb") as split_file:
            split_file.writelines(lines[start_idx:end_idx])
        print(f"Created {split_file_name} with lines {start_idx + 1} to {end_idx}")
        split_files.append(split_file_name)
        start_idx = end_idx
    return split_files
# Function to print file size in MB
def print_file_size(file_path):
    """Print the size of ``file_path`` in MB (bytes / 1024 / 1024), two decimals."""
    bytes_per_mb = 1024 * 1024
    size_in_mb = os.path.getsize(file_path) / bytes_per_mb
    print(f"File '{file_path}' size: {size_in_mb:.2f} MB")
# Main function to handle arguments
def main():
    """Command-line entry point: merge a repository and optionally split the result.

    Parses the arguments, merges the selected files into the output file,
    then either splits that file into ``--split`` parts and reports each
    part's size, or reports the size of the merged file itself.
    Exits with an argparse usage error when ``--split`` is not positive.
    """
    parser = argparse.ArgumentParser(description="Merge all files in a repository into a single text file.")
    parser.add_argument("repo_path", help="Path to the root of the repository.")
    parser.add_argument("-o", "--output", default="merged_repository.txt", help="Output file name (default: merged_repository.txt).")
    parser.add_argument("--split", type=int, help="Split the merged output into the specified number of files.")
    parser.add_argument("--filter-extensions", help="Comma-separated list of file extensions to include (e.g., '.py,.txt').")
    parser.add_argument("--no-exclude-tests", action="store_true", help="Include files in directories containing 'test' or 'tests' in their names.")
    args = parser.parse_args()
    # Reject 0 and negatives up front: 0 used to be ignored silently and a
    # negative count went through the split step without creating any file.
    if args.split is not None and args.split < 1:
        parser.error("--split must be a positive integer")
    if args.split:
        print(f"Will split file into parts: {args.split}")
    # Parse extensions if provided. Empty entries (e.g. from a trailing comma
    # in ".py,") are dropped: an empty suffix matches every file name and
    # would silently disable the filter.
    extensions = None
    if args.filter_extensions:
        parsed = [ext.strip() for ext in args.filter_extensions.split(",")]
        extensions = [ext for ext in parsed if ext] or None
    # Merge files into a single output
    exclude_tests = not args.no_exclude_tests
    merge_files(args.repo_path, args.output, extensions, exclude_tests)
    # If --split is specified, split the merged file
    if args.split:
        # Part files sit next to the output: "<output without extension>_NN.txt".
        prefix = os.path.splitext(args.output)[0] + "_"
        print(f"Splitting output into {args.split} parts...")
        print(f"Prefix: {prefix}")
        split_files = split_into_chunks(args.output, prefix, args.split)
        for split_file in split_files:
            print_file_size(split_file)
    else:
        print_file_size(args.output)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
# (GitHub page footer, not part of the script: sign-up / sign-in prompt for commenting on the gist.)