Last active
November 16, 2024 20:48
-
-
Save kmcquade/57bdf16483842f7b6ce430b4e3aea105 to your computer and use it in GitHub Desktop.
Merge GitHub Repository files for usage in an LLM. Optionally split into smaller chunks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script merges all files in a specified repository directory into a single text file.
It includes features for filtering, excluding specific directories, and splitting the output
into smaller files. By default, it excludes test-related directories unless specified otherwise.

### Man page

usage: merge_files.py [-h] [-o OUTPUT] [--split SPLIT] [--filter-extensions FILTER_EXTENSIONS] [--no-exclude-tests] repo_path

Merge all files in a repository into a single text file.

positional arguments:
  repo_path             Path to the root of the repository.

options:
  -h, --help            show this help message and exit
  -o OUTPUT, --output OUTPUT
                        Output file name (default: merged_repository.txt).
  --split SPLIT         Split the merged output into the specified number of files.
  --filter-extensions FILTER_EXTENSIONS
                        Comma-separated list of file extensions to include (e.g., '.py,.txt').
  --no-exclude-tests    Include files in directories containing 'test' or 'tests' in their names.

### Features:

1. **Merge All Files**: Collects all files in the specified repository (excluding `.git` and test directories by default) into a single text file.
2. **File Filtering**: Use --filter-extensions to include only files with specific extensions (e.g., .py, .txt).
3. **Exclude Test Directories**: Automatically excludes directories containing "test" or "tests" in their names. This behavior can be disabled using the --no-exclude-tests flag.
4. **Splitting Output**: Optionally split the merged output into smaller files using --split with the desired number of parts.
5. **File Size Reporting**: Displays the size of the final merged file or each split file in MB.

### Usage Examples:

1. **Merge all files in a repository**: Merge all files (excluding `.git` and test directories) into a single file named `merged_repository.txt`:
    python merge_files.py /path/to/repository
2. **Merge files with specific extensions**: Merge only `.py` and `.txt` files:
    python merge_files.py /path/to/repository --filter-extensions .py,.txt
3. **Merge and include test directories**: Include files in `test` or `tests` directories by using --no-exclude-tests:
    python merge_files.py /path/to/repository --no-exclude-tests
4. **Split the merged output**: Merge all files and split the output into 5 parts:
    python merge_files.py /path/to/repository --split 5
5. **Custom output file name**: Specify a custom output file name and include only `.md` files:
    python merge_files.py /path/to/repository -o output.txt --filter-extensions .md
6. **Combine multiple options**: Merge `.py` files, include test directories, and split the output into 3 parts:
    python merge_files.py /path/to/repository --filter-extensions .py --no-exclude-tests --split 3

### Notes:

- The script always skips files in the `.git` directory; there is no option to include them.
- The merged output file is written to the path given by -o/--output (by default `merged_repository.txt` in the current working directory). Split files are written next to it and named after it: `<output name without extension>_00.txt`, `<output name without extension>_01.txt`, and so on.
- The --split option calculates the number of lines per file and divides the merged output evenly.
""" | |
import os | |
import argparse | |
import math | |
def get_site_tree(root_dir, extensions=None, exclude_tests=True):
    """Return the relative paths of the files under *root_dir* that should be merged.

    Args:
        root_dir: Directory to walk.
        extensions: Optional iterable of filename suffixes (e.g. ``[".py"]``).
            When given and non-empty, only files whose relative path ends
            with one of the suffixes are kept.
        exclude_tests: When true, skip every directory whose own name contains
            "test" (case-insensitive), together with everything below it.

    Returns:
        A list of paths relative to *root_dir*. Directories and files are
        visited in sorted order, so the result is deterministic.
    """
    suffixes = tuple(extensions) if extensions else ()
    tree = []
    for root, dirs, files in os.walk(root_dir):
        # Prune in place so os.walk never descends into skipped directories.
        # Only the directory's own name is inspected: matching against the
        # whole path would drop the entire repository when it lives under
        # something like /tmp/test_repo or project.git, and would mistake
        # ".github" for ".git".
        dirs[:] = sorted(
            name
            for name in dirs
            if name != ".git" and not (exclude_tests and "test" in name.lower())
        )
        for file in sorted(files):
            relative_path = os.path.relpath(os.path.join(root, file), root_dir)
            # str.endswith accepts a tuple, so one call covers every suffix.
            if suffixes and not relative_path.endswith(suffixes):
                continue
            tree.append(relative_path)
    return tree
def merge_files(repo_path, output_file, extensions=None, exclude_tests=True):
    """Write a tree listing plus the contents of every selected file to *output_file*.

    The output starts with a "Repository Tree:" section listing each relative
    path, followed by one section per file consisting of a separator line, a
    ``# file name:`` header, and the file's text.

    Args:
        repo_path: Root directory of the repository to merge.
        output_file: Path of the merged text file to create (overwritten if present).
        extensions: Optional suffix filter, passed through to ``get_site_tree``.
        exclude_tests: Passed through to ``get_site_tree``.
    """
    # Never merge the output into itself: when the output path lies inside the
    # repository, a file left over from a previous run would otherwise be
    # listed and then read back while it is being rewritten.
    output_abs = os.path.abspath(output_file)
    tree = [
        path
        for path in get_site_tree(repo_path, extensions, exclude_tests)
        if os.path.abspath(os.path.join(repo_path, path)) != output_abs
    ]
    total_files = len(tree)

    # Inputs are decoded as UTF-8, so the output is written as UTF-8 too;
    # relying on the platform default can raise UnicodeEncodeError.
    with open(output_file, "w", encoding="utf-8") as outfile:
        # Write site tree at the top
        print("Writing repository tree...")
        outfile.write("Repository Tree:\n")
        outfile.writelines(f"- {path}\n" for path in tree)
        outfile.write("\n")

        # Write each file's contents
        print(f"Found {total_files} files. Merging contents...")
        for idx, path in enumerate(tree, start=1):
            file_path = os.path.join(repo_path, path)
            if not os.path.isfile(file_path):  # Skip anything that is not a regular file
                continue
            print(f"[{idx}/{total_files}] Processing: {path}")
            outfile.write("--------------------------------------------------------------------------------\n")
            outfile.write(f"# file name: {path}\n\n")
            # errors="ignore" drops bytes that are not valid UTF-8 instead of failing.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as infile:
                outfile.write(infile.read())
            outfile.write("\n")
    print(f"Merge complete! Output saved to {output_file}")
def split_into_chunks(output_file, prefix, num_parts):
    """Split *output_file* by lines into *num_parts* files named ``<prefix>NN.txt``.

    Lines are distributed as evenly as possible: part sizes differ by at most
    one line, with the earlier parts receiving the extra lines. Parts are only
    empty when the file has fewer lines than *num_parts*.

    Args:
        output_file: Path of the file to split.
        prefix: Path prefix for the parts; a zero-padded two-digit index and
            ``.txt`` are appended to it.
        num_parts: Number of parts to create; must be at least 1.

    Returns:
        The list of part file names, in order.

    Raises:
        ValueError: If *num_parts* is less than 1.
    """
    if num_parts < 1:
        raise ValueError(f"num_parts must be at least 1, got {num_parts}")

    print(f"Splitting file {output_file} into {num_parts} parts with prefix {prefix}...")
    split_files = [f"{prefix}{str(i).zfill(2)}.txt" for i in range(num_parts)]
    for split_file_name in split_files:
        if os.path.exists(split_file_name):
            print(f"Split file {split_file_name} already exists. Removing and replacing it.")
            os.remove(split_file_name)

    # Work on bytes so the split is exact and independent of the encoding the
    # merged file was written with; lines are delimited by b"\n".
    with open(output_file, "rb") as infile:
        lines = infile.readlines()

    total_lines = len(lines)
    base, extra = divmod(total_lines, num_parts)
    start_idx = 0
    for i, split_file_name in enumerate(split_files):
        # The first `extra` parts take one additional line each.
        end_idx = start_idx + base + (1 if i < extra else 0)
        with open(split_file_name, "wb") as split_file:
            split_file.writelines(lines[start_idx:end_idx])
        if end_idx > start_idx:
            print(f"Created {split_file_name} with lines {start_idx + 1} to {end_idx}")
        else:
            print(f"Created {split_file_name} (empty: fewer lines than parts)")
        start_idx = end_idx
    return split_files
def print_file_size(file_path):
    """Print the size of *file_path* in MB, rounded to two decimals.

    One MB is taken as 1024 * 1024 bytes.

    Args:
        file_path: Path of the file to measure; it is echoed in the message.

    Raises:
        OSError: If the file does not exist or is inaccessible
            (raised by ``os.path.getsize``).
    """
    size_mb = os.path.getsize(file_path) / (1024 * 1024)  # Convert bytes to MB
    print(f"File '{file_path}' size: {size_mb:.2f} MB")
def main():
    """Parse the command line, merge the repository, and optionally split the result.

    After merging, prints the size of the merged file, or of every part when
    --split is given. Exits through ``parser.error`` (status 2) when --split
    is not a positive integer.
    """
    parser = argparse.ArgumentParser(description="Merge all files in a repository into a single text file.")
    parser.add_argument("repo_path", help="Path to the root of the repository.")
    parser.add_argument("-o", "--output", default="merged_repository.txt", help="Output file name (default: merged_repository.txt).")
    parser.add_argument("--split", type=int, help="Split the merged output into the specified number of files.")
    parser.add_argument("--filter-extensions", help="Comma-separated list of file extensions to include (e.g., '.py,.txt').")
    parser.add_argument("--no-exclude-tests", action="store_true", help="Include files in directories containing 'test' or 'tests' in their names.")
    args = parser.parse_args()

    if args.split is not None:
        # Reject 0 and negative counts up front instead of silently skipping
        # the split or producing no parts at all.
        if args.split < 1:
            parser.error("--split must be a positive integer.")
        print(f"Will split file into parts: {args.split}")

    # Parse extensions if provided
    extensions = None
    if args.filter_extensions:
        # Drop empty entries (e.g. from a trailing comma): an empty suffix
        # matches every file name and would disable the filter entirely.
        stripped = (ext.strip() for ext in args.filter_extensions.split(","))
        extensions = [ext for ext in stripped if ext] or None

    # Merge files into a single output
    exclude_tests = not args.no_exclude_tests
    merge_files(args.repo_path, args.output, extensions, exclude_tests)

    # If --split is specified, split the merged file
    if args.split:
        # Parts are named after the output file: "<output stem>_NN.txt".
        prefix = os.path.splitext(args.output)[0] + "_"
        print(f"Splitting output into {args.split} parts...")
        print(f"Prefix: {prefix}")
        split_files = split_into_chunks(args.output, prefix, args.split)
        for split_file in split_files:
            print_file_size(split_file)
    else:
        print_file_size(args.output)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment