Last active
September 10, 2024 16:31
-
-
Save stuaxo/f1beb981dc4845921c31fcb4e16f4821 to your computer and use it in GitHub Desktop.
Output files in subdirectories for ingestion by an LLM such as Claude, ChatGPT, etc.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Usage: python dirtollm.py [files or glob patterns...] [options]
# Example: python dirtollm.py "*.py" "*.txt" /path/to/specific/file.py --exclude "*.pyc" --copy --verbose -x --binaries
import argparse | |
import pathlib | |
import fnmatch | |
import sys | |
import os | |
from typing import List, Tuple, Optional | |
# pyperclip is an optional third-party dependency, only used by --copy.
# When it is missing the script still runs and --copy falls back to stdout.
try:
    import pyperclip

    PYPERCLIP_AVAILABLE = True
except ImportError:
    PYPERCLIP_AVAILABLE = False
    pyperclip = None  # Keep linter happy
class FileProcessingError(Exception):
    """Raised to abort processing when a file fails and exit-on-error is set."""
def get_file_content(
    path: pathlib.Path,
    errors: str,
    verbose: bool,
    include_binaries: bool,
    include_empty: bool,
) -> Tuple[Optional[str], Optional[Exception]]:
    """Read *path* as text and decide whether it belongs in the output.

    Args:
        path: File to read.
        errors: Decoding error handler forwarded to ``Path.read_text``.
        verbose: When True, append the exception details to error messages.
        include_binaries: When False, skip files that contain a NUL character
            or that fail to decode.
        include_empty: When False, skip files that are empty or contain only
            whitespace.

    Returns:
        ``(content, None)`` when the file was read and kept,
        ``(None, None)`` when the file is skipped, or
        ``(message, exception)`` when reading failed; ``message`` is a
        ``#:<path>: ...`` line describing the failure.
    """
    # Only the read itself can raise, so keep the try body to that one call.
    try:
        content = path.read_text(errors=errors)
    except UnicodeDecodeError as ude:
        if not include_binaries:
            return None, None
        error_msg = f"#:{path}: Binary file\n"
        if verbose:
            error_msg += f"UnicodeDecodeError details: {ude}\n"
        return f"{error_msg}\n", ude
    except Exception as ex:
        # Deliberately broad: any read failure is reported to the caller as
        # data so that one bad file does not abort the whole run.
        error_msg = f"#:{path}: Read error\n"
        if verbose:
            error_msg += f"Error details: {ex}\n"
        return f"{error_msg}\n", ex

    if not include_empty and not content.strip():
        return None, None
    # A NUL character is used as the marker for binary content.
    if not include_binaries and "\0" in content:
        return None, None
    return content, None
def fn_matches_multiple(file: str, patterns: List[str]) -> bool:
    """Return True when *file* matches at least one pattern in *patterns*.

    Matching uses ``fnmatch.fnmatch``; an empty pattern list never matches.
    """
    return any(fnmatch.fnmatch(file, pattern) for pattern in patterns)
def process_path(
    path: pathlib.Path,
    globs: List[str],
    excludes: List[str],
    listing: bool,
    errors: str,
    verbose: bool,
    exit_on_error: bool,
    include_binaries: bool,
    include_empty: bool,
) -> Tuple[str, int]:
    """Render one file, or recursively every file below a directory.

    Args:
        path: File or directory to process. Anything else yields no output.
        globs: File-name patterns to include; an empty list includes all.
        excludes: File-name patterns to leave out.
        listing: When True, emit only the path of each selected file.
        errors: Decoding error handler forwarded to ``get_file_content``.
        verbose: Forwarded to ``get_file_content``.
        exit_on_error: Raise as soon as a file reports an error.
        include_binaries: Forwarded to ``get_file_content``.
        include_empty: Forwarded to ``get_file_content``.

    Returns:
        ``(output, file_count)``: the rendered text and the number of files
        that contributed to it.

    Raises:
        FileProcessingError: If a file reports an error and *exit_on_error*
            is set.
    """
    if path.is_dir():
        chunks = []
        file_count = 0
        # iterdir() yields entries in arbitrary order; sort so that the
        # output is the same from run to run.
        for child in sorted(path.iterdir()):
            child_output, child_count = process_path(
                child, globs, excludes, listing, errors, verbose,
                exit_on_error, include_binaries, include_empty
            )
            chunks.append(child_output)
            file_count += child_count
        return "".join(chunks), file_count

    if not path.is_file():
        return "", 0

    # Patterns are matched against the bare file name, not the full path.
    if globs and not fn_matches_multiple(path.name, globs):
        return "", 0
    if excludes and fn_matches_multiple(path.name, excludes):
        return "", 0

    if listing:
        return f"{path}\n", 1

    file_output, error = get_file_content(
        path, errors, verbose, include_binaries, include_empty
    )
    output = ""
    file_count = 0
    if file_output is not None:
        output = f"#:{path}:\n" + file_output.rstrip("\n") + "\n\n"
        file_count = 1
    if error and exit_on_error:
        raise FileProcessingError(
            f"Exiting due to error in file: {path}"
        ) from error
    return output, file_count
def dirtollm(
    paths: List[pathlib.Path],
    globs: List[str],
    excludes: List[str],
    listing: bool = False,
    errors: str = "replace",
    verbose: bool = False,
    exit_on_error: bool = False,
    include_binaries: bool = False,
    include_empty: bool = False,
) -> Tuple[str, int]:
    """Process every entry of *paths* and concatenate the results.

    Each path is handed to ``process_path`` with the same options; see that
    function for the meaning of the individual flags.

    Returns:
        ``(output, total_file_count)``: the combined text for all paths, in
        the order given, and the total number of files included.
    """
    chunks = []
    total_file_count = 0
    for path in paths:
        path_output, file_count = process_path(
            path, globs, excludes, listing, errors, verbose,
            exit_on_error, include_binaries, include_empty
        )
        chunks.append(path_output)
        total_file_count += file_count
    return "".join(chunks), total_file_count
def main():
    """Command-line entry point: parse arguments, collect files, emit output.

    Positional arguments that name an existing file or directory are
    processed directly; any other argument is treated as a glob pattern that
    is applied to file names below the current working directory. The result
    is printed, copied to the clipboard (--copy), or summarized (--count).
    Exits with status 1 if processing is aborted by --exit-on-error.
    """
    parser = argparse.ArgumentParser(
        description="Process files based on specified paths or glob patterns.",
        epilog='Example: python dirtollm.py "*.py" "*.txt" /path/to/specific/file.py --exclude "*.pyc" --copy --verbose -x --binaries',
    )
    parser.add_argument("paths", nargs="*", help="Files, directories, or glob patterns to process")
    parser.add_argument(
        "--exclude", nargs="+", help="Glob patterns to exclude", default=[]
    )
    parser.add_argument(
        "--prompt",
        nargs="?",
        const="File contents:",
        help="Specify prompt text to output before the files",
    )
    parser.add_argument(
        "--count",
        action="store_true",
        help="Display the count of files, bytes, and tokens processed",
    )
    parser.add_argument(
        "--copy",
        action="store_true",
        help="Copy output to the clipboard instead of printing to stdout",
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="List all files that match the patterns without showing their contents",
    )
    parser.add_argument(
        "--errors",
        choices=["strict", "ignore", "replace", "backslashreplace"],
        default="replace",
        help="Specify how encoding errors are handled (default: replace)",
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Enable verbose output for errors"
    )
    parser.add_argument(
        "-x",
        "--exit-on-error",
        action="store_true",
        help="Exit on first error encountered",
    )
    parser.add_argument(
        "--binaries", action="store_true", help="Include non unicode files"
    )
    parser.add_argument("--empty", action="store_true", help="Include empty files")
    args = parser.parse_args()

    paths = []
    globs = []
    if not args.paths:
        paths = [pathlib.Path(".")]
        globs = ["*"]
    else:
        for path_or_glob in args.paths:
            path = pathlib.Path(path_or_glob)
            if path.exists():
                search_root = path.resolve()
            else:
                # Not an existing path, so treat it as a glob pattern to be
                # matched against files below the current directory.
                search_root = pathlib.Path.cwd()
                globs.append(path_or_glob)
            # Add each root only once; otherwise every extra glob would walk
            # the current directory again and duplicate its files in the output.
            if search_root not in paths:
                paths.append(search_root)

    try:
        output, file_count = dirtollm(
            paths,
            globs,
            args.exclude,
            listing=args.list,
            errors=args.errors,
            verbose=args.verbose,
            exit_on_error=args.exit_on_error,
            include_binaries=args.binaries,
            include_empty=args.empty,
        )
    except FileProcessingError as fpe:
        print(f"Error: {fpe}", file=sys.stderr)
        sys.exit(1)

    if args.prompt:
        output = f"{args.prompt}\n\n{output}"
    output = output.rstrip("\n")

    byte_count = len(output.encode("utf-8"))
    # Whitespace-separated words serve as a rough stand-in for a token count.
    token_count = len(output.split())

    if args.count:
        print(
            f"Processed {file_count} files, {byte_count} bytes, ~{token_count} tokens."
        )
    elif args.copy:
        if PYPERCLIP_AVAILABLE:
            pyperclip.copy(output)
            print(
                f"Copied to clipboard: {file_count} files, {byte_count} bytes, ~{token_count} tokens."
            )
        else:
            print(
                "Error: --copy requires pyperclip module. Falling back to stdout.",
                file=sys.stderr,
            )
            print(output)
    else:
        print(output)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
very cool dear.