Last active
September 7, 2025 13:39
-
-
Save stuaxo/f1beb981dc4845921c31fcb4e16f4821 to your computer and use it in GitHub Desktop.
Output the files in a directory tree for ingestion by an LLM such as Claude or ChatGPT.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Usage: python dirtollm.py [paths...] [--dir /path/to/directory] [--glob *.py] [--exclude *.pyc] [--exclude-regex ".*\.log"] [--include-dotfiles] [--no-gitignore] [--list] [--prompt [TEXT]] [--copy | --copy-append]
import argparse
import fnmatch
import pathlib
import re
import sys

try:
    import pyperclip
except ImportError:
    pyperclip = None
def get_gitignore_regex(path):
    """
    Read a .gitignore file and convert its patterns into compiled regexes.

    This is a simplified converter, not a full gitignore implementation.
    Each returned regex is meant to be used with ``fullmatch`` against an
    absolute path string that uses '/' as its separator.

    Supported syntax:
      * blank lines and '#' comments are ignored;
      * '*' matches any run of characters, '?' matches one non-'/' character;
      * a leading '/' anchors the pattern to the .gitignore's own directory;
      * any other pattern matches at any depth below that directory;
      * a match also covers everything beneath the matched entry.

    Negation patterns ('!...') are skipped because they cannot be expressed
    as an exclusion regex. Character classes are matched literally.

    Args:
        path: Location of the .gitignore file (str or path-like).

    Returns:
        A list of compiled regular expressions; empty if *path* is not a file.
    """
    gitignore = pathlib.Path(path)
    if not gitignore.is_file():
        return []

    # Escape the directory so characters such as '+', '(' or '.' in its name
    # are matched literally instead of being read as regex syntax. The
    # rstrip keeps the filesystem root ('/') from producing a doubled slash.
    base_prefix = re.escape(str(gitignore.parent.resolve()).rstrip('/'))

    regexes = []
    with gitignore.open(encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            if line.startswith('!'):
                # Re-include rules are not representable as exclusions.
                continue

            anchored = line.startswith('/')
            core = line.strip('/')
            if not core:
                # A bare '/' would otherwise exclude the whole tree.
                continue

            # Escape everything, then re-enable the two supported wildcards.
            regex = re.escape(core).replace('\\*', '.*').replace('\\?', '[^/]')

            # Anchored patterns sit directly under the base directory;
            # others may be preceded by any number of whole path components
            # (so 'env' matches 'a/env' but not 'myenv').
            separator = '/' if anchored else '/(?:.*/)?'

            # The optional tail makes an ignored directory cover its contents.
            full_pattern = base_prefix + separator + regex + '(?:/.*)?'
            try:
                regexes.append(re.compile(full_pattern))
            except re.error as e:
                print(
                    f"Warning: Could not compile regex for gitignore pattern '{line}': {e}",
                    file=sys.stderr,
                )
    return regexes
def append_file_content(output, path):
    """
    Return *output* extended with a '#:<path>:' header and the file's text.

    Files that are not valid UTF-8, and files whose text is empty, leave
    *output* unchanged. Any other read failure is recorded in place of the
    content as a 'Skipped (...)' note under the same header.

    Args:
        output: The text accumulated so far.
        path: A pathlib.Path-like object providing ``read_text``.

    Returns:
        The (possibly extended) output string.
    """
    try:
        text = path.read_text(encoding='utf-8')
    except UnicodeDecodeError:
        # Not decodable as UTF-8 (e.g. binary data): contribute nothing.
        return output
    except Exception as ex:
        text = f"Skipped (error reading file: {ex})\n\n"

    if not text:
        return output
    return output + f"#:{path}:\n{text}\n\n"
def dir_to_llm(output, directory, glob, exclude_globs, exclude_regexes, parse_gitignore, include_dotfiles, list_only):
    """
    Walk *directory* and collect matching files into one text blob.

    Subdirectories are visited first (recursively), then the files of
    *directory* itself that match *glob*. Entries are processed in sorted
    path order so the result is reproducible between runs.

    Args:
        output: Text to prepend to whatever this call collects.
        directory: Directory to process (str or path-like).
        glob: Glob pattern, relative to each directory, selecting files.
        exclude_globs: fnmatch patterns tested against each entry's resolved
            absolute path; a match skips the entry.
        exclude_regexes: Compiled regexes tested with ``fullmatch`` against
            the same resolved path; a match skips the entry.
        parse_gitignore: If true, patterns from a ``.gitignore`` in each
            visited directory are added to the exclusions for that directory
            and everything below it.
        include_dotfiles: If false, entries whose name starts with '.' are
            skipped.
        list_only: If true, emit one path per line instead of file contents.

    Returns:
        A ``(output, file_count)`` tuple. ``file_count`` counts every file
        that passed the filters, including files that contributed no text.
    """
    root = pathlib.Path(directory)

    # Copy so exclusions picked up here do not leak into sibling directories.
    active_globs = list(exclude_globs)
    active_regexes = list(exclude_regexes)
    if parse_gitignore:
        active_regexes.extend(get_gitignore_regex(root / ".gitignore"))

    def is_skipped(entry):
        """Return True if *entry* is filtered out by dotfile/glob/regex rules."""
        if not include_dotfiles and entry.name.startswith('.'):
            return True
        resolved = str(entry.resolve())
        if any(fnmatch.fnmatch(resolved, pattern) for pattern in active_globs):
            return True
        return any(r.fullmatch(resolved) for r in active_regexes)

    # Collect pieces and join once instead of repeatedly growing one string.
    parts = [output]
    file_count = 0

    # Recurse into subdirectories.
    for child in sorted(root.iterdir()):
        if is_skipped(child) or not child.is_dir():
            continue
        sub_output, sub_file_count = dir_to_llm(
            output="",
            directory=child,
            glob=glob,
            exclude_globs=active_globs,
            exclude_regexes=active_regexes,
            parse_gitignore=parse_gitignore,
            include_dotfiles=include_dotfiles,
            list_only=list_only,
        )
        parts.append(sub_output)
        file_count += sub_file_count

    # Process the files of this directory.
    for child in sorted(root.glob(glob)):
        if is_skipped(child) or not child.is_file():
            continue
        if list_only:
            parts.append(f"{child}\n")
        else:
            parts.append(append_file_content("", child))
        file_count += 1

    return "".join(parts), file_count
def main(argv=None):
    """
    Command-line entry point: gather files and print or copy the result.

    Args:
        argv: Argument list to parse; ``None`` means use ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 1 if a clipboard option was
        requested but ``pyperclip`` is not available.
    """
    parser = argparse.ArgumentParser(
        description='Concatenate directory contents for Large Language Models.',
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument('paths', nargs='*', help='Directories to process or glob patterns to match')
    parser.add_argument('--dir', type=str, help='Directory to process', default=".")
    # Exclusion globs are matched against each entry's full resolved path.
    parser.add_argument('--exclude', type=str, nargs='+', help='Glob patterns to exclude (e.g., *.pyc */__pycache__/*)', default=[])
    parser.add_argument('--exclude-regex', type=str, nargs='+', help='Regular expression patterns to exclude', default=[])
    parser.add_argument('--glob', type=str, help='Glob pattern to match files', default="*")
    parser.add_argument('--prompt', nargs='?', const="Filenames followed by file content-:", default=None, help='Display a prompt before the files')
    parser.add_argument('--list', action='store_true', help='List file paths only, without content')
    parser.add_argument('--include-dotfiles', action='store_true', help='Include dotfiles and dot directories (e.g., .git, .venv)')
    parser.add_argument('--no-gitignore', action='store_true', help='Do not parse .gitignore files for exclusion patterns')
    copy_group = parser.add_mutually_exclusive_group()
    copy_group.add_argument('--copy', action='store_true', help='Copy output to the clipboard, replacing its content.')
    copy_group.add_argument('--copy-append', action='store_true', help='Append output to the clipboard.')
    args = parser.parse_args(argv)

    # Sort positional arguments into directories and glob patterns.
    # NOTE: every directory is later combined with every pattern, so an
    # explicit file argument contributes its parent directory and its name
    # as a pattern to that cross product.
    directories = []
    glob_patterns = []
    for path_arg in args.paths:
        path_obj = pathlib.Path(path_arg)
        if path_obj.is_dir():
            directories.append(path_arg)
        elif any(ch in path_arg for ch in '*?['):
            glob_patterns.append(path_arg)
        elif path_obj.is_file():
            directories.append(str(path_obj.parent))
            glob_patterns.append(path_obj.name)
        else:
            glob_patterns.append(path_arg)

    # Use defaults if nothing was specified.
    if not directories:
        directories = [args.dir]
    if not glob_patterns:
        glob_patterns = [args.glob]

    # Compile user-provided regexes once; report bad ones on stderr so the
    # warning never ends up inside the generated output.
    compiled_regexes = []
    for pattern in args.exclude_regex:
        try:
            compiled_regexes.append(re.compile(pattern))
        except re.error as e:
            print(f"Warning: Invalid regex pattern '{pattern}' skipped: {e}", file=sys.stderr)

    chunks = []
    if args.prompt is not None:
        chunks.append(args.prompt + "\n\n")

    total_file_count = 0
    for directory in directories:
        for glob_pattern in glob_patterns:
            dir_output, file_count = dir_to_llm(
                output="",
                directory=directory,
                glob=glob_pattern,
                exclude_globs=args.exclude,
                exclude_regexes=compiled_regexes,
                parse_gitignore=not args.no_gitignore,
                include_dotfiles=args.include_dotfiles,
                list_only=args.list
            )
            chunks.append(dir_output)
            total_file_count += file_count
    output = "".join(chunks)

    if not (args.copy or args.copy_append):
        print(output)
        return 0

    if not pyperclip:
        # Nothing was copied, so report failure instead of exiting cleanly.
        print("The --copy or --copy-append options require 'pyperclip'. Please install it.", file=sys.stderr)
        return 1

    # Whitespace-separated words are only a rough stand-in for tokens.
    token_count = len(output.split())
    # Report real UTF-8 bytes; len(output) would count characters.
    byte_count = len(output.encode('utf-8', errors='replace'))

    if args.copy_append:
        original_content = pyperclip.paste()
        if original_content and isinstance(original_content, str):
            new_content = original_content + "\n\n" + output
        else:
            new_content = output
        pyperclip.copy(new_content)
        print(f"Appended {total_file_count} files, {byte_count} bytes, and approximately {token_count} tokens to the clipboard.")
    else:
        pyperclip.copy(output)
        print(f"Copied {total_file_count} files, {byte_count} bytes, and approximately {token_count} tokens to the clipboard.")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
🤣