Skip to content

Instantly share code, notes, and snippets.

@stuaxo
Last active September 7, 2025 13:39
Show Gist options
  • Save stuaxo/f1beb981dc4845921c31fcb4e16f4821 to your computer and use it in GitHub Desktop.
Save stuaxo/f1beb981dc4845921c31fcb4e16f4821 to your computer and use it in GitHub Desktop.
Output files in subdirectories for ingestion to an LLM such as Claude, ChatGPT etc.
#!/usr/bin/env python3
# Usage: python dirtollm.py [paths...] [--dir /path/to/directory] [--glob *.py] [--exclude *.pyc] [--exclude-regex ".*\.log"] [--prompt [TEXT]] [--list] [--include-dotfiles] [--no-gitignore] [--copy | --copy-append]
import argparse
import fnmatch
import pathlib
import re
import sys

try:
    import pyperclip
except ImportError:
    pyperclip = None


def get_gitignore_regex(path):
    """
    Read a .gitignore file and convert its patterns into compiled regexes.

    This is a simplified converter, not a full gitignore implementation:

    * ``*`` is the only wildcard and it may span ``/``.
    * A leading ``/`` anchors the pattern to the .gitignore's own directory;
      any other pattern may match at any depth below it, but only on a
      path-component boundary (``build`` does not match ``rebuild``).
    * A trailing ``/`` matches the directory and everything inside it.
    * Negated ``!pattern`` lines are skipped, because a re-include cannot be
      expressed as an exclusion regex.

    Args:
        path: Location of the .gitignore file (str or path-like).

    Returns:
        A list of compiled patterns, each written to match an entire
        absolute, resolved path (use ``fullmatch``). Empty when ``path``
        is not an existing file. Paths are assumed to use ``/`` separators.
    """
    gitignore = pathlib.Path(path)
    if not gitignore.is_file():
        return []

    # Escape the directory so characters such as '+', '(' or '.' in its name
    # are matched literally; drop a trailing '/' so the filesystem root does
    # not produce a doubled separator.
    base = re.escape(str(gitignore.parent.resolve()).rstrip('/'))

    regexes = []
    with gitignore.open(encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            if line.startswith('!'):
                # Treating a negation as an exclusion would invert its meaning.
                continue

            anchored = line.startswith('/')
            dir_only = line.endswith('/')
            core = line.strip('/')
            if not core:
                # A bare '/' carries no pattern to match.
                continue

            # Escape special regex characters, except for '*'
            body = re.escape(core).replace('\\*', '.*')
            if dir_only:
                body += '(/.*)?'  # Match the dir or content within it

            # Anchored patterns sit directly under the base directory; the
            # rest may be preceded by any number of whole path components.
            prefix = base + '/' if anchored else base + '/(?:.*/)?'
            full_pattern = prefix + body
            try:
                regexes.append(re.compile(full_pattern))
            except re.error as e:
                print(f"Warning: Could not compile regex for gitignore pattern '{line}': {e}")
    return regexes
def append_file_content(output, path):
    """
    Return ``output`` extended with a ``#:<path>:`` header and the file text.

    Files that cannot be decoded as UTF-8, and files whose text is empty,
    leave ``output`` unchanged. Any other read failure is recorded in place
    of the content as a "Skipped" note under the same header.

    Args:
        output: Text accumulated so far.
        path: ``pathlib.Path`` of the file to read.

    Returns:
        The accumulated text, with this file's section appended if any.
    """
    try:
        text = path.read_text(encoding='utf-8')
    except UnicodeDecodeError:
        # Not UTF-8 text (likely binary): contribute nothing.
        return output
    except Exception as ex:
        text = f"Skipped (error reading file: {ex})\n\n"

    if not text:
        return output
    return output + f"#:{path}:\n{text}\n\n"
def _is_excluded(path_str, name, exclude_globs, exclude_regexes):
    """Return True when a path matches any exclusion glob or regex."""
    for pattern in exclude_globs:
        # Accept the pattern against the whole path, the bare name, or any
        # trailing run of path components, so 'node_modules', '*.pyc' and
        # '__pycache__/*' all behave as a user would expect.
        if (fnmatch.fnmatch(path_str, pattern)
                or fnmatch.fnmatch(name, pattern)
                or fnmatch.fnmatch(path_str, '*/' + pattern)):
            return True
    return any(r.fullmatch(path_str) for r in exclude_regexes)


def dir_to_llm(output, directory, glob, exclude_globs, exclude_regexes, parse_gitignore, include_dotfiles, list_only):
    """
    Collect the files below ``directory`` into one text blob.

    Subdirectories are visited first (recursively), then the files in
    ``directory`` that match ``glob``. Entries are processed in sorted order
    so the result is reproducible across runs and filesystems.

    Args:
        output: Text accumulated so far; the result starts with it.
        directory: Directory to scan (str or path-like).
        glob: Glob pattern, relative to each visited directory, selecting files.
        exclude_globs: fnmatch patterns for files/directories to skip.
        exclude_regexes: Compiled regexes; an entry is skipped when one of
            them fully matches its resolved absolute path.
        parse_gitignore: When true, patterns from a ``.gitignore`` in each
            visited directory are added to the exclusions for that directory
            and everything below it.
        include_dotfiles: When false, names starting with '.' are skipped.
        list_only: When true, emit one path per line instead of file content.

    Returns:
        A ``(text, file_count)`` tuple, where ``file_count`` is the number of
        files selected (including ones whose content could not be shown).
    """
    root = pathlib.Path(directory)

    # Copy so that patterns picked up here never leak back to the caller
    # or across sibling directories.
    active_globs = list(exclude_globs)
    active_regexes = list(exclude_regexes)
    if parse_gitignore:
        active_regexes.extend(get_gitignore_regex(root / ".gitignore"))

    def skipped(child):
        if not include_dotfiles and child.name.startswith('.'):
            return True
        return _is_excluded(str(child.resolve()), child.name, active_globs, active_regexes)

    # Gather pieces and join once instead of repeatedly copying a growing string.
    parts = [output]
    file_count = 0

    # Recurse into subdirectories
    for child in sorted(root.iterdir()):
        if skipped(child) or not child.is_dir():
            continue
        sub_output, sub_file_count = dir_to_llm(
            output="",
            directory=child,
            glob=glob,
            exclude_globs=active_globs,
            exclude_regexes=active_regexes,
            parse_gitignore=parse_gitignore,
            include_dotfiles=include_dotfiles,
            list_only=list_only,
        )
        parts.append(sub_output)
        file_count += sub_file_count

    # Process files
    for child in sorted(root.glob(glob)):
        if skipped(child) or not child.is_file():
            continue
        if list_only:
            parts.append(f"{child}\n")  # Just the file path
        else:
            parts.append(append_file_content("", child))
        file_count += 1

    return "".join(parts), file_count
def main(argv=None):
    """
    Command-line entry point: gather files and print or copy the result.

    Positional arguments are sorted into directories to scan and glob
    patterns to match; an existing file contributes its parent directory and
    its own name as a pattern. Every directory is scanned with every pattern.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use the
            process's command line.

    Returns:
        Exit status: 0 on success, 1 when a clipboard option was requested
        but pyperclip is not installed.
    """
    parser = argparse.ArgumentParser(
        description='Concatenate directory contents for Large Language Models.',
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument('paths', nargs='*', help='Directories to process or glob patterns to match')
    parser.add_argument('--dir', type=str, help='Directory to process', default=".")
    parser.add_argument('--exclude', type=str, nargs='+', help='Glob patterns to exclude (e.g., *.pyc __pycache__/*)', default=[])
    parser.add_argument('--exclude-regex', type=str, nargs='+', help='Regular expression patterns to exclude', default=[])
    parser.add_argument('--glob', type=str, help='Glob pattern to match files', default="*")
    parser.add_argument('--prompt', nargs='?', const="Filenames followed by file content-:", default=None, help='Display a prompt before the files')
    parser.add_argument('--list', action='store_true', help='List file paths only, without content')
    parser.add_argument('--include-dotfiles', action='store_true', help='Include dotfiles and dot directories (e.g., .git, .venv)')
    parser.add_argument('--no-gitignore', action='store_true', help='Do not parse .gitignore files for exclusion patterns')
    copy_group = parser.add_mutually_exclusive_group()
    copy_group.add_argument('--copy', action='store_true', help='Copy output to the clipboard, replacing its content.')
    copy_group.add_argument('--copy-append', action='store_true', help='Append output to the clipboard.')
    args = parser.parse_args(argv)

    # Determine directories and glob patterns from positional arguments
    directories = []
    glob_patterns = []
    for path_arg in args.paths:
        path_obj = pathlib.Path(path_arg)
        if path_obj.is_dir():
            directories.append(path_arg)
        elif any(ch in path_arg for ch in '*?['):
            glob_patterns.append(path_arg)
        elif path_obj.is_file():
            directories.append(str(path_obj.parent))
            glob_patterns.append(path_obj.name)
        else:
            glob_patterns.append(path_arg)

    # Drop repeats (keeping first-seen order): two files from the same
    # directory would otherwise scan that directory twice and emit every
    # match twice. Fall back to the option defaults if nothing was given.
    directories = list(dict.fromkeys(directories)) or [args.dir]
    glob_patterns = list(dict.fromkeys(glob_patterns)) or [args.glob]

    # Compile user-provided regexes once, up front
    compiled_regexes = []
    for pattern in args.exclude_regex:
        try:
            compiled_regexes.append(re.compile(pattern))
        except re.error as e:
            # Diagnostics go to stderr so they never end up in the generated text.
            print(f"Warning: Invalid regex pattern '{pattern}' skipped: {e}", file=sys.stderr)

    chunks = []
    if args.prompt is not None:
        chunks.append(args.prompt + "\n\n")

    total_file_count = 0
    for directory in directories:
        for glob_pattern in glob_patterns:
            dir_output, file_count = dir_to_llm(
                output="",
                directory=directory,
                glob=glob_pattern,
                exclude_globs=args.exclude,
                exclude_regexes=compiled_regexes,
                parse_gitignore=not args.no_gitignore,
                include_dotfiles=args.include_dotfiles,
                list_only=args.list
            )
            chunks.append(dir_output)
            total_file_count += file_count
    output = "".join(chunks)

    if not (args.copy or args.copy_append):
        print(output)
        return 0

    if not pyperclip:
        print("The --copy or --copy-append options require 'pyperclip'. Please install it.", file=sys.stderr)
        return 1

    # Whitespace-separated words serve as a rough token estimate.
    token_count = len(output.split())
    # Report real UTF-8 bytes rather than the number of characters.
    byte_count = len(output.encode('utf-8', errors='replace'))

    if args.copy_append:
        original_content = pyperclip.paste()
        if original_content and isinstance(original_content, str):
            new_content = original_content + "\n\n" + output
        else:
            new_content = output
        pyperclip.copy(new_content)
        print(f"Appended {total_file_count} files, {byte_count} bytes, and approximately {token_count} tokens to the clipboard.")
    else:
        pyperclip.copy(output)
        print(f"Copied {total_file_count} files, {byte_count} bytes, and approximately {token_count} tokens to the clipboard.")
    return 0


if __name__ == "__main__":
    sys.exit(main())
@amigax
Copy link

amigax commented May 21, 2024

very cool dear.

@stuaxo
Copy link
Author

stuaxo commented May 21, 2024

very cool dear.

🤣

@stuaxo
Copy link
Author

stuaxo commented Sep 10, 2024

@amigax now show me your batch files, hehe 😋

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment