Analyze tokens per file (using tiktoken)
# how to use
# python analyze_tokens.py --dir /path/to/your/project --encoding cl100k_base --sort-by tokens --output tree
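#
# Requires two third-party packages (both on PyPI, imported below):
#   pip install tiktoken pathspec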

import argparse
import os
from typing import Dict, List, Optional, Tuple

import pathspec
import tiktoken


def load_gitignore_patterns(gitignore_path: str):
    """Return the raw pattern lines from a .gitignore file, or [] if absent."""
    patterns = []
    if os.path.exists(gitignore_path):
        with open(gitignore_path, 'r', encoding='utf-8', errors='ignore') as f:
            patterns = f.read().splitlines()
    return patterns


def load_gitignore_spec(gitignore_path: str, extra_patterns=None):
    patterns = []
    if os.path.exists(gitignore_path):
        with open(gitignore_path, 'r', encoding='utf-8', errors='ignore') as f:
            patterns = f.read().splitlines()
    if extra_patterns:
        patterns.extend(extra_patterns)
    return pathspec.PathSpec.from_lines('gitwildmatch', patterns)


def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
    try:
        encoding = tiktoken.get_encoding(encoding_name)
        return len(encoding.encode(text))
    except Exception:
        # Fallback: approximate the token count with a whitespace split
        # if tiktoken fails (e.g. an unknown encoding name).
        return len(text.split())
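
# Illustrative sketch; exact counts depend on the encoding (with cl100k_base,
# "hello world" should encode to two tokens, "hello" and " world"):
#   count_tokens("hello world")  # -> 2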


def analyze_file(file_path: str, encoding_name: str = "cl100k_base") -> Tuple[int, int]:
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
        token_count = count_tokens(content, encoding_name)
        char_count = len(content)
        return token_count, char_count
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return 0, 0


def build_tree_structure(results: List[Dict]) -> Dict:
    tree = {'files': [], 'dirs': {}, 'tokens': 0, 'chars': 0}
    for result in results:
        parts = result['file'].split(os.sep)
        current = tree
        for part in parts[:-1]:
            current = current['dirs'].setdefault(part, {'files': [], 'dirs': {}, 'tokens': 0, 'chars': 0})
        if len(parts) == 1:
            tree['files'].append(result)
        else:
            current['files'].append(result)
        # Accumulate token/char totals at the root and at every ancestor directory.
        current = tree
        current['tokens'] += result['tokens']
        current['chars'] += result['chars']
        for part in parts[:-1]:
            current = current['dirs'][part]
            current['tokens'] += result['tokens']
            current['chars'] += result['chars']
    return tree
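
# Resulting shape (illustrative sketch, for files "a.py" and "src/b.py"):
#   {'files': [{'file': 'a.py', 'tokens': ..., 'chars': ...}],
#    'dirs': {'src': {'files': [{'file': 'src/b.py', ...}], 'dirs': {},
#                     'tokens': ..., 'chars': ...}},
#    'tokens': <grand total>, 'chars': <grand total>}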


def print_tree(tree: Dict, prefix: str = "", depth: int = 0, max_depth: Optional[int] = None):
    if max_depth is not None and depth > max_depth:
        return
    dirs = sorted(tree['dirs'].items())
    files = sorted(tree['files'], key=lambda x: x['file'])
    for i, (dir_name, dir_tree) in enumerate(dirs):
        is_last_dir = i == len(dirs) - 1 and len(files) == 0
        connector = "└── " if is_last_dir else "├── "
        tokens = dir_tree.get('tokens', 0)
        chars = dir_tree.get('chars', 0)
        print(f"{prefix}{connector}{dir_name}/ [{tokens:,} tokens, {chars:,} chars]")
        extension = "    " if is_last_dir else "│   "
        print_tree(dir_tree, prefix + extension, depth + 1, max_depth)
    for i, file_info in enumerate(files):
        is_last = i == len(files) - 1
        connector = "└── " if is_last else "├── "
        filename = os.path.basename(file_info['file'])
        tokens = file_info['tokens']
        chars = file_info['chars']
        print(f"{prefix}{connector}{filename} [{tokens:,} tokens, {chars:,} chars]")


def walk_with_gitignore(root_dir, extra_patterns=None):
    """
    Walk the directory tree, loading and merging .gitignore patterns from each directory.
    Yields file paths that are not ignored by any .gitignore in their path.
    """
    def _walk(current_dir, parent_patterns):
        gitignore_path = os.path.join(current_dir, '.gitignore')
        # Merge the raw pattern lines inherited from ancestor directories with
        # this directory's .gitignore. Raw lines are passed down rather than a
        # compiled PathSpec, because PathSpec.from_lines expects strings and
        # would choke on already-compiled pattern objects.
        all_patterns = list(parent_patterns)
        all_patterns.extend(load_gitignore_patterns(gitignore_path))
        if extra_patterns and current_dir == root_dir:
            all_patterns.extend(extra_patterns)
        spec = pathspec.PathSpec.from_lines('gitwildmatch', all_patterns)
        try:
            entries = os.listdir(current_dir)
        except OSError:
            return
        dirs = []
        files = []
        for entry in entries:
            full_path = os.path.join(current_dir, entry)
            rel_path = os.path.relpath(full_path, root_dir)
            rel_path_git = rel_path.replace(os.sep, '/')  # .gitignore uses '/' separators
            if os.path.isdir(full_path):
                # Append a trailing slash so directory-only patterns ("build/") match.
                if spec.match_file(rel_path_git + '/'):
                    continue
                dirs.append(entry)
            else:
                if spec.match_file(rel_path_git):
                    continue
                files.append(entry)
        for file in files:
            yield os.path.join(current_dir, file)
        for d in dirs:
            yield from _walk(os.path.join(current_dir, d), all_patterns)

    yield from _walk(root_dir, [])
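
# Minimal usage sketch (the path is hypothetical):
#   for path in walk_with_gitignore('/path/to/project', extra_patterns=['.git']):
#       print(path)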


def main():
    parser = argparse.ArgumentParser(description='Analyze token counts for files in a directory')
    parser.add_argument('--dir', type=str, default=None,
                        help='Directory to analyze (default: current working directory)')
    parser.add_argument('--encoding', default='cl100k_base',
                        help='Tiktoken encoding to use (default: cl100k_base)')
    parser.add_argument('--sort-by', choices=['tokens', 'chars', 'name'], default='tokens',
                        help='Sort results by tokens, characters, or filename')
    parser.add_argument('--min-tokens', type=int, default=0,
                        help='Only show files with at least this many tokens')
    parser.add_argument('--output', choices=['table', 'csv', 'tree'], default='table',
                        help='Output format')
    parser.add_argument('--tree-depth', type=int, default=None,
                        help='Maximum depth for tree output (default: unlimited)')
    args = parser.parse_args()

    root_dir = args.dir if args.dir else os.getcwd()
    extra_patterns = ['.git', '__pycache__', '*.pyc', '*.pyo', '*.pyd', '.pytest_cache']

    results: List[Dict] = []
    total_tokens = 0
    total_chars = 0
    total_files = 0

    print(f"Analyzing files with encoding: {args.encoding}")
    print("Scanning files...")
    files_to_analyze = list(walk_with_gitignore(root_dir, extra_patterns=extra_patterns))
    print(f"Found {len(files_to_analyze)} files to analyze.")

    for file_path in files_to_analyze:
        token_count, char_count = analyze_file(file_path, args.encoding)
        if token_count >= args.min_tokens:
            rel_path = os.path.relpath(file_path, root_dir)
            results.append({
                'file': rel_path,
                'tokens': token_count,
                'chars': char_count
            })
            total_tokens += token_count
            total_chars += char_count
            total_files += 1

    if args.sort_by == 'tokens':
        results.sort(key=lambda x: x['tokens'], reverse=True)
    elif args.sort_by == 'chars':
        results.sort(key=lambda x: x['chars'], reverse=True)
    else:
        results.sort(key=lambda x: x['file'])

    if args.output == 'csv':
        print("File,Tokens,Characters")
        for result in results:
            print(f"{result['file']},{result['tokens']},{result['chars']}")
        print(f"TOTAL,{total_tokens},{total_chars}")
    elif args.output == 'tree':
        print(f"\nProject Structure with Token Counts (encoding: {args.encoding})")
        print("=" * 60)
        tree = build_tree_structure(results)
        print(f"📁 . [{total_tokens:,} tokens, {total_chars:,} chars]")
        print_tree(tree, max_depth=args.tree_depth)
        print("\n" + "=" * 60)
        print(f"Summary: {total_files} files analyzed")
        print(f"Total tokens: {total_tokens:,}")
        print(f"Total characters: {total_chars:,}")
        print(f"Average tokens per file: {total_tokens / max(total_files, 1):.1f}")
    else:
        print(f"\n{'File':<50} {'Tokens':>10} {'Characters':>12}")
        print("-" * 74)
        for result in results:
            file_display = result['file']
            if len(file_display) > 47:
                # Truncate long paths from the left so the table stays aligned.
                file_display = "..." + file_display[-44:]
            print(f"{file_display:<50} {result['tokens']:>10,} {result['chars']:>12,}")
        print("-" * 74)
        print(f"{'TOTAL':<50} {total_tokens:>10,} {total_chars:>12,}")
        print(f"\nAnalyzed {total_files} files")
        print(f"Average tokens per file: {total_tokens / max(total_files, 1):.1f}")


if __name__ == "__main__":
    main()