Analyze tokens per file (using tiktoken)
# Usage:
#   python analyze_tokens.py --dir /path/to/your/project --encoding cl100k_base --sort-by tokens --output tree
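#
# A few more hypothetical invocations (example paths and thresholds, not defaults):
#   python analyze_tokens.py --output csv > token_report.csv
#   python analyze_tokens.py --min-tokens 500 --sort-by chars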
import argparse
import os
from typing import Dict, List, Optional, Tuple

import pathspec
import tiktoken
def load_gitignore_patterns(gitignore_path: str) -> List[str]:
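    """Return the raw pattern lines from a .gitignore file, or [] if absent."""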
    patterns = []
    if os.path.exists(gitignore_path):
        with open(gitignore_path, 'r', encoding='utf-8', errors='ignore') as f:
            patterns = f.read().splitlines()
    return patterns
def load_gitignore_spec(gitignore_path: str, extra_patterns=None) -> pathspec.PathSpec:
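    """Build a single PathSpec from one .gitignore plus optional extra patterns.

    Note: the walker below does not use this helper (it merges .gitignore
    files per directory instead); it is kept as a convenience for one-shot checks.
    """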
    patterns = []
    if os.path.exists(gitignore_path):
        with open(gitignore_path, 'r', encoding='utf-8', errors='ignore') as f:
            patterns = f.read().splitlines()
    if extra_patterns:
        patterns.extend(extra_patterns)
    return pathspec.PathSpec.from_lines('gitwildmatch', patterns)
def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
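    """Count tokens with tiktoken; fall back to a rough whitespace word count
    if the encoding cannot be loaded (e.g. no network to fetch encoding data)."""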
    try:
        encoding = tiktoken.get_encoding(encoding_name)
        return len(encoding.encode(text))
    except Exception:
        return len(text.split())
def analyze_file(file_path: str, encoding_name: str = "cl100k_base") -> Tuple[int, int]:
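    """Return (token_count, char_count) for a file, or (0, 0) on read errors."""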
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
        token_count = count_tokens(content, encoding_name)
        char_count = len(content)
        return token_count, char_count
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return 0, 0
def build_tree_structure(results: List[Dict]) -> Dict:
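    """Fold flat per-file results into a nested tree of
    {'files': [...], 'dirs': {...}, 'tokens': int, 'chars': int} nodes,
    accumulating token/char totals at every directory level."""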
    tree = {'files': [], 'dirs': {}, 'tokens': 0, 'chars': 0}
    for result in results:
        parts = result['file'].split(os.sep)
        current = tree
        for part in parts[:-1]:
            current = current['dirs'].setdefault(part, {'files': [], 'dirs': {}, 'tokens': 0, 'chars': 0})
        current['files'].append(result)
        # Accumulate totals at the root and at every directory on the path.
        current = tree
        current['tokens'] += result['tokens']
        current['chars'] += result['chars']
        for part in parts[:-1]:
            current = current['dirs'][part]
            current['tokens'] += result['tokens']
            current['chars'] += result['chars']
    return tree
def print_tree(tree: Dict, prefix: str = "", depth: int = 0, max_depth: Optional[int] = None):
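    """Pretty-print the tree with box-drawing connectors, directories first."""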
    if max_depth is not None and depth > max_depth:
        return
    dirs = sorted(tree['dirs'].items())
    files = sorted(tree['files'], key=lambda x: x['file'])
    for i, (dir_name, dir_tree) in enumerate(dirs):
        is_last_dir = i == len(dirs) - 1 and len(files) == 0
        connector = "└── " if is_last_dir else "├── "
        tokens = dir_tree.get('tokens', 0)
        chars = dir_tree.get('chars', 0)
        print(f"{prefix}{connector}{dir_name}/ [{tokens:,} tokens, {chars:,} chars]")
        extension = "    " if is_last_dir else "│   "
        print_tree(dir_tree, prefix + extension, depth + 1, max_depth)
    for i, file_info in enumerate(files):
        is_last = i == len(files) - 1
        connector = "└── " if is_last else "├── "
        filename = os.path.basename(file_info['file'])
        tokens = file_info['tokens']
        chars = file_info['chars']
        print(f"{prefix}{connector}{filename} [{tokens:,} tokens, {chars:,} chars]")
def walk_with_gitignore(root_dir, extra_patterns=None):
    """
    Walk the directory tree, loading and merging .gitignore patterns from each directory.
    Yields file paths that are not ignored by any .gitignore in their path.

    Note: patterns are matched against paths relative to root_dir, a
    simplification of real gitignore semantics, where a nested .gitignore's
    patterns are relative to its own directory.
    """
    def _walk(current_dir, parent_patterns):
        gitignore_path = os.path.join(current_dir, '.gitignore')
        patterns = load_gitignore_patterns(gitignore_path)
        # Carry raw pattern lines down the recursion: compiled PathSpec
        # objects cannot be fed back into PathSpec.from_lines().
        all_patterns = list(parent_patterns)
        if patterns:
            all_patterns.extend(patterns)
        if extra_patterns and current_dir == root_dir:
            all_patterns.extend(extra_patterns)
        spec = pathspec.PathSpec.from_lines('gitwildmatch', all_patterns)
        try:
            entries = os.listdir(current_dir)
        except Exception:
            return
        dirs = []
        files = []
        for entry in entries:
            full_path = os.path.join(current_dir, entry)
            rel_path = os.path.relpath(full_path, root_dir)
            rel_path_git = rel_path.replace(os.sep, '/')  # Normalize for .gitignore
            if os.path.isdir(full_path):
                # Trailing slash so directory-only patterns (e.g. "build/") match.
                if spec.match_file(rel_path_git + '/'):
                    continue
                dirs.append(entry)
            else:
                if spec.match_file(rel_path_git):
                    continue
                files.append(entry)
        for file in files:
            yield os.path.join(current_dir, file)
        for d in dirs:
            yield from _walk(os.path.join(current_dir, d), all_patterns)

    yield from _walk(root_dir, [])
def main():
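    """CLI entry point: parse arguments, walk the tree, and print the report."""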
    parser = argparse.ArgumentParser(description='Analyze token counts for files in a directory')
    parser.add_argument('--dir', type=str, default=None,
                        help='Directory to analyze (default: current working directory)')
    parser.add_argument('--encoding', default='cl100k_base',
                        help='Tiktoken encoding to use (default: cl100k_base)')
    parser.add_argument('--sort-by', choices=['tokens', 'chars', 'name'], default='tokens',
                        help='Sort results by tokens, characters, or filename')
    parser.add_argument('--min-tokens', type=int, default=0,
                        help='Only show files with at least this many tokens')
    parser.add_argument('--output', choices=['table', 'csv', 'tree'], default='table',
                        help='Output format')
    parser.add_argument('--tree-depth', type=int, default=None,
                        help='Maximum depth for tree output (default: unlimited)')
    args = parser.parse_args()
    root_dir = args.dir if args.dir else os.getcwd()
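    # Always-ignored entries on top of any .gitignore rules
    # (VCS metadata and Python bytecode caches).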
    extra_patterns = ['.git', '__pycache__', '*.pyc', '*.pyo', '*.pyd', '.pytest_cache']
    results: List[Dict] = []
    total_tokens = 0
    total_chars = 0
    total_files = 0
    print(f"Analyzing files with encoding: {args.encoding}")
    print("Scanning files...")
    files_to_analyze = list(walk_with_gitignore(root_dir, extra_patterns=extra_patterns))
    print(f"Found {len(files_to_analyze)} files to analyze.")
    for file_path in files_to_analyze:
        token_count, char_count = analyze_file(file_path, args.encoding)
        if token_count >= args.min_tokens:
            rel_path = os.path.relpath(file_path, root_dir)
            results.append({
                'file': rel_path,
                'tokens': token_count,
                'chars': char_count
            })
            total_tokens += token_count
            total_chars += char_count
            total_files += 1
    if args.sort_by == 'tokens':
        results.sort(key=lambda x: x['tokens'], reverse=True)
    elif args.sort_by == 'chars':
        results.sort(key=lambda x: x['chars'], reverse=True)
    else:
        results.sort(key=lambda x: x['file'])
    if args.output == 'csv':
        print("File,Tokens,Characters")
        for result in results:
            print(f"{result['file']},{result['tokens']},{result['chars']}")
        print(f"TOTAL,{total_tokens},{total_chars}")
    elif args.output == 'tree':
        print(f"\nProject Structure with Token Counts (encoding: {args.encoding})")
        print("=" * 60)
        tree = build_tree_structure(results)
        print(f"📁 . [{total_tokens:,} tokens, {total_chars:,} chars]")
        print_tree(tree, max_depth=args.tree_depth)
        print("\n" + "=" * 60)
        print(f"Summary: {total_files} files analyzed")
        print(f"Total tokens: {total_tokens:,}")
        print(f"Total characters: {total_chars:,}")
        print(f"Average tokens per file: {total_tokens / max(total_files, 1):.1f}")
    else:
        print(f"\n{'File':<50} {'Tokens':>10} {'Characters':>12}")
        print("-" * 74)
        for result in results:
            file_display = result['file']
            if len(file_display) > 47:
                file_display = "..." + file_display[-44:]
            print(f"{file_display:<50} {result['tokens']:>10,} {result['chars']:>12,}")
        print("-" * 74)
        print(f"{'TOTAL':<50} {total_tokens:>10,} {total_chars:>12,}")
        print(f"\nAnalyzed {total_files} files")
        print(f"Average tokens per file: {total_tokens / max(total_files, 1):.1f}")
if __name__ == "__main__":
    main()
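
# Illustrative --output tree result (made-up numbers, shown only to document the format):
#   📁 . [12,345 tokens, 67,890 chars]
#   ├── src/ [10,000 tokens, 55,000 chars]
#   │   └── main.py [10,000 tokens, 55,000 chars]
#   └── README.md [2,345 tokens, 12,890 chars]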