Analyze tokens per file (using tiktoken)
# Usage:
#   python analyze_tokens.py --dir /path/to/your/project --encoding cl100k_base --sort-by tokens --output tree
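#
# A few more hypothetical invocations (example paths and thresholds, not defaults):
#   python analyze_tokens.py --output csv > token_report.csv
#   python analyze_tokens.py --min-tokens 500 --sort-by chars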
import argparse
import os
from typing import Dict, List, Optional, Tuple

import pathspec
import tiktoken
def load_gitignore_patterns(gitignore_path: str) -> List[str]:
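    """Return the raw pattern lines from a .gitignore file, or [] if absent."""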
    patterns = []
    if os.path.exists(gitignore_path):
        with open(gitignore_path, 'r', encoding='utf-8', errors='ignore') as f:
            patterns = f.read().splitlines()
    return patterns
def load_gitignore_spec(gitignore_path: str, extra_patterns=None) -> pathspec.PathSpec:
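    """Build a single PathSpec from one .gitignore plus optional extra patterns.

    Note: the walker below does not use this helper (it merges .gitignore
    files per directory instead); it is kept as a convenience for one-shot checks.
    """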
    patterns = []
    if os.path.exists(gitignore_path):
        with open(gitignore_path, 'r', encoding='utf-8', errors='ignore') as f:
            patterns = f.read().splitlines()
    if extra_patterns:
        patterns.extend(extra_patterns)
    return pathspec.PathSpec.from_lines('gitwildmatch', patterns)
def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
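    """Count tokens with tiktoken; fall back to a rough whitespace word count
    if the encoding cannot be loaded (e.g. no network to fetch encoding data)."""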
    try:
        encoding = tiktoken.get_encoding(encoding_name)
        return len(encoding.encode(text))
    except Exception:
        return len(text.split())
def analyze_file(file_path: str, encoding_name: str = "cl100k_base") -> Tuple[int, int]:
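    """Return (token_count, char_count) for a file, or (0, 0) on read errors."""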
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
        token_count = count_tokens(content, encoding_name)
        char_count = len(content)
        return token_count, char_count
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return 0, 0
def build_tree_structure(results: List[Dict]) -> Dict:
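    """Fold flat per-file results into a nested tree of
    {'files': [...], 'dirs': {...}, 'tokens': int, 'chars': int} nodes,
    accumulating token/char totals at every directory level."""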
    tree = {'files': [], 'dirs': {}, 'tokens': 0, 'chars': 0}
    for result in results:
        parts = result['file'].split(os.sep)
        current = tree
        for part in parts[:-1]:
            current = current['dirs'].setdefault(part, {'files': [], 'dirs': {}, 'tokens': 0, 'chars': 0})
        current['files'].append(result)
        # Accumulate totals at the root and at every directory on the path.
        current = tree
        current['tokens'] += result['tokens']
        current['chars'] += result['chars']
        for part in parts[:-1]:
            current = current['dirs'][part]
            current['tokens'] += result['tokens']
            current['chars'] += result['chars']
    return tree
def print_tree(tree: Dict, prefix: str = "", depth: int = 0, max_depth: Optional[int] = None):
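    """Pretty-print the tree with box-drawing connectors, directories first."""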
    if max_depth is not None and depth > max_depth:
        return
    dirs = sorted(tree['dirs'].items())
    files = sorted(tree['files'], key=lambda x: x['file'])
    for i, (dir_name, dir_tree) in enumerate(dirs):
        is_last_dir = i == len(dirs) - 1 and len(files) == 0
        connector = "└── " if is_last_dir else "├── "
        tokens = dir_tree.get('tokens', 0)
        chars = dir_tree.get('chars', 0)
        print(f"{prefix}{connector}{dir_name}/ [{tokens:,} tokens, {chars:,} chars]")
        extension = "    " if is_last_dir else "│   "
        print_tree(dir_tree, prefix + extension, depth + 1, max_depth)
    for i, file_info in enumerate(files):
        is_last = i == len(files) - 1
        connector = "└── " if is_last else "├── "
        filename = os.path.basename(file_info['file'])
        tokens = file_info['tokens']
        chars = file_info['chars']
        print(f"{prefix}{connector}{filename} [{tokens:,} tokens, {chars:,} chars]")
def walk_with_gitignore(root_dir, extra_patterns=None):
    """
    Walk the directory tree, loading and merging .gitignore patterns from each directory.
    Yields file paths that are not ignored by any .gitignore in their path.

    Note: patterns are matched against paths relative to root_dir, a
    simplification of real gitignore semantics, where a nested .gitignore's
    patterns are relative to its own directory.
    """
    def _walk(current_dir, parent_patterns):
        gitignore_path = os.path.join(current_dir, '.gitignore')
        patterns = load_gitignore_patterns(gitignore_path)
        # Carry raw pattern lines down the recursion: compiled PathSpec
        # objects cannot be fed back into PathSpec.from_lines().
        all_patterns = list(parent_patterns)
        if patterns:
            all_patterns.extend(patterns)
        if extra_patterns and current_dir == root_dir:
            all_patterns.extend(extra_patterns)
        spec = pathspec.PathSpec.from_lines('gitwildmatch', all_patterns)
        try:
            entries = os.listdir(current_dir)
        except Exception:
            return
        dirs = []
        files = []
        for entry in entries:
            full_path = os.path.join(current_dir, entry)
            rel_path = os.path.relpath(full_path, root_dir)
            rel_path_git = rel_path.replace(os.sep, '/')  # Normalize for .gitignore
            if os.path.isdir(full_path):
                # Trailing slash so directory-only patterns (e.g. "build/") match.
                if spec.match_file(rel_path_git + '/'):
                    continue
                dirs.append(entry)
            else:
                if spec.match_file(rel_path_git):
                    continue
                files.append(entry)
        for file in files:
            yield os.path.join(current_dir, file)
        for d in dirs:
            yield from _walk(os.path.join(current_dir, d), all_patterns)

    yield from _walk(root_dir, [])
def main():
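    """CLI entry point: parse arguments, walk the tree, and print the report."""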
    parser = argparse.ArgumentParser(description='Analyze token counts for files in a directory')
    parser.add_argument('--dir', type=str, default=None,
                        help='Directory to analyze (default: current working directory)')
    parser.add_argument('--encoding', default='cl100k_base',
                        help='Tiktoken encoding to use (default: cl100k_base)')
    parser.add_argument('--sort-by', choices=['tokens', 'chars', 'name'], default='tokens',
                        help='Sort results by tokens, characters, or filename')
    parser.add_argument('--min-tokens', type=int, default=0,
                        help='Only show files with at least this many tokens')
    parser.add_argument('--output', choices=['table', 'csv', 'tree'], default='table',
                        help='Output format')
    parser.add_argument('--tree-depth', type=int, default=None,
                        help='Maximum depth for tree output (default: unlimited)')
    args = parser.parse_args()
    root_dir = args.dir if args.dir else os.getcwd()
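    # Always-ignored entries on top of any .gitignore rules
    # (VCS metadata and Python bytecode caches).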
    extra_patterns = ['.git', '__pycache__', '*.pyc', '*.pyo', '*.pyd', '.pytest_cache']
    results: List[Dict] = []
    total_tokens = 0
    total_chars = 0
    total_files = 0
    print(f"Analyzing files with encoding: {args.encoding}")
    print("Scanning files...")
    files_to_analyze = list(walk_with_gitignore(root_dir, extra_patterns=extra_patterns))
    print(f"Found {len(files_to_analyze)} files to analyze.")
    for file_path in files_to_analyze:
        token_count, char_count = analyze_file(file_path, args.encoding)
        if token_count >= args.min_tokens:
            rel_path = os.path.relpath(file_path, root_dir)
            results.append({
                'file': rel_path,
                'tokens': token_count,
                'chars': char_count
            })
            total_tokens += token_count
            total_chars += char_count
            total_files += 1
    if args.sort_by == 'tokens':
        results.sort(key=lambda x: x['tokens'], reverse=True)
    elif args.sort_by == 'chars':
        results.sort(key=lambda x: x['chars'], reverse=True)
    else:
        results.sort(key=lambda x: x['file'])
    if args.output == 'csv':
        print("File,Tokens,Characters")
        for result in results:
            print(f"{result['file']},{result['tokens']},{result['chars']}")
        print(f"TOTAL,{total_tokens},{total_chars}")
    elif args.output == 'tree':
        print(f"\nProject Structure with Token Counts (encoding: {args.encoding})")
        print("=" * 60)
        tree = build_tree_structure(results)
        print(f"📁 . [{total_tokens:,} tokens, {total_chars:,} chars]")
        print_tree(tree, max_depth=args.tree_depth)
        print("\n" + "=" * 60)
        print(f"Summary: {total_files} files analyzed")
        print(f"Total tokens: {total_tokens:,}")
        print(f"Total characters: {total_chars:,}")
        print(f"Average tokens per file: {total_tokens / max(total_files, 1):.1f}")
    else:
        print(f"\n{'File':<50} {'Tokens':>10} {'Characters':>12}")
        print("-" * 74)
        for result in results:
            file_display = result['file']
            if len(file_display) > 47:
                file_display = "..." + file_display[-44:]
            print(f"{file_display:<50} {result['tokens']:>10,} {result['chars']:>12,}")
        print("-" * 74)
        print(f"{'TOTAL':<50} {total_tokens:>10,} {total_chars:>12,}")
        print(f"\nAnalyzed {total_files} files")
        print(f"Average tokens per file: {total_tokens / max(total_files, 1):.1f}")
if __name__ == "__main__":
    main()
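
# Illustrative --output tree result (made-up numbers, shown only to document the format):
#   📁 . [12,345 tokens, 67,890 chars]
#   ├── src/ [10,000 tokens, 55,000 chars]
#   │   └── main.py [10,000 tokens, 55,000 chars]
#   └── README.md [2,345 tokens, 12,890 chars]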