Last active
March 9, 2024 09:33
-
-
Save tanbro/c4dd926f9a709755e68df4352fb48b15 to your computer and use it in GitHub Desktop.
Export all source code files into a single HTML file with Pygments syntax highlight and remove all comments and empty lines.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Export all source code files into a single HTML file with Pygments syntax highlight and remove all comments and empty lines. | |
""" | |
import argparse | |
import fnmatch | |
import os | |
import sys | |
from functools import lru_cache | |
from itertools import chain | |
from pathlib import Path | |
from textwrap import dedent, shorten | |
from jinja2 import Template | |
from pathspec.gitignore import GitIgnoreSpec | |
from pygments import highlight | |
from pygments.filter import simplefilter | |
from pygments.formatters import HtmlFormatter, NullFormatter | |
from pygments.lexers import get_lexer_for_filename | |
from pygments.styles import get_all_styles | |
from pygments.token import Comment, String | |
from pygments.util import ClassNotFound | |
# Default Jinja2 template used when no ``--template`` file is given.
# Context variables:
#   title       -- optional document title (may be None)
#   style_defs  -- CSS text produced by the Pygments HTML formatter
#   highlights  -- iterable of (file_name, lexer_name, highlight) tuples where
#                  ``highlight`` is already-rendered HTML and must NOT be escaped
# The template is created without autoescaping, so plain-text values
# (title, file name, lexer name) are escaped explicitly with ``| e``.
DEFAULT_HTML_TEMPLATE = Template(
    dedent(
        """
        <!DOCTYPE html>
        <html>
        <head>
        <title>{{ title | default("", true) | e }}</title>
        <meta charset="utf-8"/>
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <style type="text/css">
        {{ style_defs }}
        </style>
        </head>
        <body>
        {% for file_name, lexer_name, highlight in highlights %}
        <article>
        <h2>
        {{ file_name | e }}
        <small>({{ lexer_name | e }})</small>
        </h2>
        <article class="hll">
        {{ highlight }}
        </article>
        </article>
        {% endfor %}
        </body>
        </html>
        """
    ).strip()
)
_lines_dict = dict() | |
def _add_bool_flag(parser, flag, *, default, help_text):
    """Register an on/off option, using ``--flag/--no-flag`` where argparse supports it."""
    if sys.version_info >= (3, 9):
        parser.add_argument(
            flag,
            action=argparse.BooleanOptionalAction,
            default=default,
            help=help_text,
        )
    else:
        # Older argparse has no BooleanOptionalAction; only the positive switch exists.
        parser.add_argument(flag, action="store_true", help=help_text)


def set_args(argv=None):
    """Define, parse and normalise the command-line arguments.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        The parsed ``argparse.Namespace``. ``dir`` defaults to ``"."``,
        ``sub_dir`` to an empty list, and ``gitignore`` is always present as a
        boolean (on Python < 3.9 it is derived from ``--no-gitignore``).
    """
    ##########
    # define #
    ##########
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--dir",
        "-d",
        type=str,
        help="Top directory to search code source files. (default: <CWD>)",
    )
    parser.add_argument(
        "--sub-dir",
        "-s",
        type=str,
        action="append",
        help="Scan for source files only in these sub-directories. Can be specified multiple times. (default: <CWD>)",
    )
    parser.add_argument(
        "output",
        metavar="OUTPUT",
        nargs="?",
        type=argparse.FileType("w", encoding="utf-8"),
        default="-",
        help="Write output HTML file here. (default: <STDOUT>)",
    )
    parser.add_argument(
        "--exclude",
        "-x",
        type=str,
        action="append",
        help="Exclude files match theses patterns in `glob` expression. Can be specified multiple times",
    )
    if sys.version_info >= (3, 9):
        parser.add_argument(
            "--gitignore",
            action=argparse.BooleanOptionalAction,
            default=True,
            help="Follow `.gitignore` files for excluding",
        )
    else:
        parser.add_argument(
            "--no-gitignore",
            action="store_true",
            help="Do NOT follow `.gitignore` files for excluding",
        )
    _add_bool_flag(
        parser,
        "--null-formatter",
        default=False,
        help_text="Null formatter output the text unchanged without any formatting.",
    )
    parser.add_argument(
        "--style",
        type=str,
        choices=list(get_all_styles()),
        help="Syntax highlight style",
    )
    parser.add_argument(
        "--linenos",
        type=str,
        choices=["table", "inline"],
        help="If set to 'table', output line numbers as a table with two cells, one containing the line numbers, the other the whole code. "  # noqa
        "This is copy-and-paste-friendly, but may cause alignment problems with some browsers or fonts. "
        " If set to 'inline', the line numbers will be integrated in the <pre> tag that contains the code",
    )
    parser.add_argument(
        "--template",
        type=argparse.FileType("r", encoding="utf-8"),
        help="A custom Jinja2 template file to render the output HTML file",
    )
    parser.add_argument("--title", "-t", type=str, help="Title of the HTML document")
    #########
    # parse #
    #########
    args = parser.parse_args(argv)
    ############
    # validate #
    ############
    if not args.dir:
        args.dir = "."
    if not args.sub_dir:
        args.sub_dir = []
    # Expose one uniform ``gitignore`` flag regardless of which option spelling
    # the running Python version supports; ``no_gitignore`` is left untouched.
    if not hasattr(args, "gitignore"):
        args.gitignore = not args.no_gitignore
    return args
@simplefilter
def _filter_no_comment(self, lexer, stream, options):
    """Pygments filter that removes ordinary comment tokens from the stream.

    Only the exact token types ``Comment``, ``Comment.Multiline`` and
    ``Comment.Single`` are dropped (identity test, so sub-types do not match).
    ``Comment.Hashbang``, ``Comment.Preproc``, ``Comment.PreprocFile`` and
    ``Comment.Special`` are deliberately left in place.
    """
    dropped_types = (Comment, Comment.Multiline, Comment.Single)
    for ttype, value in stream:
        if any(ttype is dropped for dropped in dropped_types):
            continue
        yield ttype, value
@simplefilter
def _filter_no_docstring(self, lexer, stream, options):
    """Pygments filter that drops tokens whose type is exactly ``String.Doc``."""
    for ttype, value in stream:
        if ttype is String.Doc:
            continue
        yield ttype, value
# Filter instances attached to every lexer before tokenizing a source file.
_FILTERS = (
    _filter_no_comment(),
    _filter_no_docstring(),
)  # type: ignore
@lru_cache
def make_git_ignore_spec(gitignore_file):
    """Parse *gitignore_file* into a ``GitIgnoreSpec``.

    The result is memoized by ``lru_cache``, keyed on the path argument.
    """
    with open(gitignore_file, encoding="utf-8") as stream:
        spec = GitIgnoreSpec.from_lines(stream)
    return spec
def _is_git_ignored(pth, top_path):
    """Return True if a ``.gitignore`` between *pth* and *top_path* excludes *pth*.

    Every directory from the file's own directory up to and including
    *top_path* is checked. Each ``.gitignore`` is matched against the path
    relative to the directory that contains it, which is how git interprets
    the patterns.
    """
    for parent_dir in pth.parents:
        # Stop as soon as we climb above the top directory (true containment
        # test; ``Path`` ordering operators only compare lexicographically).
        if parent_dir != top_path and top_path not in parent_dir.parents:
            break
        pth_gitignore = parent_dir.joinpath(".gitignore")
        if not pth_gitignore.is_file():
            continue
        ignore_spec = make_git_ignore_spec(pth_gitignore)
        if ignore_spec.match_file(pth.relative_to(parent_dir).as_posix()):
            return True
    return False


def main(args):
    """Collect source files, strip comments/blank lines and write the result.

    Args:
        args: Namespace as produced by ``set_args()``. Attributes read here:
            ``dir``, ``sub_dir``, ``exclude``, ``gitignore`` (or
            ``no_gitignore`` on old Python), ``null_formatter``, ``style``,
            ``linenos``, ``template``, ``title`` and ``output``.

    Raises:
        ValueError: if a ``--sub-dir`` does not lie strictly inside ``dir``.

    Side effects: writes the rendered document to ``args.output``, progress
    and a per-lexer line summary to stderr, and accumulates line counts in the
    module-level ``_lines_dict``.
    """
    formatter_options = dict(wrapcode=True)
    if args.style:
        formatter_options.update(style=args.style)
    if args.linenos:
        formatter_options.update(linenos=args.linenos)
    fmt_html = HtmlFormatter(**formatter_options)  # type: ignore
    fmt_null = NullFormatter()
    formatter = fmt_null if args.null_formatter else fmt_html

    # The namespace carries ``gitignore`` on Python 3.9+ and only
    # ``no_gitignore`` on older versions; accept either.
    use_gitignore = getattr(args, "gitignore", None)
    if use_gitignore is None:
        use_gitignore = not getattr(args, "no_gitignore", False)

    def _gen():
        """Yield ``(relative_path, lexer_name, formatted_code)`` per accepted file."""
        lines_total = 0
        counter = 0
        top_path = Path(args.dir).resolve()
        if args.sub_dir:
            sub_paths = [top_path.joinpath(pth).resolve() for pth in args.sub_dir]
            # Each sub-directory must be a strict descendant of the top dir.
            if any(top_path not in sub.parents for sub in sub_paths):
                raise ValueError(
                    "Sub-directories can not be smaller than or equal to top dir."
                )
            walker = chain.from_iterable(os.walk(sub) for sub in sub_paths)
        else:
            walker = os.walk(top_path)
        for dirpath, _, filenames in walker:
            for filename in filenames:
                pth = Path(dirpath, filename).resolve()
                # Path as seen while walking, relative to the top dir; used
                # for display, exclusion patterns and lexer lookup.
                filename = Path(
                    os.path.normpath(os.path.join(dirpath, filename))
                ).relative_to(top_path)
                ################
                # excluding ...
                # ignore none-files
                if not pth.is_file():
                    continue
                # ignore symlinks not in the dir
                try:
                    subdir_parts = pth.parent.relative_to(top_path).parts
                except ValueError:
                    continue
                # ignore hidden files
                if pth.name.startswith("."):
                    continue
                # ignore files below hidden dirs
                # NOTE(review): this also skips a hidden dir passed via --sub-dir.
                if any(str(part).startswith(".") for part in subdir_parts):
                    continue
                # exclude files from cmd args
                if args.exclude:
                    if any(fnmatch.fnmatch(str(filename), pat) for pat in args.exclude):
                        continue
                ################
                # git-ignore
                if use_gitignore and _is_git_ignored(pth, top_path):
                    continue
                ###################
                # read source file
                with pth.open("rb") as fp:
                    code = fp.read()
                # ignore empty files
                if not code:
                    continue
                try:
                    lexer = get_lexer_for_filename(filename, code)
                except ClassNotFound:
                    # ignore no-supported source files
                    continue
                for filter_ in _FILTERS:
                    lexer.add_filter(filter_)
                # Re-assemble the filtered token stream and drop blank lines.
                lines = [
                    line
                    for line in "".join(
                        s for _, s in lexer.get_tokens(code)  # type: ignore
                    ).splitlines()
                    if line.strip()
                ]
                # ignore empty source files
                if not lines:
                    continue
                code = "\n".join(lines)
                counter += 1
                lines_cnt = len(lines)
                lines_total += lines_cnt
                _lines_dict[lexer.name] = _lines_dict.get(lexer.name, 0) + lines_cnt
                print(
                    f"[{counter:05d}] "
                    f"{shorten(str(filename), 88):88} "
                    f"{shorten(lexer.name, 24):24} "  # type: ignore
                    f"lines: {lines_cnt:3,d}/{lines_total:3,d}",
                    file=sys.stderr,
                )
                formatted = highlight(code, lexer, formatter)
                yield filename, lexer.name, formatted  # type: ignore

    if args.null_formatter:
        for filename, lexer_name, formatted in _gen():
            print(f"{filename} ({lexer_name})", file=args.output)
            print(formatted, file=args.output)
    else:
        context = dict(
            style_defs=fmt_html.get_style_defs(),
            highlights=_gen(),
            title=args.title,
        )
        tpl = Template(args.template.read()) if args.template else DEFAULT_HTML_TEMPLATE
        # Stream so files are processed lazily while the template renders.
        tpl.stream(**context).dump(args.output)
    print(file=sys.stderr)
    print("=" * 79, file=sys.stderr)
    for k, v in _lines_dict.items():
        print(
            f"{shorten(k, 24):24} " f"lines: {v:3,d}",
            file=sys.stderr,
        )
    print("=" * 79, file=sys.stderr)
if __name__ == "__main__":
    # Use sys.exit: the bare ``exit`` helper is installed by the ``site``
    # module and is absent when Python runs with ``-S`` or from a frozen build.
    sys.exit(main(set_args()))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Jinja2 | |
Pygments | |
pathspec |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment