Simplify a LaTeX project by removing unused files and comments, and by downscaling and cropping images.
#!/usr/bin/env python
"""
This tool simplifies a LaTeX project by removing unused files and removing comments.
It also merges tex files into a single file and optionally crops and downscales images
to reduce the size of the output. It is useful when you want to prepare an ArXiv submission.

Usage:
    export-latex.py <path> <output> [options]

Arguments:
    <path>             Path to the input ".tex" file. Can be inside a zip file.
    <output>           Path to the output directory/zip file. If it exists, it will be deleted.

Options:
    --use-bbl          Use the .bbl file instead of the .bib file (for arXiv).
    --overwrite        Overwrite the output file if it exists.
    --ppi=<ppi>        Automatically resize images to the specified pixels per inch (PPI).
    --no-crop          Do not crop images (keep full size; no rescaling is applied to
                       images that would have been cropped).
    --no-pdf           Skip generating the PDF file.
    --no-compress-pdf  Do not compress included PDF files. By default, Ghostscript is
                       used to compress them.
    --verbose          Print detailed log.

Notes:
    The script will try to find all required files by compiling the LaTeX file.
    It relies on the latexmk tool, which must be installed. Both the input and
    the output can be a directory or a zip file. Therefore the script can be used
    on the downloaded Overleaf project zip file. And the output archive can be
    directly uploaded to ArXiv.

Examples:
    export-latex.py my_paper.tex my_paper_arxiv
    export-latex.py my_paper.tex my_paper_arxiv.zip --use-bbl
    export-latex.py overleaf_archive.zip/my_paper.tex my_paper_arxiv.zip --ppi=300

MIT License
Copyright (c) 2023 Jonas Kulhanek

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import contextlib
import os
import sys
import tempfile
from pathlib import Path
from typing import Optional, Tuple
import re
import argparse
import subprocess
import shutil
import zipfile
from PIL import Image
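
# Each patch below is a tuple of (required packages, code inserted before \begin{document},
# code inserted before \end{document}); `_apply_tex_patch` injects it into the document.
# `report_images` is used during the analysis compile: it hooks \includegraphics so that every
# inclusion writes its source file and line, the resolved image path, any trim/crop box, and
# the rendered size into imagedata.aux.
# `replace_images` is used for the final document: it reads imagedata.in (written by this
# script) to substitute each image with its processed counterpart and to drop the clip/trim
# options when the crop has already been applied to the image file itself.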
report_images = ("graphicx calc pgf currfile".split(), r"""
\newwrite\imgstream
\immediate\openout\imgstream=imagedata.aux
\makeatletter
\let\old@gintrim\Gin@trim
\def\new@gintrim{\old@gintrim%
\pgfmathparse{int(round(\Gin@llx))}
\let\outllx\pgfmathresult
\pgfmathparse{int(round(\Gin@lly))}
\let\outlly\pgfmathresult
\pgfmathparse{int(round(\Gin@urx))}
\let\outurx\pgfmathresult
\pgfmathparse{int(round(\Gin@ury))}
\let\outury\pgfmathresult
\pgfmathparse{int(round(\Gin@ollx))}
\immediate\write\imgstream{crop=\outllx,\outlly,\outurx,\outury}}
\let\old@ginsetfile\Gin@setfile
\def\new@ginsetfile#1{%
\immediate\write\imgstream{path=\Gin@base\Gin@ext}%
\old@ginsetfile{#1}}
\let\oldincludegraphics\includegraphics
\providecommand{\includegraphics}{}
\renewcommand{\includegraphics}[2][]{%
\immediate\write\imgstream{source=\currfilepath}%
\immediate\write\imgstream{line=\the\inputlineno}%
\begingroup
\let\Gin@trim\new@gintrim
\let\Gin@setfile\new@ginsetfile
\setbox0=\hbox{\oldincludegraphics[#1]{#2}}%
\immediate\write\imgstream{rendered_size=\the\wd0,\the\ht0,\the\dp0}%
\let\Gin@setfile\old@ginsetfile
\let\Gin@trim\old@gintrim
\immediate\write\imgstream{}%
\endgroup%
\oldincludegraphics[#1]{#2}}
\makeatother
""", r"""
\immediate\closeout\imgstream
""")

replace_images = ([], r"""
\newread\imgstream
\immediate\openin\imgstream=imagedata.in
\makeatletter
\def\new@kvginclip#1{}
\def\new@kvgintrim#1{}
\let\old@kvginclip\KV@Gin@clip
\let\old@kvgintrim\KV@Gin@trim
\let\oldincludegraphics\includegraphics
\providecommand{\includegraphics}{}
\renewcommand{\includegraphics}[2][]{%
\immediate\read\imgstream to \src
\immediate\read\imgstream to \removecrop
\ifnum\removecrop=1
\let\KV@Gin@clip\new@kvginclip
\let\KV@Gin@trim\new@kvgintrim
\fi
\oldincludegraphics[#1]{\src}%
\let\KV@Gin@clip\old@kvginclip
\let\KV@Gin@trim\old@kvgintrim}
\makeatother
""", r"""
\immediate\closein\imgstream
""")

def _apply_tex_patch(tex: str, patch):
    deps, pre, post = patch
    deps = [x for x in deps if f"\\usepackage{{{x}}}" not in tex]
    pre = "".join(f"\\usepackage{{{x}}}\n" for x in deps) + pre
    tex = tex.replace("\\begin{document}", pre + "\n\\begin{document}")
    tex = tex.replace("\\end{document}", post + "\n\\end{document}")
    return tex
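
# Parse the key=value records written to imagedata.aux by the report_images patch.
# Records are separated by blank lines; one dict is produced per \includegraphics call.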
def _parse_image_data(text: str):
    output = [{}]
    for line in text.splitlines():
        if not line:
            output.append({})
            continue
        key, val = line.split("=", 1)
        output[-1][key] = val
    output.pop()
    return output
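
# Compile a patched copy of the document with latexmk to discover what is actually used.
# Returns the set of required .cls/.sty/.bbl/.bib/.bst files (scraped from the .log and .blg
# files), the per-image records from imagedata.aux, and the size of the compiled PDF.
# The generated .bbl is also copied next to the input file so that --use-bbl can pick it up.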
def _find_required_resources(path: Path, verbose=False):
    with open(path, "r", encoding="utf8") as f:
        text = f.read()
    updated_text = _apply_tex_patch(text, report_images)
    tmp_path_stem = f"__tmp_doc_{path.stem}"
    tmp_path = path.with_stem(tmp_path_stem)
    for fname in tmp_path.parent.glob(tmp_path.stem + ".*"):
        os.remove(fname)
    with open(tmp_path, "w") as f:
        f.write(updated_text)
    subprocess.check_call(['latexmk', '-halt-on-error', '-g', '-pdf', tmp_path.name],
                          cwd=tmp_path.parent,
                          **({"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL} if not verbose else {}))  # type: ignore
    with open(tmp_path.parent.joinpath("imagedata.aux"), "r") as f:
        images = _parse_image_data(f.read())
    log = open(tmp_path.with_suffix(".log"), 'r').read()
    output = set()
    for m in re.finditer(r'\(\.\/([^\s\)\n]+)(?:\)|\n|\s|\r\n)', log):
        fname = m.group(1)
        if fname.endswith('.cls') or \
                fname.endswith('.sty') or \
                fname.endswith('.bbl'):
            if fname.startswith(tmp_path.stem):
                fname = str(Path(fname).with_stem(path.stem))
            output.add(fname)
    if tmp_path.with_suffix(".blg").exists():
        log = open(tmp_path.with_suffix(".blg"), 'r').read()
        for m in re.finditer(r'\s([^\s\n]*(?:\.bib|\.bst))\n', log):
            fname = m.group(1)
            if fname.startswith(tmp_path.stem):
                fname = str(Path(fname).with_stem(path.stem))
            output.add(fname)
    if tmp_path.with_suffix(".bbl").exists():
        shutil.copy(tmp_path.with_suffix(".bbl"), path.with_suffix(".bbl"))
    inputsize = tmp_path.with_suffix(".pdf").stat().st_size
    for fname in tmp_path.parent.glob(tmp_path.stem + ".*"):
        os.remove(fname)
    return output, images, inputsize
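
# Strip LaTeX comments: comment-only lines are dropped, everything after the first
# unescaped % is truncated (the % itself is kept), and lines containing 'auto-ignore'
# are left untouched.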
def _remove_tex_comments(content):
    def _remove_comment_line(text):
        if 'auto-ignore' in text:
            return text + '\n'
        if text.lstrip(' ').lstrip('\t').startswith('%'):
            return ''
        match = re.search(r'(?<!\\)%', text)
        if match:
            return text[:match.end()] + '\n'
        else:
            return text + '\n'
    return ''.join(map(_remove_comment_line, content.splitlines()))
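
# Recursively inline \input/\include'd files (appending ".tex" when no known suffix is
# given) and strip comments, producing a single merged TeX source.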
def _read_tex_file(path: Path, root: Optional[Path] = None):
    if root is None:
        root = path.parent
    if not path.suffix == '.tex' and not path.suffix == '.bbl' and not path.suffix == '.cls':
        path = Path(str(path) + '.tex')
    text = open(path, 'r').read()
    text = _remove_tex_comments(text)
    text = re.sub(r'\\(?:input|include){([^}]+)}', lambda m: _read_tex_file(root/m.group(1), root), text)
    return text.rstrip(" \t\n\r")
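
# Query the page size of a PDF in points using Ghostscript's bbox device
# (parses the %%HiResBoundingBox line from its stderr output).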
def _get_pdf_size(pdf_path):
    try:
        # Run Ghostscript with bbox device to extract bounding box info
        result = subprocess.run(
            ['gs', '-q', '-dBATCH', '-dNOPAUSE', '-sDEVICE=bbox', pdf_path],
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            text=True
        )
        # Extract the HiResBoundingBox line
        match = re.search(r'%%HiResBoundingBox:\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)', result.stderr)
        if not match:
            raise ValueError("Could not find HiResBoundingBox in Ghostscript output.")
        x0, y0, x1, y1 = map(float, match.groups())
        width_pts = x1 - x0
        height_pts = y1 - y0
        return width_pts, height_pts
    except Exception as e:
        raise RuntimeError(f"Failed to get PDF size for file {pdf_path}: {e}")


def _format_size(size_bytes: int) -> str:
    if size_bytes < 1024:
        return f"{size_bytes}B"
    elif size_bytes < 1024 * 1024:
        return f"{size_bytes / 1024:.2f}KB"
    else:
        return f"{size_bytes / (1024 * 1024):.2f}MB"
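
# Process a single image referenced by the document. PDFs are either copied verbatim or
# downsampled with Ghostscript to roughly the target PPI; raster images are cropped to the
# recorded trim box and/or resized so their width matches the rendered size at the target PPI.
# Results are memoized in `cache`. Returns the new file name (a sequential number with the
# original extension) and whether the crop/trim options should be removed from \includegraphics.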
def _process_image(input_path: str, output_path: str, data, *, ppi=None,
                   compress_pdf_images=False,
                   no_crop=False, cache) -> Tuple[str, bool]:
    crop = data.get("crop")
    if no_crop:
        crop = None
    width_pt = height_pt = None
    if ppi is not None:
        width_pt, height_pt, *_ = data["rendered_size"].split(",")
        assert width_pt.endswith("pt")
        assert height_pt.endswith("pt")
        width_pt = float(width_pt[:-2])
        height_pt = float(height_pt[:-2])
    _entry = data["path"], crop, width_pt, height_pt
    if _entry in cache:
        return cache[_entry]
    outpath = f"{len(cache)+1:05d}{os.path.splitext(data['path'])[1]}"
    remove_crop = False
    if data["path"].endswith(".pdf"):
        size_bytes = _format_size(os.path.getsize(os.path.join(input_path, data["path"])))
        if not compress_pdf_images or crop is not None or ppi is None:
            shutil.copyfile(os.path.join(input_path, data["path"]), os.path.join(output_path, outpath))
            print(f'Copying file \033[34m{data["path"]}\033[0m (\33[1m{size_bytes}\033[0m)')
            cache[_entry] = outpath, remove_crop
            return outpath, remove_crop
        else:
            # Compress PDF images using ghostscript
            # First get pdf image size in points
            input_size_w_pt, input_size_h_pt = _get_pdf_size(os.path.join(input_path, data["path"]))
            target_w_px = int(round(width_pt * ppi / 72))
            target_h_px = int(round(height_pt * ppi / 72))
            assert crop is None, "Crop is not supported for PDF files with compression"
            dpi = max(target_w_px * 72 / input_size_w_pt, target_h_px * 72 / input_size_h_pt)
            gs_command = ("gs -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite " \
                          f"-dPDFSETTINGS=/screen -dCompatibilityLevel=1.4 " \
                          f"-dDownsampleColorImages=true -dColorImageResolution={int(dpi)} " \
                          f"-dDownsampleGrayImages=true -dGrayImageResolution={int(dpi)} " \
                          f"-dDownsampleMonoImages=true -dMonoImageResolution={int(dpi)}").split() + [
                f'-sOutputFile={os.path.join(output_path, outpath)}',
                os.path.join(input_path, data["path"]),
            ]
            subprocess.check_call(gs_command)
            new_size_bytes = _format_size(os.path.getsize(os.path.join(output_path, outpath)))
            print(f'Compressing PDF file \033[34m{data["path"]}\033[0m using DPI \33[1m{dpi:.3f}\033[0m, compression: \33[1m{size_bytes}\033[0m -> \33[1m{new_size_bytes}\033[0m')
            cache[_entry] = outpath, remove_crop
            return outpath, remove_crop
    image = Image.open(os.path.join(input_path, data["path"]))
    input_size = image.size
    if crop is not None:
        w, h = image.size
        a, b, c, d = tuple(map(int, crop.split(",")))
        image = image.crop((a, h-d, c, h-b))
        remove_crop = True
        del w
    was_resized = False
    if ppi is not None and (not no_crop or data.get("crop") is None):
        assert width_pt is not None
        image_width = int(round(width_pt * ppi / 72))
        if image_width < image.size[0]:
            new_size = (image_width, int(round(image_width / image.size[0] * image.size[1])))
            image = image.resize(new_size)
            if crop is not None:
                print(f'Cropping and resizing image \033[34m{data["path"]}\033[0m \33[1m{input_size}\033[0m, crop \33[1m{crop}\033[0m -> \33[1m{new_size}\033[0m')
            else:
                print(f'Resizing image \033[34m{data["path"]}\033[0m \33[1m{input_size}\033[0m -> \33[1m{new_size}\033[0m')
            image.save(os.path.join(output_path, outpath))
            was_resized = True
    if not was_resized:
        if crop is not None:
            print(f'Cropping image \033[34m{data["path"]}\033[0m \33[1m{input_size}\033[0m, crop \33[1m{crop}\033[0m -> \33[1m{image.size}\033[0m')
            image.save(os.path.join(output_path, outpath))
        else:
            print(f'Copying image \033[34m{data["path"]}\033[0m \33[1m{input_size}\033[0m')
            shutil.copyfile(os.path.join(input_path, data["path"]), os.path.join(output_path, outpath))
    cache[_entry] = outpath, remove_crop
    return outpath, remove_crop
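
# Command-line driver: unpack the input (directory or zip), find the required resources by
# compiling the document, rewrite the images into the output, merge the TeX sources into a
# single patched file, copy the remaining required files, optionally pack everything into a
# zip archive, and finally recompile to verify the result and report the size reduction.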
if __name__ == "__main__":
    pname = sys.argv[0]
    parser = argparse.ArgumentParser(
        description='''This tool simplifies a LaTeX project by removing unused files and removing comments.
It also merges tex files into a single file and optionally crops and downscales images
to reduce the size of the output. It is useful when you want to prepare an ArXiv submission.''',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(f'''notes:
The script will try to find all required files by compiling the LaTeX file.
It relies on the latexmk tool, which must be installed. Both the input and
the output can be a directory or a zip file. Therefore the script can be used
on the downloaded Overleaf project zip file. And the output archive can be
directly uploaded to ArXiv.
examples:
{pname} my_paper.tex my_paper_arxiv
{pname} my_paper.tex my_paper_arxiv.zip --use-bbl
{pname} overleaf_archive.zip/my_paper.tex my_paper_arxiv.zip --ppi=300
Copyright (c) 2023 Jonas Kulhanek, MIT License'''))
    parser.add_argument('path', type=Path, help='Path to the input ".tex" file. Can be inside a zip file')
    parser.add_argument('output', type=Path, help='Path to output directory/zip-file. If it exists, it will be deleted.')
    parser.add_argument('--use-bbl', action='store_true', help='Use the .bbl file instead of the .bib file. (for arxiv)')
    parser.add_argument('--overwrite', action='store_true', help='Overwrite the output file if it exists.')
    parser.add_argument('--ppi', type=float, default=None, help='Specify pixels per inch (PPI) to automatically resize images.')
    parser.add_argument('--no-crop', action='store_true', help='Do not crop images (not recommended). No scaling is applied to images that would have been cropped.')
    parser.add_argument('--no-pdf', action='store_true', help='Skip generating PDF file')
    parser.add_argument('--no-compress-pdf', action='store_true', help='Disable image compression for included PDF files. By default ghostscript is used to compress included PDF files.')
    parser.add_argument('--verbose', action='store_true', help='Print detailed log.')
    args = parser.parse_args()
    args.path = args.path.absolute()
    args.output = args.output.absolute()
    if os.path.exists(args.output):
        if args.overwrite:
            if args.output.is_dir():
                shutil.rmtree(args.output)
            else:
                os.remove(args.output)
        else:
            print(f"\033[91;1mOutput path {args.output} already exists. Use --overwrite to delete it.\033[0m", file=sys.stderr)
            sys.exit(1)
    if args.output.suffix == ".zip":
        output_ctx_wrapper = tempfile.TemporaryDirectory()
    else:
        output_ctx_wrapper = contextlib.nullcontext(str(args.output))
    if args.path.parent.suffix == ".zip":
        input_ctx_wrapper = tempfile.TemporaryDirectory()
    else:
        input_ctx_wrapper = contextlib.nullcontext(str(args.path.parent))
    with output_ctx_wrapper as output_dir, input_ctx_wrapper as input_path:
        if args.path.parent.suffix == ".zip":
            with zipfile.ZipFile(args.path.parent, 'r') as zip_ref:
                zip_ref.extractall(input_path)
        path = Path(input_path)/args.path.name
        output = Path(output_dir)
        output.mkdir(exist_ok=True, parents=True)
        print("\033[93;1mFinding required resources...\033[0m")
        required_files, image_data, inputsize = _find_required_resources(path, verbose=args.verbose)
        # Process images
        _cache = {}
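        # imagedata.in lists, for each \includegraphics in document order, the substituted
        # file name followed by a 0/1 flag telling the replace_images patch whether to drop
        # the crop/trim options (1 when the crop was already applied to the file on disk).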
        with open(output / "imagedata.in", "w") as f:
            for data in image_data:
                img_name, remove_crop = _process_image(
                    input_path,
                    output_dir,
                    data,
                    cache=_cache,
                    ppi=args.ppi,
                    compress_pdf_images=not args.no_compress_pdf,
                    no_crop=args.no_crop)
                f.write(f"{img_name}\n")
                f.write("1\n" if remove_crop else "0\n")
        # Prepare latex file
        tex_content = _read_tex_file(path)
        tex_content = _apply_tex_patch(tex_content, replace_images)
        # Copy required files
        root = path.parent
        if args.use_bbl:
            required_files = set(x for x in required_files if not x.lower().endswith('.bib') and not x.lower().endswith('.bst'))
        else:
            required_files = set(x for x in required_files if not x.lower().endswith('.bbl'))
        for f in required_files:
            (output / f).parent.mkdir(parents=True, exist_ok=True)
            _, extension = os.path.splitext(f)
            if extension in {'.cls', '.tex'}:
                with open(output / f, 'w+') as fout:
                    fout.write(_read_tex_file(root/f, root))
            else:
                if (root / f).exists():
                    shutil.copyfile(root / f, output / f)
                else:
                    # File does not exist, but LaTeX compiled, so it is probably provided by the TeX distribution.
                    # print(f"\033[91;1mRequired file {f} not found.\033[0m", file=sys.stderr)
                    pass
        # Write tex file
        print("\033[93;1mWriting output file...\033[0m")
        with open(output / args.path.name, 'w+') as f:
            f.write(tex_content)
        if args.output.suffix == ".zip":
            with zipfile.ZipFile(args.output, 'w') as zip_file:
                for f in output.glob("**/*"):
                    zip_file.write(f, f.relative_to(output))
        print(f"\033[93;1mOutput written to \033[94;1m{args.output}\033[0m")
        if not args.no_pdf:
            print("\033[93;1mCompiling PDF...\033[0m")
            current_files = list(output.glob("*"))
            subprocess.check_call(['latexmk', '-halt-on-error', '-g', '-pdf', args.path.name],
                                  cwd=output,
                                  **({"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL} if not args.verbose else {}))  # type: ignore
            shutil.move(output / args.path.with_suffix('.pdf').name, args.output.with_suffix('.pdf'))
            for f in output.glob("*"):
                if f not in current_files:
                    os.remove(f)
            print(f"\033[93;1mPDF written to \033[94;1m{args.output.with_suffix('.pdf')}\033[0m")
            outputsize = args.output.with_suffix('.pdf').stat().st_size
            print(f"\033[93;1mSize: \033[92;1m{inputsize/1024/1024:.2f}\033[0mMB -> \033[92;1m{outputsize/1024/1024:.2f}\033[0mMB")