Simplify a LaTeX project by removing unused files and comments, and by downscaling and cropping images.
#!/usr/bin/env python
"""
This tool simplifies a LaTeX project by removing unused files and removing comments.
It also merges tex files into a single file and optionally crops and downscales images
to reduce the size of the output. It is useful when you want to prepare an ArXiv submission.
Usage:
export-latex.py <path> <output> [options]
Arguments:
<path> Path to the input ".tex" file. Can be inside a zip file
<output> Path to output directory/zip-file. If it exists, it will be deleted.
Options:
--use-bbl Use the .bbl file instead of the .bib file. (for arxiv)
--overwrite Override output file if it exists.
--ppi=<ppi> Automatically resize images to the specified pixels per inch (PPI).
--no-crop Do not crop images (keep full size; no scaling is applied to images that would have been cropped).
--no-pdf Skip generating PDF file
--no-compress-pdf By default ghostscript is used to compress included PDF files.
--verbose Print detailed log.
Notes:
The script will try to find all required files by compiling the LaTeX file.
It relies on the latexmk tool, which must be installed. Both the input and
the output can be a directory or a zip file, so the script can be used
on a downloaded Overleaf project zip file, and the output archive can be
uploaded directly to arXiv.
Examples:
export-latex.py my_paper.tex my_paper_arxiv
export-latex.py my_paper.tex my_paper_arxiv.zip --use-bbl
export-latex.py overleaf_archive.zip/my_paper.tex my_paper_arxiv.zip --ppi=300
MIT License
Copyright (c) 2023 Jonas Kulhanek
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import contextlib
import os
import sys
import tempfile
from pathlib import Path
from typing import Optional, Tuple
import re
import argparse
import subprocess
import shutil
import zipfile
from PIL import Image
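# Each LaTeX patch below is a tuple of (required packages, code injected before
# \begin{document}, code injected before \end{document}).
#
# `report_images` instruments a trial compile: it hooks \Gin@trim, \Gin@setfile and
# \includegraphics so that, for every included image, the resolved file path, the
# crop box, the source .tex file and line number, and the rendered size are written
# to imagedata.aux.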
report_images = ("graphicx calc pgf currfile".split(), r"""
\newwrite\imgstream
\immediate\openout\imgstream=imagedata.aux
\makeatletter
\let\old@gintrim\Gin@trim
\def\new@gintrim{\old@gintrim%
\pgfmathparse{int(round(\Gin@llx))}
\let\outllx\pgfmathresult
\pgfmathparse{int(round(\Gin@lly))}
\let\outlly\pgfmathresult
\pgfmathparse{int(round(\Gin@urx))}
\let\outurx\pgfmathresult
\pgfmathparse{int(round(\Gin@ury))}
\let\outury\pgfmathresult
\pgfmathparse{int(round(\Gin@ollx))}
\immediate\write\imgstream{crop=\outllx,\outlly,\outurx,\outury}}
\let\old@ginsetfile\Gin@setfile
\def\new@ginsetfile#1{%
\immediate\write\imgstream{path=\Gin@base\Gin@ext}%
\old@ginsetfile{#1}}
\let\oldincludegraphics\includegraphics
\providecommand{\includegraphics}{}
\renewcommand{\includegraphics}[2][]{%
\immediate\write\imgstream{source=\currfilepath}%
\immediate\write\imgstream{line=\the\inputlineno}%
\begingroup
\let\Gin@trim\new@gintrim
\let\Gin@setfile\new@ginsetfile
\setbox0=\hbox{\oldincludegraphics[#1]{#2}}%
\immediate\write\imgstream{rendered_size=\the\wd0,\the\ht0,\the\dp0}%
\let\Gin@setfile\old@ginsetfile
\let\Gin@trim\old@gintrim
\immediate\write\imgstream{}%
\endgroup%
\oldincludegraphics[#1]{#2}}
\makeatother
""", r"""
\immediate\closeout\imgstream
""")
replace_images = ([], r"""
\newread\imgstream
\immediate\openin\imgstream=imagedata.in
\makeatletter
\def\new@kvginclip#1{}
\def\new@kvgintrim#1{}
\let\old@kvginclip\KV@Gin@clip
\let\old@kvgintrim\KV@Gin@trim
\let\oldincludegraphics\includegraphics
\providecommand{\includegraphics}{}
\renewcommand{\includegraphics}[2][]{%
\immediate\read\imgstream to \src
\immediate\read\imgstream to \removecrop
\ifnum\removecrop=1
\let\KV@Gin@clip\new@kvginclip
\let\KV@Gin@trim\new@kvgintrim
\fi
\oldincludegraphics[#1]{\src}%
\let\KV@Gin@clip\old@kvginclip
\let\KV@Gin@trim\old@kvgintrim}
\makeatother
""", r"""
\immediate\closein\imgstream
""")
def _apply_tex_patch(tex: str, patch):
deps, pre, post = patch
deps = [x for x in deps if f"\\usepackage{{{x}}}" not in tex]
pre = "".join(f"\\usepackage{{{x}}}\n" for x in deps) + pre
tex = tex.replace("\\begin{document}", pre + "\n\\begin{document}")
tex = tex.replace("\\end{document}", post + "\n\\end{document}")
return tex
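# Parse imagedata.aux: blank-line-separated blocks of key=value lines, one dict per image.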
def _parse_image_data(text: str):
output = [{}]
for line in text.splitlines():
if not line:
output.append({})
continue
key, val = line.split("=", 1)
output[-1][key] = val
output.pop()
return output
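# Compile an instrumented copy of the document with latexmk to discover which resources
# are actually used: image data from imagedata.aux, plus class/style/bibliography files
# (.cls/.sty/.bbl/.bib/.bst) found in the .log and .blg output.
# Returns (required file names, image data, size of the compiled input PDF).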
def _find_required_resources(path: Path, verbose=False):
with open(path, "r", encoding="utf8") as f:
text = f.read()
updated_text = _apply_tex_patch(text, report_images)
tmp_path_stem = f"__tmp_doc_{path.stem}"
tmp_path = path.with_stem(tmp_path_stem)
for fname in tmp_path.parent.glob(tmp_path.stem + ".*"):
os.remove(fname)
with open(tmp_path, "w") as f:
f.write(updated_text)
subprocess.check_call(['latexmk', '-halt-on-error', '-g', '-pdf', tmp_path.name],
cwd=tmp_path.parent,
**({"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL} if not verbose else {})) # type: ignore
with open(tmp_path.parent.joinpath("imagedata.aux"), "r") as f:
images = _parse_image_data(f.read())
log = open(tmp_path.with_suffix(".log"), 'r').read()
output = set()
for m in re.finditer(r'\(\.\/([^\s\)\n]+)(?:\)|\n|\s|\r\n)', log):
fname = m.group(1)
if fname.endswith('.cls') or \
fname.endswith('.sty') or \
fname.endswith('.bbl'):
if fname.startswith(tmp_path.stem):
fname = str(Path(fname).with_stem(path.stem))
output.add(fname)
if tmp_path.with_suffix(".blg").exists():
log = open(tmp_path.with_suffix(".blg"), 'r').read()
for m in re.finditer(r'\s([^\s\n]*(?:.bib|.bst))\n', log):
fname = m.group(1)
if fname.startswith(tmp_path.stem):
fname = str(Path(fname).with_stem(path.stem))
output.add(fname)
if tmp_path.with_suffix(".bbl").exists():
shutil.copy(tmp_path.with_suffix(".bbl"), path.with_suffix(".bbl"))
inputsize = tmp_path.with_suffix(".pdf").stat().st_size
for fname in tmp_path.parent.glob(tmp_path.stem + ".*"):
os.remove(fname)
return output, images, inputsize
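# Strip LaTeX comments: drop full comment lines and text after an unescaped %,
# keeping lines marked with 'auto-ignore' untouched.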
def _remove_tex_comments(content):
def _remove_comment_line(text):
if 'auto-ignore' in text:
return text + '\n'
if text.lstrip(' ').lstrip('\t').startswith('%'):
return ''
match = re.search(r'(?<!\\)%', text)
if match:
return text[:match.end()] + '\n'
else:
return text + '\n'
return ''.join(map(_remove_comment_line, content.splitlines()))
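# Read a .tex/.bbl/.cls file, remove comments, and recursively inline \input/\include
# files so the project collapses into a single source string.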
def _read_tex_file(path: Path, root: Optional[Path] = None):
if root is None:
root = path.parent
if not path.suffix == '.tex' and not path.suffix == '.bbl' and not path.suffix == '.cls':
path = Path(str(path) + '.tex')
text = open(path, 'r').read()
text = _remove_tex_comments(text)
text = re.sub(r'\\(?:input|include){([^}]+)}', lambda m: _read_tex_file(root/m.group(1), root), text)
return text.rstrip(" \t\n\r")
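# Return the (width, height) of a PDF in points, using Ghostscript's bbox device.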
def _get_pdf_size(pdf_path):
try:
# Run Ghostscript with bbox device to extract bounding box info
result = subprocess.run(
['gs', '-q', '-dBATCH', '-dNOPAUSE', '-sDEVICE=bbox', pdf_path],
stderr=subprocess.PIPE,
stdout=subprocess.PIPE,
text=True
)
# Extract the HiResBoundingBox line
match = re.search(r'%%HiResBoundingBox:\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)', result.stderr)
if not match:
raise ValueError("Could not find HiResBoundingBox in Ghostscript output.")
x0, y0, x1, y1 = map(float, match.groups())
width_pts = x1 - x0
height_pts = y1 - y0
return width_pts, height_pts
except Exception as e:
raise RuntimeError(f"Failed to get PDF size for file {pdf_path}: {e}")
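# Format a byte count as a human-readable string (B/KB/MB).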
def _format_size(size_bytes: int) -> str:
if size_bytes < 1024:
return f"{size_bytes}B"
elif size_bytes < 1024 * 1024:
return f"{size_bytes / 1024:.2f}KB"
else:
return f"{size_bytes / (1024 * 1024):.2f}MB"
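# Copy, crop, downscale, or (for included PDFs) Ghostscript-compress a single image
# into the output directory under a sequential name. Returns (new file name, whether
# the crop/trim options should be dropped in the TeX source). Results are cached per
# (path, crop, rendered size) so reused images are processed only once.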
def _process_image(input_path: str, output_path: str, data, *, ppi=None,
compress_pdf_images=False,
no_crop=False, cache) -> Tuple[str, bool]:
crop = data.get("crop")
if no_crop:
crop = None
width_pt = height_pt = None
if ppi is not None:
width_pt, height_pt, *_ = data["rendered_size"].split(",")
assert width_pt.endswith("pt")
assert height_pt.endswith("pt")
width_pt = float(width_pt[:-2])
height_pt = float(height_pt[:-2])
_entry = data["path"], crop, width_pt, height_pt
if _entry in cache:
return cache[_entry]
outpath = f"{len(cache)+1:05d}{os.path.splitext(data['path'])[1]}"
remove_crop = False
if data["path"].endswith(".pdf"):
size_bytes = _format_size(os.path.getsize(os.path.join(input_path, data["path"])))
if not compress_pdf_images or crop is not None or ppi is None:
shutil.copyfile(os.path.join(input_path, data["path"]), os.path.join(output_path, outpath))
print(f'Copying file \033[34m{data["path"]}\033[0m (\33[1m{size_bytes}\033[0m)')
cache[_entry] = outpath, remove_crop
return outpath, remove_crop
else:
# Compress PDF images using ghostscript
# First get pdf image size in points
input_size_w_pt, input_size_h_pt = _get_pdf_size(os.path.join(input_path, data["path"]))
target_w_px = int(round(width_pt * ppi / 72))
target_h_px = int(round(height_pt * ppi / 72))
assert crop is None, "Crop is not supported for PDF files with compression"
dpi = max(target_w_px * 72 / input_size_w_pt, target_h_px * 72 / input_size_h_pt)
gs_command = ("gs -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite " \
f"-dPDFSETTINGS=/screen -dCompatibilityLevel=1.4 " \
f"-dDownsampleColorImages=true -dColorImageResolution={int(dpi)} " \
f"-dDownsampleGrayImages=true -dGrayImageResolution={int(dpi)} " \
f"-dDownsampleMonoImages=true -dMonoImageResolution={int(dpi)}").split() + [
f'-sOutputFile={os.path.join(output_path, outpath)}',
os.path.join(input_path, data["path"]),
]
subprocess.check_call(gs_command)
new_size_bytes = _format_size(os.path.getsize(os.path.join(output_path, outpath)))
print(f'Compressing PDF file \033[34m{data["path"]}\033[0m using DPI \33[1m{dpi:.3f}\033[0m, compression: \33[1m{size_bytes}\033[0m -> \33[1m{new_size_bytes}\033[0m')
cache[_entry] = outpath, remove_crop
return outpath, remove_crop
image = Image.open(os.path.join(input_path, data["path"]))
input_size = image.size
if crop is not None:
w, h = image.size
a, b, c, d = tuple(map(int, crop.split(",")))
image = image.crop((a, h-d, c, h-b))
remove_crop = True
del w
was_resized = False
if ppi is not None and (not no_crop or data.get("crop") is None):
assert width_pt is not None
image_width = int(round(width_pt * ppi / 72))
if image_width < image.size[0]:
new_size = (image_width, int(round(image_width / image.size[0] * image.size[1])))
image = image.resize(new_size)
if crop is not None:
print(f'Cropping and resizing image \033[34m{data["path"]}\033[0m \33[1m{input_size}\033[0m, crop \33[1m{crop}\033[0m -> \33[1m{new_size}\033[0m')
else:
print(f'Resizing image \033[34m{data["path"]}\033[0m \33[1m{input_size}\033[0m -> \33[1m{new_size}\033[0m')
image.save(os.path.join(output_path, outpath))
was_resized = True
if not was_resized:
if crop is not None:
print(f'Cropping image \033[34m{data["path"]}\033[0m \33[1m{input_size}\033[0m, crop \33[1m{crop}\033[0m -> \33[1m{image.size}\033[0m')
image.save(os.path.join(output_path, outpath))
else:
print(f'Copying image \033[34m{data["path"]}\033[0m \33[1m{input_size}\033[0m')
shutil.copyfile(os.path.join(input_path, data["path"]), os.path.join(output_path, outpath))
cache[_entry] = outpath, remove_crop
return outpath, remove_crop
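# Command-line entry point: unpack zip input if needed, find required resources,
# process images, merge and patch the .tex sources, copy the remaining files, and
# optionally compile a final PDF.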
if __name__ == "__main__":
pname = sys.argv[0]
parser = argparse.ArgumentParser(
description='''This tool simplifies a LaTeX project by removing unused files and removing comments.
It also merges tex files into a single file and optionally crops and downscales images
to reduce the size of the output. It is useful when you want to prepare an ArXiv submission.''',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=(f'''notes:
The script will try to find all required files by compiling the LaTeX file.
It relies on the latexmk tool, which must be installed. Both the input and
the output can be a directory or a zip file, so the script can be used
on a downloaded Overleaf project zip file, and the output archive can be
uploaded directly to arXiv.
examples:
{pname} my_paper.tex my_paper_arxiv
{pname} my_paper.tex my_paper_arxiv.zip --use-bbl
{pname} overleaf_archive.zip/my_paper.tex my_paper_arxiv.zip --ppi=300
Copyright (c) 2023 Jonas Kulhanek, MIT License'''))
parser.add_argument('path', type=Path, help='Path to the input ".tex" file. Can be inside a zip file')
parser.add_argument('output', type=Path, help='Path to output directory/zip-file. If it exists, it will be deleted.')
parser.add_argument('--use-bbl', action='store_true', help='Use the .bbl file instead of the .bib file. (for arxiv)')
parser.add_argument('--overwrite', action='store_true', help='Override output file if it exists.')
parser.add_argument('--ppi', type=float, default=None, help='Specify pixels per inch (PPI) to automatically resize images.')
parser.add_argument('--no-crop', action='store_true', help='Do not crop images (not recommended). No scaling is applied to images that would otherwise have been cropped.')
parser.add_argument('--no-pdf', action='store_true', help='Skip generating PDF file')
parser.add_argument('--no-compress-pdf', action='store_true', help='Disable image compression for included PDF files. By default ghostscript is used to compress included PDF files.')
parser.add_argument('--verbose', action='store_true', help='Print detailed log.')
args = parser.parse_args()
args.path = args.path.absolute()
args.output = args.output.absolute()
if os.path.exists(args.output):
if args.overwrite:
if args.output.is_dir():
shutil.rmtree(args.output)
else:
os.remove(args.output)
else:
print(f"\033[91;1mOutput path {args.output} already exists. Use --overwrite to delete it.\033[0m", file=sys.stderr)
sys.exit(1)
if args.output.suffix == ".zip":
output_ctx_wrapper = tempfile.TemporaryDirectory()
else:
output_ctx_wrapper = contextlib.nullcontext(str(args.output))
if args.path.parent.suffix == ".zip":
input_ctx_wrapper = tempfile.TemporaryDirectory()
else:
input_ctx_wrapper = contextlib.nullcontext(str(args.path.parent))
with output_ctx_wrapper as output_dir, input_ctx_wrapper as input_path:
if args.path.parent.suffix == ".zip":
with zipfile.ZipFile(args.path.parent, 'r') as zip_ref:
zip_ref.extractall(input_path)
path = Path(input_path)/args.path.name
output = Path(output_dir)
output.mkdir(exist_ok=True, parents=True)
print("\033[93;1mFinding required resources...\033[0m")
required_files, image_data, inputsize = _find_required_resources(path, verbose=args.verbose)
# Process images
_cache = {}
with open(output / "imagedata.in", "w") as f:
for data in image_data:
img_name, remove_crop = _process_image(
input_path,
output_dir,
data,
cache=_cache,
ppi=args.ppi,
compress_pdf_images=not args.no_compress_pdf,
no_crop=args.no_crop)
f.write(f"{img_name}\n")
f.write("1\n" if remove_crop else "0\n")
# Prepare latex file
tex_content = _read_tex_file(path)
tex_content = _apply_tex_patch(tex_content, replace_images)
# Copy required files
root = path.parent
if args.use_bbl:
required_files = set(x for x in required_files if not x.lower().endswith('.bib') and not x.lower().endswith('.bst'))
else:
required_files = set(x for x in required_files if not x.lower().endswith('.bbl'))
for f in required_files:
(output / f).parent.mkdir(parents=True, exist_ok=True)
_, extension = os.path.splitext(f)
if extension in {'.cls', '.tex'}:
with open(output / f, 'w+') as fout:
fout.write(_read_tex_file(root/f, root))
else:
if (root / f).exists():
shutil.copyfile(root / f, output / f)
else:
# File does not exist, but LaTeX compiled, so it is probably provided by the TeX distribution
# print(f"\033[91;1mRequired file {f} not found.\033[0m", file=sys.stderr)
pass
# Write tex file
print("\033[93;1mWriting output file...\033[0m")
with open(output / args.path.name, 'w+') as f:
f.write(tex_content)
if args.output.suffix == ".zip":
with zipfile.ZipFile(args.output, 'w') as zip_file:
for f in output.glob("**/*"):
zip_file.write(f, f.relative_to(output))
print(f"\033[93;1mOutput written to \033[94;1m{args.output}\033[0m")
if not args.no_pdf:
print("\033[93;1mCompiling PDF...\033[0m")
current_files = list(output.glob("*"))
subprocess.check_call(['latexmk', '-halt-on-error', '-g', '-pdf', args.path.name],
cwd=output,
**({"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL} if not args.verbose else {})) # type: ignore
shutil.move(output / args.path.with_suffix('.pdf').name, args.output.with_suffix('.pdf'))
for f in output.glob("*"):
if f not in current_files:
os.remove(f)
print(f"\033[93;1mPDF written to \033[94;1m{args.output.with_suffix('.pdf')}\033[0m")
outputsize = args.output.with_suffix('.pdf').stat().st_size
print(f"\033[93;1mSize: \033[92;1m{inputsize/1024/1024:.2f}\033[0mMB -> \033[92;1m{outputsize/1024/1024:.2f}\033[0mMB")