Simplify a LaTeX project by removing unused files and comments, and by downscaling and cropping images.
#!/usr/bin/env python
"""
This tool simplifies a LaTeX project by removing unused files and removing comments.
It also merges tex files into a single file and optionally crops and downscales images
to reduce the size of the output. It is useful when you want to prepare an ArXiv submission.

Usage:
    export-latex.py <path> <output> [options]

Arguments:
    <path>             Path to the input ".tex" file. Can be inside a zip file.
    <output>           Path to the output directory/zip file. If it exists, it will be deleted.

Options:
    --use-bbl          Use the .bbl file instead of the .bib file (for arXiv).
    --overwrite        Overwrite the output file if it exists.
    --ppi=<ppi>        Automatically resize images to the specified pixels per inch (PPI).
    --no-crop          Do not crop images (keep full size; no rescaling is applied to
                       images that would have been cropped).
    --no-pdf           Skip generating the PDF file.
    --no-compress-pdf  Do not compress included PDF files. By default, Ghostscript is
                       used to compress them.
    --verbose          Print detailed log.

Notes:
    The script will try to find all required files by compiling the LaTeX file.
    It relies on the latexmk tool, which must be installed. Both the input and
    the output can be a directory or a zip file. Therefore the script can be used
    on the downloaded Overleaf project zip file. And the output archive can be
    directly uploaded to ArXiv.

Examples:
    export-latex.py my_paper.tex my_paper_arxiv
    export-latex.py my_paper.tex my_paper_arxiv.zip --use-bbl
    export-latex.py overleaf_archive.zip/my_paper.tex my_paper_arxiv.zip --ppi=300

MIT License
Copyright (c) 2023 Jonas Kulhanek

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import contextlib
import os
import sys
import tempfile
from pathlib import Path
from typing import Optional, Tuple
import re
import argparse
import subprocess
import shutil
import zipfile
from PIL import Image
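
# Each patch below is a tuple of (required packages, code inserted before \begin{document},
# code inserted before \end{document}); `_apply_tex_patch` injects it into the document.
# `report_images` is used during the analysis compile: it hooks \includegraphics so that every
# inclusion writes its source file and line, the resolved image path, any trim/crop box, and
# the rendered size into imagedata.aux.
# `replace_images` is used for the final document: it reads imagedata.in (written by this
# script) to substitute each image with its processed counterpart and to drop the clip/trim
# options when the crop has already been applied to the image file itself.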
report_images = ("graphicx calc pgf currfile".split(), r"""
\newwrite\imgstream
\immediate\openout\imgstream=imagedata.aux
\makeatletter
\let\old@gintrim\Gin@trim
\def\new@gintrim{\old@gintrim%
\pgfmathparse{int(round(\Gin@llx))}
\let\outllx\pgfmathresult
\pgfmathparse{int(round(\Gin@lly))}
\let\outlly\pgfmathresult
\pgfmathparse{int(round(\Gin@urx))}
\let\outurx\pgfmathresult
\pgfmathparse{int(round(\Gin@ury))}
\let\outury\pgfmathresult
\pgfmathparse{int(round(\Gin@ollx))}
\immediate\write\imgstream{crop=\outllx,\outlly,\outurx,\outury}}
\let\old@ginsetfile\Gin@setfile
\def\new@ginsetfile#1{%
\immediate\write\imgstream{path=\Gin@base\Gin@ext}%
\old@ginsetfile{#1}}
\let\oldincludegraphics\includegraphics
\providecommand{\includegraphics}{}
\renewcommand{\includegraphics}[2][]{%
\immediate\write\imgstream{source=\currfilepath}%
\immediate\write\imgstream{line=\the\inputlineno}%
\begingroup
\let\Gin@trim\new@gintrim
\let\Gin@setfile\new@ginsetfile
\setbox0=\hbox{\oldincludegraphics[#1]{#2}}%
\immediate\write\imgstream{rendered_size=\the\wd0,\the\ht0,\the\dp0}%
\let\Gin@setfile\old@ginsetfile
\let\Gin@trim\old@gintrim
\immediate\write\imgstream{}%
\endgroup%
\oldincludegraphics[#1]{#2}}
\makeatother
""", r"""
\immediate\closeout\imgstream
""")

replace_images = ([], r"""
\newread\imgstream
\immediate\openin\imgstream=imagedata.in
\makeatletter
\def\new@kvginclip#1{}
\def\new@kvgintrim#1{}
\let\old@kvginclip\KV@Gin@clip
\let\old@kvgintrim\KV@Gin@trim
\let\oldincludegraphics\includegraphics
\providecommand{\includegraphics}{}
\renewcommand{\includegraphics}[2][]{%
\immediate\read\imgstream to \src
\immediate\read\imgstream to \removecrop
\ifnum\removecrop=1
\let\KV@Gin@clip\new@kvginclip
\let\KV@Gin@trim\new@kvgintrim
\fi
\oldincludegraphics[#1]{\src}%
\let\KV@Gin@clip\old@kvginclip
\let\KV@Gin@trim\old@kvgintrim}
\makeatother
""", r"""
\immediate\closein\imgstream
""")

def _apply_tex_patch(tex: str, patch):
    deps, pre, post = patch
    deps = [x for x in deps if f"\\usepackage{{{x}}}" not in tex]
    pre = "".join(f"\\usepackage{{{x}}}\n" for x in deps) + pre
    tex = tex.replace("\\begin{document}", pre + "\n\\begin{document}")
    tex = tex.replace("\\end{document}", post + "\n\\end{document}")
    return tex
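
# Parse the key=value records written to imagedata.aux by the report_images patch.
# Records are separated by blank lines; one dict is produced per \includegraphics call.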
def _parse_image_data(text: str):
    output = [{}]
    for line in text.splitlines():
        if not line:
            output.append({})
            continue
        key, val = line.split("=", 1)
        output[-1][key] = val
    output.pop()
    return output
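
# Compile a patched copy of the document with latexmk to discover what is actually used.
# Returns the set of required .cls/.sty/.bbl/.bib/.bst files (scraped from the .log and .blg
# files), the per-image records from imagedata.aux, and the size of the compiled PDF.
# The generated .bbl is also copied next to the input file so that --use-bbl can pick it up.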
def _find_required_resources(path: Path, verbose=False):
    with open(path, "r", encoding="utf8") as f:
        text = f.read()
    updated_text = _apply_tex_patch(text, report_images)
    tmp_path_stem = f"__tmp_doc_{path.stem}"
    tmp_path = path.with_stem(tmp_path_stem)
    for fname in tmp_path.parent.glob(tmp_path.stem + ".*"):
        os.remove(fname)
    with open(tmp_path, "w") as f:
        f.write(updated_text)
    subprocess.check_call(['latexmk', '-halt-on-error', '-g', '-pdf', tmp_path.name],
                          cwd=tmp_path.parent,
                          **({"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL} if not verbose else {}))  # type: ignore
    with open(tmp_path.parent.joinpath("imagedata.aux"), "r") as f:
        images = _parse_image_data(f.read())
    log = open(tmp_path.with_suffix(".log"), 'r').read()
    output = set()
    for m in re.finditer(r'\(\.\/([^\s\)\n]+)(?:\)|\n|\s|\r\n)', log):
        fname = m.group(1)
        if fname.endswith('.cls') or \
                fname.endswith('.sty') or \
                fname.endswith('.bbl'):
            if fname.startswith(tmp_path.stem):
                fname = str(Path(fname).with_stem(path.stem))
            output.add(fname)
    if tmp_path.with_suffix(".blg").exists():
        log = open(tmp_path.with_suffix(".blg"), 'r').read()
        for m in re.finditer(r'\s([^\s\n]*(?:\.bib|\.bst))\n', log):
            fname = m.group(1)
            if fname.startswith(tmp_path.stem):
                fname = str(Path(fname).with_stem(path.stem))
            output.add(fname)
    if tmp_path.with_suffix(".bbl").exists():
        shutil.copy(tmp_path.with_suffix(".bbl"), path.with_suffix(".bbl"))
    inputsize = tmp_path.with_suffix(".pdf").stat().st_size
    for fname in tmp_path.parent.glob(tmp_path.stem + ".*"):
        os.remove(fname)
    return output, images, inputsize
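
# Strip LaTeX comments: comment-only lines are dropped, everything after the first
# unescaped % is truncated (the % itself is kept), and lines containing 'auto-ignore'
# are left untouched.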
def _remove_tex_comments(content):
    def _remove_comment_line(text):
        if 'auto-ignore' in text:
            return text + '\n'
        if text.lstrip(' ').lstrip('\t').startswith('%'):
            return ''
        match = re.search(r'(?<!\\)%', text)
        if match:
            return text[:match.end()] + '\n'
        else:
            return text + '\n'
    return ''.join(map(_remove_comment_line, content.splitlines()))
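
# Recursively inline \input/\include'd files (appending ".tex" when no known suffix is
# given) and strip comments, producing a single merged TeX source.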
def _read_tex_file(path: Path, root: Optional[Path] = None):
    if root is None:
        root = path.parent
    if not path.suffix == '.tex' and not path.suffix == '.bbl' and not path.suffix == '.cls':
        path = Path(str(path) + '.tex')
    text = open(path, 'r').read()
    text = _remove_tex_comments(text)
    text = re.sub(r'\\(?:input|include){([^}]+)}', lambda m: _read_tex_file(root/m.group(1), root), text)
    return text.rstrip(" \t\n\r")
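
# Query the page size of a PDF in points using Ghostscript's bbox device
# (parses the %%HiResBoundingBox line from its stderr output).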
def _get_pdf_size(pdf_path):
    try:
        # Run Ghostscript with bbox device to extract bounding box info
        result = subprocess.run(
            ['gs', '-q', '-dBATCH', '-dNOPAUSE', '-sDEVICE=bbox', pdf_path],
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            text=True
        )
        # Extract the HiResBoundingBox line
        match = re.search(r'%%HiResBoundingBox:\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)', result.stderr)
        if not match:
            raise ValueError("Could not find HiResBoundingBox in Ghostscript output.")
        x0, y0, x1, y1 = map(float, match.groups())
        width_pts = x1 - x0
        height_pts = y1 - y0
        return width_pts, height_pts
    except Exception as e:
        raise RuntimeError(f"Failed to get PDF size for file {pdf_path}: {e}")


def _format_size(size_bytes: int) -> str:
    if size_bytes < 1024:
        return f"{size_bytes}B"
    elif size_bytes < 1024 * 1024:
        return f"{size_bytes / 1024:.2f}KB"
    else:
        return f"{size_bytes / (1024 * 1024):.2f}MB"
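
# Process a single image referenced by the document. PDFs are either copied verbatim or
# downsampled with Ghostscript to roughly the target PPI; raster images are cropped to the
# recorded trim box and/or resized so their width matches the rendered size at the target PPI.
# Results are memoized in `cache`. Returns the new file name (a sequential number with the
# original extension) and whether the crop/trim options should be removed from \includegraphics.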
def _process_image(input_path: str, output_path: str, data, *, ppi=None,
                   compress_pdf_images=False,
                   no_crop=False, cache) -> Tuple[str, bool]:
    crop = data.get("crop")
    if no_crop:
        crop = None
    width_pt = height_pt = None
    if ppi is not None:
        width_pt, height_pt, *_ = data["rendered_size"].split(",")
        assert width_pt.endswith("pt")
        assert height_pt.endswith("pt")
        width_pt = float(width_pt[:-2])
        height_pt = float(height_pt[:-2])
    _entry = data["path"], crop, width_pt, height_pt
    if _entry in cache:
        return cache[_entry]
    outpath = f"{len(cache)+1:05d}{os.path.splitext(data['path'])[1]}"
    remove_crop = False
    if data["path"].endswith(".pdf"):
        size_bytes = _format_size(os.path.getsize(os.path.join(input_path, data["path"])))
        if not compress_pdf_images or crop is not None or ppi is None:
            shutil.copyfile(os.path.join(input_path, data["path"]), os.path.join(output_path, outpath))
            print(f'Copying file \033[34m{data["path"]}\033[0m (\33[1m{size_bytes}\033[0m)')
            cache[_entry] = outpath, remove_crop
            return outpath, remove_crop
        else:
            # Compress PDF images using ghostscript
            # First get pdf image size in points
            input_size_w_pt, input_size_h_pt = _get_pdf_size(os.path.join(input_path, data["path"]))
            target_w_px = int(round(width_pt * ppi / 72))
            target_h_px = int(round(height_pt * ppi / 72))
            assert crop is None, "Crop is not supported for PDF files with compression"
            dpi = max(target_w_px * 72 / input_size_w_pt, target_h_px * 72 / input_size_h_pt)
            gs_command = ("gs -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite " \
                          f"-dPDFSETTINGS=/screen -dCompatibilityLevel=1.4 " \
                          f"-dDownsampleColorImages=true -dColorImageResolution={int(dpi)} " \
                          f"-dDownsampleGrayImages=true -dGrayImageResolution={int(dpi)} " \
                          f"-dDownsampleMonoImages=true -dMonoImageResolution={int(dpi)}").split() + [
                f'-sOutputFile={os.path.join(output_path, outpath)}',
                os.path.join(input_path, data["path"]),
            ]
            subprocess.check_call(gs_command)
            new_size_bytes = _format_size(os.path.getsize(os.path.join(output_path, outpath)))
            print(f'Compressing PDF file \033[34m{data["path"]}\033[0m using DPI \33[1m{dpi:.3f}\033[0m, compression: \33[1m{size_bytes}\033[0m -> \33[1m{new_size_bytes}\033[0m')
            cache[_entry] = outpath, remove_crop
            return outpath, remove_crop
    image = Image.open(os.path.join(input_path, data["path"]))
    input_size = image.size
    if crop is not None:
        w, h = image.size
        a, b, c, d = tuple(map(int, crop.split(",")))
        image = image.crop((a, h-d, c, h-b))
        remove_crop = True
        del w
    was_resized = False
    if ppi is not None and (not no_crop or data.get("crop") is None):
        assert width_pt is not None
        image_width = int(round(width_pt * ppi / 72))
        if image_width < image.size[0]:
            new_size = (image_width, int(round(image_width / image.size[0] * image.size[1])))
            image = image.resize(new_size)
            if crop is not None:
                print(f'Cropping and resizing image \033[34m{data["path"]}\033[0m \33[1m{input_size}\033[0m, crop \33[1m{crop}\033[0m -> \33[1m{new_size}\033[0m')
            else:
                print(f'Resizing image \033[34m{data["path"]}\033[0m \33[1m{input_size}\033[0m -> \33[1m{new_size}\033[0m')
            image.save(os.path.join(output_path, outpath))
            was_resized = True
    if not was_resized:
        if crop is not None:
            print(f'Cropping image \033[34m{data["path"]}\033[0m \33[1m{input_size}\033[0m, crop \33[1m{crop}\033[0m -> \33[1m{image.size}\033[0m')
            image.save(os.path.join(output_path, outpath))
        else:
            print(f'Copying image \033[34m{data["path"]}\033[0m \33[1m{input_size}\033[0m')
            shutil.copyfile(os.path.join(input_path, data["path"]), os.path.join(output_path, outpath))
    cache[_entry] = outpath, remove_crop
    return outpath, remove_crop
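
# Command-line driver: unpack the input (directory or zip), find the required resources by
# compiling the document, rewrite the images into the output, merge the TeX sources into a
# single patched file, copy the remaining required files, optionally pack everything into a
# zip archive, and finally recompile to verify the result and report the size reduction.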
if __name__ == "__main__":
    pname = sys.argv[0]
    parser = argparse.ArgumentParser(
        description='''This tool simplifies a LaTeX project by removing unused files and removing comments.
It also merges tex files into a single file and optionally crops and downscales images
to reduce the size of the output. It is useful when you want to prepare an ArXiv submission.''',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(f'''notes:
The script will try to find all required files by compiling the LaTeX file.
It relies on the latexmk tool, which must be installed. Both the input and
the output can be a directory or a zip file. Therefore the script can be used
on the downloaded Overleaf project zip file. And the output archive can be
directly uploaded to ArXiv.
examples:
{pname} my_paper.tex my_paper_arxiv
{pname} my_paper.tex my_paper_arxiv.zip --use-bbl
{pname} overleaf_archive.zip/my_paper.tex my_paper_arxiv.zip --ppi=300
Copyright (c) 2023 Jonas Kulhanek, MIT License'''))
    parser.add_argument('path', type=Path, help='Path to the input ".tex" file. Can be inside a zip file')
    parser.add_argument('output', type=Path, help='Path to output directory/zip-file. If it exists, it will be deleted.')
    parser.add_argument('--use-bbl', action='store_true', help='Use the .bbl file instead of the .bib file. (for arxiv)')
    parser.add_argument('--overwrite', action='store_true', help='Overwrite the output file if it exists.')
    parser.add_argument('--ppi', type=float, default=None, help='Specify pixels per inch (PPI) to automatically resize images.')
    parser.add_argument('--no-crop', action='store_true', help='Do not crop images (not recommended). No scaling is applied to images that would have been cropped.')
    parser.add_argument('--no-pdf', action='store_true', help='Skip generating PDF file')
    parser.add_argument('--no-compress-pdf', action='store_true', help='Disable image compression for included PDF files. By default ghostscript is used to compress included PDF files.')
    parser.add_argument('--verbose', action='store_true', help='Print detailed log.')
    args = parser.parse_args()
    args.path = args.path.absolute()
    args.output = args.output.absolute()
    if os.path.exists(args.output):
        if args.overwrite:
            if args.output.is_dir():
                shutil.rmtree(args.output)
            else:
                os.remove(args.output)
        else:
            print(f"\033[91;1mOutput path {args.output} already exists. Use --overwrite to delete it.\033[0m", file=sys.stderr)
            sys.exit(1)
    if args.output.suffix == ".zip":
        output_ctx_wrapper = tempfile.TemporaryDirectory()
    else:
        output_ctx_wrapper = contextlib.nullcontext(str(args.output))
    if args.path.parent.suffix == ".zip":
        input_ctx_wrapper = tempfile.TemporaryDirectory()
    else:
        input_ctx_wrapper = contextlib.nullcontext(str(args.path.parent))
    with output_ctx_wrapper as output_dir, input_ctx_wrapper as input_path:
        if args.path.parent.suffix == ".zip":
            with zipfile.ZipFile(args.path.parent, 'r') as zip_ref:
                zip_ref.extractall(input_path)
        path = Path(input_path)/args.path.name
        output = Path(output_dir)
        output.mkdir(exist_ok=True, parents=True)
        print("\033[93;1mFinding required resources...\033[0m")
        required_files, image_data, inputsize = _find_required_resources(path, verbose=args.verbose)
        # Process images
        _cache = {}
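        # imagedata.in lists, for each \includegraphics in document order, the substituted
        # file name followed by a 0/1 flag telling the replace_images patch whether to drop
        # the crop/trim options (1 when the crop was already applied to the file on disk).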
        with open(output / "imagedata.in", "w") as f:
            for data in image_data:
                img_name, remove_crop = _process_image(
                    input_path,
                    output_dir,
                    data,
                    cache=_cache,
                    ppi=args.ppi,
                    compress_pdf_images=not args.no_compress_pdf,
                    no_crop=args.no_crop)
                f.write(f"{img_name}\n")
                f.write("1\n" if remove_crop else "0\n")
        # Prepare latex file
        tex_content = _read_tex_file(path)
        tex_content = _apply_tex_patch(tex_content, replace_images)
        # Copy required files
        root = path.parent
        if args.use_bbl:
            required_files = set(x for x in required_files if not x.lower().endswith('.bib') and not x.lower().endswith('.bst'))
        else:
            required_files = set(x for x in required_files if not x.lower().endswith('.bbl'))
        for f in required_files:
            (output / f).parent.mkdir(parents=True, exist_ok=True)
            _, extension = os.path.splitext(f)
            if extension in {'.cls', '.tex'}:
                with open(output / f, 'w+') as fout:
                    fout.write(_read_tex_file(root/f, root))
            else:
                if (root / f).exists():
                    shutil.copyfile(root / f, output / f)
                else:
                    # File does not exist, but LaTeX compiled, so it is probably provided by the TeX distribution.
                    # print(f"\033[91;1mRequired file {f} not found.\033[0m", file=sys.stderr)
                    pass
        # Write tex file
        print("\033[93;1mWriting output file...\033[0m")
        with open(output / args.path.name, 'w+') as f:
            f.write(tex_content)
        if args.output.suffix == ".zip":
            with zipfile.ZipFile(args.output, 'w') as zip_file:
                for f in output.glob("**/*"):
                    zip_file.write(f, f.relative_to(output))
        print(f"\033[93;1mOutput written to \033[94;1m{args.output}\033[0m")
        if not args.no_pdf:
            print("\033[93;1mCompiling PDF...\033[0m")
            current_files = list(output.glob("*"))
            subprocess.check_call(['latexmk', '-halt-on-error', '-g', '-pdf', args.path.name],
                                  cwd=output,
                                  **({"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL} if not args.verbose else {}))  # type: ignore
            shutil.move(output / args.path.with_suffix('.pdf').name, args.output.with_suffix('.pdf'))
            for f in output.glob("*"):
                if f not in current_files:
                    os.remove(f)
            print(f"\033[93;1mPDF written to \033[94;1m{args.output.with_suffix('.pdf')}\033[0m")
            outputsize = args.output.with_suffix('.pdf').stat().st_size
            print(f"\033[93;1mSize: \033[92;1m{inputsize/1024/1024:.2f}\033[0mMB -> \033[92;1m{outputsize/1024/1024:.2f}\033[0mMB")