Last active
April 27, 2023 00:24
-
-
Save nijave/0759b2e2cd2ae63a59bf53121d314295 to your computer and use it in GitHub Desktop.
Generates a qdirstat cache file from a borgbackup archive
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Generates a QDirStat cache file from a borgbackup archive.

See the section about "Reading and writing cache files" for how this works
with QDirStat: https://github.com/shundhammer/qdirstat/blob/master/README.md?plain=1#L848

`borg list` must be able to run without prompting for credentials. See the borg docs for
configuring environment variables: https://borgbackup.readthedocs.io/en/stable/quickstart.html#automating-backups
""" | |
import dataclasses | |
import datetime | |
import gzip | |
import io | |
import logging | |
import subprocess | |
import sys | |
import time | |
# can be swapped out with native json (and references below fixed) | |
# but it will likely be a lot slower | |
import orjson | |
import typing | |
# percentcoding can be swapped out with urllib.parse.quote to make | |
# it easier to run but it's significantly slower when processing lots of data | |
# The version on pypi segfaults on amd64 | |
# git+https://github.com/nijave/python-percentcoding.git | |
import percentcoding | |
# Configure root logging at INFO level so the progress messages below are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Default file name for the gzip-compressed QDirStat cache written by this script.
OUTPUT_FILENAME = "qdirstat.cache.gz"
@dataclasses.dataclass
class QDirStatCacheItem:
    """A single entry of a QDirStat cache file.

    ``str(item)`` renders the entry as one tab-separated cache line made of
    the type, the percent-encoded path, the size and the mtime (without a
    trailing newline).
    """

    type: str  # F=File, D=Directory, L=Link
    path: str  # Raw (unencoded) path; each component is percent-encoded in __str__
    size: int  # Bytes
    mtime: typing.Optional[datetime.datetime]  # None is rendered as 0
    blocks: typing.Optional[int] = None  # Not used when rendering the line
    links: typing.Optional[int] = None  # Not used when rendering the line

    def __str__(self) -> str:
        """Return the cache-file line for this item.

        Raises:
            ValueError: If the rendered line is longer than 1024 characters.
        """
        if self.mtime is None:
            mtime = 0
        else:
            # time.mktime returns a float; emit whole seconds so the field is
            # a plain integer rather than e.g. "1682555040.0".
            mtime = int(time.mktime(self.mtime.timetuple()))

        # Encode each component separately so the "/" separators stay literal.
        encoded_path = "/".join(
            percentcoding.quote(part) for part in self.path.split("/")
        )
        line = f"{self.type}\t{encoded_path}\t{self.size}\t{mtime}"
        if len(line) > 1024:
            raise ValueError("generated item is longer than 1024 bytes")
        return line
def write_qdirstat_cache_header(writer: io.TextIOBase) -> None:
    """Write the fixed cache-file preamble and the root directory entry.

    Args:
        writer: Text stream that receives the header in a single write.
    """
    header_lines = (
        "[qdirstat 1.0 cache file]",
        "# Generated by qdirstat-generate (borg-qdirstat.py)",
        "# Do not edit!",
        "#",
        "# Type path size mtime <optional fields>",
        # Root directory entry with size 0 and mtime 0.
        "D / 0 0",
    )
    writer.write("\n".join(header_lines) + "\n")
def borg_list_archives(repository: str) -> typing.List[str]:
    """Return the names of the archives in *repository*.

    Runs ``borg list --json`` on the repository and extracts the ``name`` of
    every entry under the ``archives`` key, keeping the order of that list.

    Args:
        repository: Repository location passed to ``borg list``.

    Returns:
        The archive names, in the order they appear in borg's JSON output.

    Raises:
        subprocess.CalledProcessError: If ``borg list`` exits with a non-zero status.
    """
    logger.info("Getting list of borg archives in repository")
    command = ["borg", "list", "--json", repository]
    raw_listing = subprocess.check_output(command, text=True)
    listing = orjson.loads(raw_listing)
    names = [entry["name"] for entry in listing["archives"]]
    logger.info("Found %d archives", len(names))
    return names
def write_borg_output(
    file_stream: io.TextIOWrapper, output_filename: str = OUTPUT_FILENAME
) -> None:
    """Convert ``borg list --json-lines`` output into a gzipped QDirStat cache file.

    Args:
        file_stream: Text stream yielding one JSON object per line. Each object
            is read for its ``type``, ``path``, ``size`` and ``mtime`` keys.
        output_filename: Path of the gzip-compressed cache file to create.

    Raises:
        KeyError: If a JSON object lacks one of the keys listed above.
        ValueError: If a line is not valid JSON, an ``mtime`` does not match
            the expected timestamp format, or a rendered cache line is too long.
    """
    mtime_format = "%Y-%m-%dT%H:%M:%S.%f"
    items_written = 0

    with io.open(output_filename, mode="wb", buffering=1024 * 1024 * 64) as raw_output:
        with gzip.GzipFile(
            fileobj=raw_output,
            mode="w",
            compresslevel=1,
        ) as compressed_binary:
            # The text wrapper buffers encoded data internally. Managing it with
            # ``with`` guarantees it is flushed into the gzip stream before that
            # stream is closed; otherwise the tail of the output is lost.
            with io.TextIOWrapper(compressed_binary, encoding="utf-8") as compressed:
                write_qdirstat_cache_header(compressed)
                for line in file_stream:
                    entry = orjson.loads(line)
                    item = QDirStatCacheItem(
                        # Everything that is not a directory is recorded as a file.
                        type="D" if entry["type"] == "d" else "F",
                        # The entry path is made absolute by prefixing a slash.
                        path=f'/{entry["path"]}',
                        size=entry["size"],
                        mtime=datetime.datetime.strptime(entry["mtime"], mtime_format),
                    )
                    compressed.write(f"{item}\n")
                    items_written += 1
                    if items_written % 50000 == 0:
                        logger.info("Wrote %d items", items_written)

    logger.info("Wrote %d items total", items_written)
if __name__ == "__main__":
    # Exactly one argument is expected: the borg repository to read from.
    if len(sys.argv) != 2:
        # Exit with a usage message instead of a ValueError traceback.
        sys.exit(f"Usage: {sys.argv[0]} repo-name")

    repo = sys.argv[1]
    archives = borg_list_archives(repo)
    if not archives:
        # Without this guard, archives[-1] fails with an opaque IndexError.
        sys.exit(f"No archives found in repository {repo}")

    # Use the last archive in borg's listing (presumably the most recent one;
    # verify against borg's default sort order).
    archive = archives[-1]
    logger.info("Listing files for archive %s", archive)

    borg_command = ["borg", "list", "--json-lines", f"{repo}::{archive}"]
    # The context manager closes the pipe and waits for borg to exit, so the
    # child is reaped and its exit status can be checked afterwards.
    with subprocess.Popen(
        borg_command,
        text=True,
        stdout=subprocess.PIPE,
    ) as borg_list_files:
        write_borg_output(borg_list_files.stdout)

    if borg_list_files.returncode != 0:
        # A failed listing means the cache file is incomplete; fail loudly.
        raise subprocess.CalledProcessError(borg_list_files.returncode, borg_command)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment