Created
November 4, 2021 11:30
-
-
Save abbbi/b4b07efd133cdc5f86c0da01a030e76a to your computer and use it in GitHub Desktop.
streaming_tar.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
#
# Building a tar file chunk-by-chunk.
#
# This is a quick bit of sample code for streaming data to a tar file,
# building it piece by piece. The tar file is built on the fly and streamed
# back out. This is useful for web applications that need to dynamically
# build a tar file without swamping the server.
import os | |
import sys | |
import tarfile | |
from io import BytesIO | |
class FileStream:
    """Write-only in-memory sink whose pending bytes can be drained.

    Data handed to write() accumulates in an internal BytesIO until pop()
    returns it and starts a fresh buffer. ``offset`` counts every byte ever
    written and is not reset by pop(), so tell() reports the position in
    the overall stream rather than in the current buffer.
    """

    def __init__(self):
        self.buffer = BytesIO()
        self.offset = 0

    def write(self, s):
        """Append the bytes-like object *s*; return the bytes written."""
        # Use the count reported by BytesIO rather than len(s): for buffers
        # with multi-byte items (e.g. array('H')) len() counts items, not
        # bytes, which would leave ``offset`` too small.
        written = self.buffer.write(s)
        self.offset += written
        return written

    def tell(self):
        """Return the total number of bytes written so far."""
        return self.offset

    def close(self):
        """Close the current internal buffer."""
        self.buffer.close()

    def pop(self):
        """Return the bytes written since the last pop() and reset the buffer."""
        s = self.buffer.getvalue()
        self.buffer.close()
        self.buffer = BytesIO()
        return s
def stream_build_tar(in_filename, streaming_fp, out_filename=None,
                     block_size=None):
    """Incrementally write a gzip-compressed tar of one file to a sink.

    This is a generator. Each ``yield`` (the yielded value is always None)
    marks a point where more archive data may have been written to
    *streaming_fp*, so the caller can drain the sink between steps instead
    of holding the whole archive in memory.

    Args:
        in_filename: Path of the file to archive; also used as the member
            name inside the archive.
        streaming_fp: Object with a ``write(bytes)`` method that receives
            the compressed archive stream.
        out_filename: Archive name passed to tarfile (it ends up in the
            gzip header). When omitted, a module-level ``out_filename`` is
            used if one exists, otherwise no name is recorded.
        block_size: Number of bytes read from the input per step. Defaults
            to the module-level ``BLOCK_SIZE``.

    Raises:
        ValueError: If *block_size* is not positive.
        OSError: If the input file ends before the size recorded in the
            member header has been read.
    """
    if out_filename is None:
        # Callers using the two-argument form relied on a module-level
        # ``out_filename``; keep honouring it when it is defined.
        out_filename = globals().get('out_filename')
    if block_size is None:
        block_size = BLOCK_SIZE
    if block_size <= 0:
        raise ValueError('block_size must be a positive integer')

    tar = tarfile.TarFile.open(out_filename, 'w|gz', streaming_fp)

    stat = os.stat(in_filename)
    tar_info = tarfile.TarInfo(in_filename)
    # Note that you can get this information from a storage backend
    # instead, but such a backend may raise NotImplementedError for it,
    # so check. Things like the mode or ownership won't be available;
    # only mtime and size are filled in here.
    tar_info.mtime = stat.st_mtime
    tar_info.size = stat.st_size

    # Write the member header ourselves. TarFile.addfile() without a
    # fileobj is rejected for non-empty regular files on Python 3.13+,
    # and the file data is streamed by hand below anyway.
    header = tar_info.tobuf(tar.format, tar.encoding, tar.errors)
    tar.fileobj.write(header)
    tar.offset += len(header)
    yield

    # Copy exactly the number of bytes announced in the header, so the
    # archive stays consistent even if the file changes size meanwhile.
    remaining = tar_info.size
    with open(in_filename, 'rb') as in_fp:
        while remaining > 0:
            chunk = in_fp.read(min(block_size, remaining))
            if not chunk:
                raise OSError('unexpected end of data in %s' % in_filename)
            tar.fileobj.write(chunk)
            remaining -= len(chunk)
            yield

    # Member data is padded with NULs up to a multiple of the tar block
    # size; tar.offset must account for the padded length so that close()
    # computes the final record padding correctly.
    blocks, remainder = divmod(tar_info.size, tarfile.BLOCKSIZE)
    if remainder > 0:
        tar.fileobj.write(tarfile.NUL * (tarfile.BLOCKSIZE - remainder))
        blocks += 1
        yield
    tar.offset += blocks * tarfile.BLOCKSIZE

    # close() writes the end-of-archive blocks and the gzip trailer; the
    # final yield lets the caller drain them.
    tar.close()
    yield
# Number of bytes read from the input file per streaming step.
BLOCK_SIZE = 4096

# Command-line driver: archive one file into a gzip-compressed tar,
# draining the in-memory stream to disk after every generator step.
# Guarded so that importing this module does not parse sys.argv or exit.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('Usage: %s in_filename out_filename' % sys.argv[0],
              file=sys.stderr)
        sys.exit(1)

    # These stay module-level names (not locals of a main() function)
    # because stream_build_tar looks up ``out_filename`` globally.
    in_filename = sys.argv[1]
    out_filename = sys.argv[2]

    streaming_fp = FileStream()
    with open(out_filename, 'wb') as out_fp:
        for _ in stream_build_tar(in_filename, streaming_fp):
            s = streaming_fp.pop()
            # Many steps produce no output yet because the compressed
            # stream is buffered before reaching the sink.
            if s:
                print('Writing %d bytes...' % len(s))
                out_fp.write(s)
                out_fp.flush()
    print('Wrote tar file to %s' % out_filename)
Whenever you tar a file with this script, it takes around 30x as long as the normal tar command, and the checksums of the archives produced by the normal tar command and by this Python script are different.
That might well be; this gist only exists because I wanted to be able to use the script from the original gist with Python 3. There are no logical changes to the script otherwise.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Whenever you tar a file with this script, it takes around 30x as long as the normal tar command, and the checksums of the archives produced by the normal tar command and by this Python script are different.