import io
import json

from service.memcached import client as memcached
from service.storage import utils

ONE_MEGABYTE = 1000 * 1000

# One memcached item per chunk; 1 MB keeps each chunk within memcached's default item size limit.
CHUNK_SIZE = ONE_MEGABYTE
MAX_FILE_SIZE = 50 * ONE_MEGABYTE


class FileTooBigException(Exception):
    pass


class FileAlreadyExists(Exception):
    pass


class FileDoesNotExist(Exception):
    pass


class DataCorruptionException(Exception):
    pass


def _get_metadata(filename):
    data = memcached.get(utils.key_for_file_metadata(filename))

    if not data:
        raise FileDoesNotExist(f"{filename} could not be found.")

    return json.loads(data)


def exists(filename):
    try:
        _get_metadata(filename)
        return True
    except FileDoesNotExist:
        return False


def store(filename, file):
    metadata = utils.generate_metadata(file, chunk_size=CHUNK_SIZE)

    # If we're too big, barf.
    if metadata['size'] > MAX_FILE_SIZE:
        raise FileTooBigException("The file is too big. The maximum file size is 50MB.")

    if exists(filename):
        raise FileAlreadyExists(f"A file with the filename {filename} already exists.")

    # 1. store the metadata for the file
    memcached.set(utils.key_for_file_metadata(filename), json.dumps(metadata))

    # 2. store each chunk of the file
    for idx, chunk in enumerate(utils.read_in_chunks(file, CHUNK_SIZE)):
        memcached.set(utils.key_for_file_chunk(filename, idx), chunk)

    return metadata


def retrieve(filename):
    metadata = _get_metadata(filename)
    num_chunks = metadata.get('num_chunks')

    out = io.BytesIO()

    for chunk_idx in range(num_chunks):
        data = memcached.get(utils.key_for_file_chunk(filename, chunk_idx))

        # A chunk that was evicted or never written means the file is no longer intact.
        if data is None:
            raise DataCorruptionException(f"Chunk {chunk_idx} of {filename} is missing.")

        out.write(data)

    out.seek(0)

    # If our manifest metadata differs from what we actually grabbed... die.
    stored = metadata.get('md5')
    retrieved = utils.get_md5(out)

    if stored != retrieved:
        raise DataCorruptionException(f"The retrieved data for {filename} does not match the manifest checksum.")

    return out, metadata
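The module above imports "from service.memcached import client as memcached", but that module is not part of the gist. A minimal sketch of what it might look like, assuming pymemcache as the underlying client (the module layout, host, and port are assumptions, not from the original):

# service/memcached.py -- hypothetical module; not included in the original gist.
from pymemcache.client.base import Client

# A single shared client instance; the storage code above only calls get() and set().
client = Client(("localhost", 11211))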
import hashlib
import math


def read_in_chunks(file, chunk_size):
    while True:
        data = file.read(chunk_size)

        if not data:
            break

        yield data


def get_md5(file):
    """
    Given a file object, return a md5 hash.
    """
    md5 = hashlib.md5()

    for chunk in read_in_chunks(file, 4096):
        md5.update(chunk)

    file.seek(0)

    return md5.hexdigest()


def get_size(file):
    """
    Given a file object, return the total bytes.
    """
    file.seek(0, 2)
    size = file.tell()
    file.seek(0)

    return size


def get_num_chunks(size, chunk_size):
    return math.ceil(size / chunk_size)


def generate_metadata(file, chunk_size):
    """
    Return size, md5 hash and chunk information to store alongside
    the file in memcached.
    """
    size = get_size(file)

    return {
        'size': size,
        'md5': get_md5(file),
        'num_chunks': get_num_chunks(size, chunk_size)
    }


def hashed_filename(filename):
    return hashlib.md5(filename.encode('utf-8')).hexdigest()


def key_for_file_metadata(filename):
    filename = hashed_filename(filename)
    return f"{filename}-metadata"


def key_for_file_chunk(filename, chunk_idx):
    filename = hashed_filename(filename)
    return f"{filename}-part-{chunk_idx}"