Last active
August 27, 2019 00:49
-
-
Save ivankra/3a1d3b57a6a1759131e58020e1845699 to your computer and use it in GitHub Desktop.
Minimal tfrecords reader/writer in pure python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Minimal tfrecords reader/writer in pure python. | |
# Use either a fast compiled crc32c implementation from `pip install crc32c`: | |
#from crc32c import crc32 as crc32c | |
# Or alternatively a slower pure python crc32c implementation: {{{ | |
CRC32C_TABLE = ( | |
0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, | |
0x26A1E7E8, 0xD4CA64EB, 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, | |
0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, 0x105EC76F, 0xE235446C, | |
0xF165B798, 0x030E349B, 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, | |
0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 0x5D1D08BF, 0xAF768BBC, | |
0xBC267848, 0x4E4DFB4B, 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, | |
0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, 0xAA64D611, 0x580F5512, | |
0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, | |
0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, | |
0x1642AE59, 0xE4292D5A, 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, | |
0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, 0x417B1DBC, 0xB3109EBF, | |
0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, | |
0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, 0xFE53516F, | |
0xED03A29B, 0x1F682198, 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, | |
0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, 0xDBFC821C, 0x2997011F, | |
0x3AC7F2EB, 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, | |
0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, | |
0x4767748A, 0xB50CF789, 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, | |
0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, 0x7198540D, 0x83F3D70E, | |
0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, | |
0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, | |
0xDDE0EB2A, 0x2F8B6829, 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, | |
0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, 0x082F63B7, 0xFA44E0B4, | |
0xE9141340, 0x1B7F9043, 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, | |
0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 0x55326B08, 0xA759E80B, | |
0xB4091BFF, 0x466298FC, 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, | |
0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, 0xA24BB5A6, 0x502036A5, | |
0x4370C551, 0xB11B4652, 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, | |
0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, | |
0x0E330A81, 0xFC588982, 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, | |
0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, 0x38CC2A06, 0xCAA7A905, | |
0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, | |
0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 0x0417B1DB, 0xF67C32D8, | |
0xE52CC12C, 0x1747422F, 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, | |
0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, 0xD3D3E1AB, 0x21B862A8, | |
0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, | |
0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, | |
0x7FAB5E8C, 0x8DC0DD8F, 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, | |
0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, 0x69E9F0D5, 0x9B8273D6, | |
0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, | |
0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, | |
0xD5CF889D, 0x27A40B9E, 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, | |
0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351, | |
) | |
def crc32c(buf): | |
crc = 0xFFFFFFFF | |
for b in buf: | |
crc = (CRC32C_TABLE[(crc ^ b) & 0xFF] ^ (crc >> 8)) & 0xFFFFFFFF | |
return crc ^ 0xFFFFFFFF | |
# }}} | |
def masked_crc32c(buf): | |
crc = crc32c(buf) | |
crc = (((crc >> 15) | (crc << 17)) + 0xA282EAD8) & 0xFFFFFFFF | |
return crc.to_bytes(4, 'little') | |
class TFRecordWriter: | |
"""A class to write records to a TFRecords file.""" | |
def __init__(self, path): | |
self.fp = open(path, 'wb') | |
def write(self, record: bytes): | |
"""Write record to the file.""" | |
# Format of a single record: | |
# uint64 length | |
# uint32 masked crc of length | |
# byte data[length] | |
# uint32 masked crc of data | |
header = len(record).to_bytes(8, 'little') | |
header += masked_crc32c(header) | |
self.fp.write(header) | |
self.fp.write(record) | |
self.fp.write(masked_crc32c(record)) | |
def close(self): | |
self.fp.close() | |
def __enter__(self): | |
return self | |
def __exit__(self, unused_type, unused_value, unused_traceback): | |
self.close() | |
def tf_record_iterator(path): | |
"""An iterator that read the records from a TFRecords file.""" | |
with open(path, 'rb') as fp: | |
while True: | |
offs = fp.tell() | |
header = fp.read(12) | |
if len(header) == 0: | |
break | |
if len(header) != 12 or masked_crc32c(header[:8]) != header[8:]: | |
raise IOError('Corrupted record header at %d' % offs) | |
record_len = int.from_bytes(header[:8], 'little') | |
record = fp.read(record_len) | |
footer = fp.read(4) | |
if len(record) != record_len or len(footer) != 4: | |
raise IOError('Truncated record at %d' % offs) | |
if masked_crc32c(record) != footer: | |
raise IOError('Corrupted record at %d' % offs) | |
yield record | |
def test_tfrecord(): | |
import tempfile | |
path = tempfile.mktemp() | |
with TFRecordWriter(path) as writer: | |
writer.write(b'first') | |
writer.write(b'second') | |
data = open(path, 'rb').read() | |
assert data == ( | |
b'\x05\x00\x00\x00\x00\x00\x00\x00\xea\xb2\x04>firstU\xff#\xe5' | |
b'\x06\x00\x00\x00\x00\x00\x00\x00si\xd57second\xd3\xe0\xd3\xca' | |
) | |
assert list(tf_record_iterator(path)) == [b'first', b'second'] | |
print('OK') | |
if __name__ == '__main__': | |
test_tfrecord() | |
# Copyright 2015 The TensorFlow Authors. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment