Novel compressor
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, division
import sys, time

if sys.version_info.major < 3:
    input = raw_input
def compress(data):
    # First pass: count how often each word occurs, ignoring surrounding
    # punctuation, dashes and case.
    freq = {}
    for line in data.splitlines():
        for word in line.split():
            while word and word[0] in PRE_PUNCTUATION:
                word = word[1:]
            while word and word[-1] in POST_PUNCTUATION:
                word = word[:-1]
            for word in word.split('-'):
                if word:
                    freq[word.lower()] = freq.get(word.lower(), 0) + 1
    # Sort so that the words saving the most bytes (length * frequency)
    # come first and thus get the shortest indices.
    words = sorted(freq.items(), key=lambda i: -len(i[0])*i[1])
    words = list(zip(*words))[0]
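    # E.g. (illustrative numbers): "the" seen 1000 times scores 3000, while
    # "encyclopedia" seen 100 times scores 1200, so "the" gets the shorter
    # index.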
    # Second pass: re-walk the text and emit one chunk per word, plus
    # marker bytes for punctuation, capitalization, dashes and newlines.
    chunks = []
    for line in data.splitlines():
        for word in line.split():
            mark = []
            while word and word[0] in PRE_PUNCTUATION:
                chunk = bytes([0x60 + PRE_PUNCTUATION.index(word[0])])
                chunks.append(chunk)
                word = word[1:]
            while word and word[-1] in POST_PUNCTUATION:
                chunk = bytes([0x70 + POST_PUNCTUATION.index(word[-1])])
                mark.append(chunk)
                word = word[:-1]
            split_dash = word.split('-')
            for i, word in enumerate(split_dash):
                if i:
                    # Re-join this part to the previous one with a dash.
                    chunks.append(b"\x84")
                if not word:
                    continue
                if word.lower() in words:
                    index = words.index(word.lower())
                else:
                    raise IndexError("Word '%s' not found in the dictionary." % (word.lower()))
                if word == word.capitalize():
                    chunks.append(b"\x82")
                elif word == word.upper():
                    chunks.append(b"\x83")
                if index < 0x20:
                    # Small indices fit in a single 000xxxxx byte.
                    chunks.append(bytes([index]))
                elif index < 0x2020:
                    # Two bytes: 001xxxxx plus a low byte.
                    index -= 0x20
                    chunks.append(bytes([
                        (1 << 5) | ((index >> 8) & 0x1F),
                        index & 0xFF
                    ]))
                else:
                    # Three bytes: 010xxxxx plus two low bytes.
                    index -= 0x2020
                    chunks.append(bytes([
                        (2 << 5) | ((index >> 16) & 0x1F),
                        (index >> 8) & 0xFF,
                        index & 0xFF
                    ]))
            if mark:
                chunks.extend(mark[::-1])
        chunks.append(b"\x81")
    chunks.append(b"\x80")
    return words, chunks
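
# A sketch of the chunk encoding emitted above (hand-worked values, not
# captured output):
#
#   index 0x05   -> b"\x05"          one byte:    000xxxxx
#   index 0x25   -> b"\x20\x05"      two bytes:   001xxxxx, low byte
#                                    (stores 0x25 - 0x20 = 0x0005)
#   index 0x2021 -> b"\x40\x00\x01"  three bytes: 010xxxxx, two low bytes
#                                    (stores 0x2021 - 0x2020 = 0x000001)
#
# Marker bytes: 0x60-0x6F opening punctuation, 0x70-0x7F closing
# punctuation, 0x80 end of text, 0x81 newline, 0x82 capitalize next word,
# 0x83 uppercase next word, 0x84 dash before next word.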
def binary_out(words, chunks):
    out = b"ZTXT-B\r\n"
    DICT = b"\x00".join(map(
        lambda i: bytes(i, "utf-8"), words)) + b"\x00"
    dict_size = len(DICT)
    dict_size = bytes([
        (dict_size >> 24) & 0xFF,
        (dict_size >> 16) & 0xFF,
        (dict_size >> 8) & 0xFF,
        dict_size & 0xFF
    ])
    DICT = b"DICT" + dict_size + DICT
    TEXT = b"".join(chunks)
    text_size = len(TEXT)
    text_size = bytes([
        (text_size >> 24) & 0xFF,
        (text_size >> 16) & 0xFF,
        (text_size >> 8) & 0xFF,
        text_size & 0xFF
    ])
    TEXT = b"TEXT" + text_size + TEXT
    return out + DICT + TEXT
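
# Resulting container layout (offsets for a hypothetical three-word
# dictionary b"the\x00and\x00of\x00", 11 bytes):
#
#   offset  0: b"ZTXT-B\r\n"        8-byte magic
#   offset  8: b"DICT"              chunk tag
#   offset 12: b"\x00\x00\x00\x0B"  big-endian chunk size (11)
#   offset 16: b"the\x00and\x00of\x00"
#   offset 27: b"TEXT" + 4-byte size + the encoded chunks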
def parse_chunks(data):
    chunks = []
    index = 0
    while index < len(data):
        byte = data[index]
        if byte >= 0x80:
            # Marker byte; passed through as-is.
            chunks.append(byte)
        else:
            # The top three bits say how many extra bytes follow.
            size = byte >> 5
            chunk = byte  # keep the size bits; decompress() masks them off
            if size == 1:
                chunk = (chunk << 8) | data[index+1]
                index += 1
            elif size == 2:
                chunk = (chunk << 8) | data[index+1]
                chunk = (chunk << 8) | data[index+2]
                index += 2
            chunks.append(chunk)
        index += 1
    return chunks
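
# Hand-worked example (not captured output):
#   parse_chunks(b"\x82\x05\x20\x05\x71\x81\x80")
# yields [0x82, 0x05, 0x2005, 0x71, 0x81, 0x80] -- a capitalize marker, a
# one-byte index, a two-byte index, a closing '.', a newline, and the end
# marker.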
def binary_in(data):
    # Originally I wanted to parse more than one chunk, and
    # more than two kinds of chunks. The following is
    # (maybe) only temporary.
    header, data = data[:8], data[8:]
    index = 0
    words, chunks = [], []
    while index < len(data):
        chunk_header = data[index:index+4]
        chunk_offset = index
        chunk_size = data[index+4:index+8]
        chunk_size = chunk_size[0] << 24 | \
                     chunk_size[1] << 16 | \
                     chunk_size[2] << 8 | \
                     chunk_size[3]
        chunk_data = data[chunk_offset+8:chunk_offset+8+chunk_size]
        index += 8 + chunk_size
        if chunk_header == b"DICT":
            words = list(map(lambda i: str(i, "utf-8"),
                chunk_data.split(b"\x00")))[:-1]
        elif chunk_header == b"TEXT":
            chunks = parse_chunks(chunk_data)
    return words, chunks
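
# Hand-built round trip through the container (a constructed example, not a
# real file):
#   blob = (b"ZTXT-B\r\n"
#           + b"DICT" + b"\x00\x00\x00\x03" + b"hi\x00"
#           + b"TEXT" + b"\x00\x00\x00\x02" + b"\x00\x80")
#   binary_in(blob) == (["hi"], [0x00, 0x80])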
LOWER, CAPITALIZE, UPPER = 0, 1, 2
PRE_PUNCTUATION = "([{'\"<"
POST_PUNCTUATION = ",.?!:;)]}'\">"
def decompress(words, chunks):
    out = ""
    case = LOWER
    default_delimiter = " "
    delimiter, next_delimiter = "", " "
    for chunk in chunks:
        if 0x80 <= chunk < 256:
            chunk -= 0x80
            if chunk == 0:    # End of text
                break
            elif chunk == 1:  # Newline
                out += "\n"
                next_delimiter = delimiter = ""
            elif chunk == 2:  # Capitalize the next word
                case = CAPITALIZE
                next_delimiter = delimiter
            elif chunk == 3:  # Uppercase the next word
                case = UPPER
                next_delimiter = delimiter
            elif chunk == 4:  # Join the next word with a dash
                next_delimiter = "-"
        else:
            size = chunk >> (5 +
                (8 if chunk >= 256 else 0) +
                (8 if chunk >= 65536 else 0)) & 0x3
            if size < 3:
                # Dictionary index, with the size bits masked off.
                if size == 0:
                    index = (chunk & 0x1f)
                elif size == 1:
                    index = (chunk & 0x1fff) + 0x20
                elif size == 2:
                    index = (chunk & 0x1fffff) + 0x2020
                word = words[index]
                if case == CAPITALIZE:
                    word = word.capitalize()
                elif case == UPPER:
                    word = word.upper()
                out += delimiter
                out += word
                case = LOWER
            else:
                # Punctuation byte: 0x6X opens, 0x7X closes.
                index = chunk & 0x1f
                if index >= 16:
                    index -= 16
                    out += POST_PUNCTUATION[index]
                else:
                    out += delimiter
                    next_delimiter = PRE_PUNCTUATION[index]
        delimiter = next_delimiter
        next_delimiter = default_delimiter
    return out
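
# Round-trip sketch (hand-traced, not captured output):
#   words, chunks = compress('He said: "well-known words."\n')
#   decompress(words, chunks) == 'He said: "well-known words."\n'
# The capitalization of "He" survives through the \x82 marker, the quotes
# through the punctuation bytes, and the dash through \x84.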
def do_comprime(file_in, file_out=None):
    start = time.time()
    with open(file_in, "r") as f:
        data = f.read()
    size = len(data)
    words, chunks = compress(data)
    data = binary_out(words, chunks)
    compressed_size = len(data)
    if file_out is None:
        file_out = file_in + ".ztxt"
    with open(file_out, "wb") as f:
        f.write(data)
    elapsed = time.time() - start
    print("Compressed file '%s' [%.2f s; %.2f kiB] -> '%s' [%.2f kiB] (compression %.2f%%)"
        % (file_in, elapsed, size/1024, file_out, compressed_size/1024,
           100*compressed_size/size))

def do_decomprime(file_in, file_out=None):
    start = time.time()
    with open(file_in, "rb") as f:
        data = f.read()
    compressed_size = len(data)
    words, chunks = binary_in(data)
    data = decompress(words, chunks)
    size = len(data)
    if file_out is None:
        file_out = file_in + ".txt"
    with open(file_out, "w") as f:
        f.write(data)
    elapsed = time.time() - start
    print("Decompressed file '%s' [%.2f s; %.2f kiB] -> '%s' [%.2f kiB] (compression %.2f%%)"
        % (file_in, elapsed, compressed_size/1024, file_out, size/1024,
           100*compressed_size/size))
if __name__ == "__main__":
    cmd = ""
    if len(sys.argv) >= 3:
        cmd = sys.argv[1]
        file_in = sys.argv[2]
        file_out = sys.argv[3] if len(sys.argv) > 3 else None
    if cmd == "c":
        do_comprime(file_in, file_out)
    elif cmd == "d":
        do_decomprime(file_in, file_out)
    else:
        print("""Usage
    novel_zip {d|c} FILE_IN [FILE_OUT]
        d    Decompress the file.
        c    Compress the file.
""")