Novel compressor
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, division
import sys, time

if sys.version_info.major < 3:
    input = raw_input
def compress(data):
    # First pass: count how often each word occurs, ignoring surrounding
    # punctuation, dashes and case.
    freq = {}
    for line in data.splitlines():
        for word in line.split():
            while word and word[0] in PRE_PUNCTUATION:
                word = word[1:]
            while word and word[-1] in POST_PUNCTUATION:
                word = word[:-1]
            for word in word.split('-'):
                if word:
                    freq[word.lower()] = freq.get(word.lower(), 0) + 1
    # Sort so that the words saving the most bytes (length * frequency)
    # come first and thus get the shortest indices.
    words = sorted(freq.items(), key=lambda i: -len(i[0])*i[1])
    words = list(zip(*words))[0]
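    # E.g. (illustrative numbers): "the" seen 1000 times scores 3000, while
    # "encyclopedia" seen 100 times scores 1200, so "the" gets the shorter
    # index.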
    # Second pass: re-walk the text and emit one chunk per word, plus
    # marker bytes for punctuation, capitalization, dashes and newlines.
    chunks = []
    for line in data.splitlines():
        for word in line.split():
            mark = []
            while word and word[0] in PRE_PUNCTUATION:
                chunk = bytes([0x60 + PRE_PUNCTUATION.index(word[0])])
                chunks.append(chunk)
                word = word[1:]
            while word and word[-1] in POST_PUNCTUATION:
                chunk = bytes([0x70 + POST_PUNCTUATION.index(word[-1])])
                mark.append(chunk)
                word = word[:-1]
            split_dash = word.split('-')
            for i, word in enumerate(split_dash):
                if i:
                    # Re-join this part to the previous one with a dash.
                    chunks.append(b"\x84")
                if not word:
                    continue
                if word.lower() in words:
                    index = words.index(word.lower())
                else:
                    raise IndexError("Word '%s' not found in the dictionary." % (word.lower()))
                if word == word.capitalize():
                    chunks.append(b"\x82")
                elif word == word.upper():
                    chunks.append(b"\x83")
                if index < 0x20:
                    # Small indices fit in a single 000xxxxx byte.
                    chunks.append(bytes([index]))
                elif index < 0x2020:
                    # Two bytes: 001xxxxx plus a low byte.
                    index -= 0x20
                    chunks.append(bytes([
                        (1 << 5) | ((index >> 8) & 0x1F),
                        index & 0xFF
                    ]))
                else:
                    # Three bytes: 010xxxxx plus two low bytes.
                    index -= 0x2020
                    chunks.append(bytes([
                        (2 << 5) | ((index >> 16) & 0x1F),
                        (index >> 8) & 0xFF,
                        index & 0xFF
                    ]))
            if mark:
                chunks.extend(mark[::-1])
        chunks.append(b"\x81")
    chunks.append(b"\x80")
    return words, chunks
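
# A sketch of the chunk encoding emitted above (hand-worked values, not
# captured output):
#
#   index 0x05   -> b"\x05"          one byte:    000xxxxx
#   index 0x25   -> b"\x20\x05"      two bytes:   001xxxxx, low byte
#                                    (stores 0x25 - 0x20 = 0x0005)
#   index 0x2021 -> b"\x40\x00\x01"  three bytes: 010xxxxx, two low bytes
#                                    (stores 0x2021 - 0x2020 = 0x000001)
#
# Marker bytes: 0x60-0x6F opening punctuation, 0x70-0x7F closing
# punctuation, 0x80 end of text, 0x81 newline, 0x82 capitalize next word,
# 0x83 uppercase next word, 0x84 dash before next word.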
def binary_out(words, chunks):
    out = b"ZTXT-B\r\n"
    DICT = b"\x00".join(map(
        lambda i: bytes(i, "utf-8"), words)) + b"\x00"
    dict_size = len(DICT)
    dict_size = bytes([
        (dict_size >> 24) & 0xFF,
        (dict_size >> 16) & 0xFF,
        (dict_size >> 8) & 0xFF,
        dict_size & 0xFF
    ])
    DICT = b"DICT" + dict_size + DICT
    TEXT = b"".join(chunks)
    text_size = len(TEXT)
    text_size = bytes([
        (text_size >> 24) & 0xFF,
        (text_size >> 16) & 0xFF,
        (text_size >> 8) & 0xFF,
        text_size & 0xFF
    ])
    TEXT = b"TEXT" + text_size + TEXT
    return out + DICT + TEXT
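
# Resulting container layout (offsets for a hypothetical three-word
# dictionary b"the\x00and\x00of\x00", 11 bytes):
#
#   offset  0: b"ZTXT-B\r\n"        8-byte magic
#   offset  8: b"DICT"              chunk tag
#   offset 12: b"\x00\x00\x00\x0B"  big-endian chunk size (11)
#   offset 16: b"the\x00and\x00of\x00"
#   offset 27: b"TEXT" + 4-byte size + the encoded chunks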
def parse_chunks(data):
    chunks = []
    index = 0
    while index < len(data):
        byte = data[index]
        if byte >= 0x80:
            # Marker byte; passed through as-is.
            chunks.append(byte)
        else:
            # The top three bits say how many extra bytes follow.
            size = byte >> 5
            chunk = byte  # keep the size bits; decompress() masks them off
            if size == 1:
                chunk = (chunk << 8) | data[index+1]
                index += 1
            elif size == 2:
                chunk = (chunk << 8) | data[index+1]
                chunk = (chunk << 8) | data[index+2]
                index += 2
            chunks.append(chunk)
        index += 1
    return chunks
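
# Hand-worked example (not captured output):
#   parse_chunks(b"\x82\x05\x20\x05\x71\x81\x80")
# yields [0x82, 0x05, 0x2005, 0x71, 0x81, 0x80] -- a capitalize marker, a
# one-byte index, a two-byte index, a closing '.', a newline, and the end
# marker.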
def binary_in(data):
    # Originally I wanted to parse more than one chunk, and
    # more than two kinds of chunks. The following is
    # (maybe) only temporary.
    header, data = data[:8], data[8:]
    index = 0
    words, chunks = [], []
    while index < len(data):
        chunk_header = data[index:index+4]
        chunk_offset = index
        chunk_size = data[index+4:index+8]
        chunk_size = chunk_size[0] << 24 | \
                     chunk_size[1] << 16 | \
                     chunk_size[2] << 8 | \
                     chunk_size[3]
        chunk_data = data[chunk_offset+8:chunk_offset+8+chunk_size]
        index += 8 + chunk_size
        if chunk_header == b"DICT":
            words = list(map(lambda i: str(i, "utf-8"),
                chunk_data.split(b"\x00")))[:-1]
        elif chunk_header == b"TEXT":
            chunks = parse_chunks(chunk_data)
    return words, chunks
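
# Hand-built round trip through the container (a constructed example, not a
# real file):
#   blob = (b"ZTXT-B\r\n"
#           + b"DICT" + b"\x00\x00\x00\x03" + b"hi\x00"
#           + b"TEXT" + b"\x00\x00\x00\x02" + b"\x00\x80")
#   binary_in(blob) == (["hi"], [0x00, 0x80])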
LOWER, CAPITALIZE, UPPER = 0, 1, 2
PRE_PUNCTUATION = "([{'\"<"
POST_PUNCTUATION = ",.?!:;)]}'\">"
def decompress(words, chunks):
    out = ""
    case = LOWER
    default_delimiter = " "
    delimiter, next_delimiter = "", " "
    for chunk in chunks:
        if 0x80 <= chunk < 256:
            chunk -= 0x80
            if chunk == 0:    # End of text
                break
            elif chunk == 1:  # Newline
                out += "\n"
                next_delimiter = delimiter = ""
            elif chunk == 2:  # Capitalize the next word
                case = CAPITALIZE
                next_delimiter = delimiter
            elif chunk == 3:  # Uppercase the next word
                case = UPPER
                next_delimiter = delimiter
            elif chunk == 4:  # Join the next word with a dash
                next_delimiter = "-"
        else:
            size = chunk >> (5 +
                (8 if chunk >= 256 else 0) +
                (8 if chunk >= 65536 else 0)) & 0x3
            if size < 3:
                # Dictionary index, with the size bits masked off.
                if size == 0:
                    index = (chunk & 0x1f)
                elif size == 1:
                    index = (chunk & 0x1fff) + 0x20
                elif size == 2:
                    index = (chunk & 0x1fffff) + 0x2020
                word = words[index]
                if case == CAPITALIZE:
                    word = word.capitalize()
                elif case == UPPER:
                    word = word.upper()
                out += delimiter
                out += word
                case = LOWER
            else:
                # Punctuation byte: 0x6X opens, 0x7X closes.
                index = chunk & 0x1f
                if index >= 16:
                    index -= 16
                    out += POST_PUNCTUATION[index]
                else:
                    out += delimiter
                    next_delimiter = PRE_PUNCTUATION[index]
        delimiter = next_delimiter
        next_delimiter = default_delimiter
    return out
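
# Round-trip sketch (hand-traced, not captured output):
#   words, chunks = compress('He said: "well-known words."\n')
#   decompress(words, chunks) == 'He said: "well-known words."\n'
# The capitalization of "He" survives through the \x82 marker, the quotes
# through the punctuation bytes, and the dash through \x84.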
def do_comprime(file_in, file_out=None):
    start = time.time()
    with open(file_in, "r") as f:
        data = f.read()
    size = len(data)
    words, chunks = compress(data)
    data = binary_out(words, chunks)
    compressed_size = len(data)
    if file_out is None:
        file_out = file_in + ".ztxt"
    with open(file_out, "wb") as f:
        f.write(data)
    elapsed = time.time() - start
    print("Compressed file '%s' [%.2f s; %.2f kiB] -> '%s' [%.2f kiB] (compression %.2f%%)"
        % (file_in, elapsed, size/1024, file_out, compressed_size/1024,
           100*compressed_size/size))

def do_decomprime(file_in, file_out=None):
    start = time.time()
    with open(file_in, "rb") as f:
        data = f.read()
    compressed_size = len(data)
    words, chunks = binary_in(data)
    data = decompress(words, chunks)
    size = len(data)
    if file_out is None:
        file_out = file_in + ".txt"
    with open(file_out, "w") as f:
        f.write(data)
    elapsed = time.time() - start
    print("Decompressed file '%s' [%.2f s; %.2f kiB] -> '%s' [%.2f kiB] (compression %.2f%%)"
        % (file_in, elapsed, compressed_size/1024, file_out, size/1024,
           100*compressed_size/size))
if __name__ == "__main__":
    cmd = ""
    if len(sys.argv) >= 3:
        cmd = sys.argv[1]
        file_in = sys.argv[2]
        file_out = sys.argv[3] if len(sys.argv) > 3 else None
    if cmd == "c":
        do_comprime(file_in, file_out)
    elif cmd == "d":
        do_decomprime(file_in, file_out)
    else:
        print("""Usage
    novel_zip {d|c} FILE_IN [FILE_OUT]
        d    Decompress the file.
        c    Compress the file.
""")