Created
November 19, 2023 14:14
-
-
Save matteobertozzi/ab9fc242b75d79593bed56469e85bcf2 to your computer and use it in GitHub Desktop.
Encode an Array of numbers using a bitmap + int-pack4 (good for arrays with mostly zeros)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Encode an Array of numbers using a bitmap + int-pack4 | |
import java.io.ByteArrayOutputStream; | |
import java.io.IOException; | |
import io.github.matteobertozzi.rednaco.bytes.encoding.IntDecoder; | |
import io.github.matteobertozzi.rednaco.bytes.encoding.IntEncoder; | |
import io.github.matteobertozzi.rednaco.bytes.encoding.IntUtil; | |
public final class MostlyZeroEncoder { | |
private MostlyZeroEncoder() { | |
// no-op | |
} | |
public static byte[] encode(final int[] data) throws Exception { | |
final int bitmapLength = (data.length + 7) >> 3; | |
try (ByteArrayOutputStream stream = new ByteArrayOutputStream(bitmapLength + data.length)) { | |
IntEncoder.LITTLE_ENDIAN.writeFixed32(stream, data.length); | |
writeBitmap(stream, data); | |
writeIntPack4(stream, data); | |
return stream.toByteArray(); | |
} | |
} | |
private static void writeBitmap(final ByteArrayOutputStream stream, final int[] data) { | |
for (int i = 0; i < data.length;) { | |
int bitmapByte = 0; | |
for (int bitmapBitIndex = 7; i < data.length && bitmapBitIndex >= 0; ++i, --bitmapBitIndex) { | |
if (data[i] != 0) { | |
bitmapByte |= (byte)(1 << bitmapBitIndex); | |
} | |
} | |
stream.write(bitmapByte & 0xff); | |
} | |
} | |
private static void writeIntPack4(final ByteArrayOutputStream stream, final int[] data) throws Exception { | |
// 6 4 2 0 | |
// | 11 | 11 | 11 | 11 | | |
int head = 0; | |
int blockCount = 0; | |
final int[] block = new int[4]; | |
for (int i = 0; i < data.length; ++i) { | |
if (data[i] == 0) continue; | |
final int size = IntUtil.size(data[i]); | |
head |= (size - 1) << (6 - (blockCount << 1)); | |
block[blockCount++] = data[i]; | |
if (blockCount == 4) { | |
writeIntPack4(stream, head, blockCount, block); | |
blockCount = 0; | |
head = 0; | |
} | |
} | |
if (blockCount != 0) { | |
writeIntPack4(stream, head, blockCount, block); | |
} | |
} | |
private static void writeIntPack4(final ByteArrayOutputStream stream, final int head, final int blockCount, final int[] block) throws Exception { | |
stream.write(head); | |
for (int k = 0; k < blockCount; ++k) { | |
final int bytesWidth = 1 + ((head >> (6 - (k << 1))) & 3); | |
IntEncoder.LITTLE_ENDIAN.writeFixed(stream, block[k], bytesWidth); | |
} | |
} | |
public static int[] decode(final byte[] enc) throws IOException { | |
if (enc == null) return null; | |
final int length = IntDecoder.LITTLE_ENDIAN.readFixed32(enc, 0); | |
if (length == 0) return new int[0]; | |
final int bitmapLength = (length + 7) >> 3; | |
int nonZero = 0; | |
for (int i = 0; i < bitmapLength; ++i) { | |
nonZero += Integer.bitCount(enc[4 + i] & 0xff); | |
} | |
int valueIndex = 0; | |
final int[] values = new int[nonZero]; | |
int valuesOffset = 4 + bitmapLength; | |
while (valuesOffset < enc.length) { | |
final int head = enc[valuesOffset++] & 0xff; | |
for (int k = 0; k < 4 && valueIndex < nonZero; ++k) { | |
final int bytesWidth = 1 + ((head >> (6 - (k << 1))) & 3); | |
values[valueIndex++] = (int) IntDecoder.LITTLE_ENDIAN.readFixed(enc, valuesOffset, bytesWidth); | |
valuesOffset += bytesWidth; | |
} | |
} | |
if (valueIndex != values.length) { | |
throw new IllegalStateException(); | |
} | |
if (valuesOffset != enc.length) { | |
throw new IllegalStateException(); | |
} | |
valueIndex = 0; | |
final int[] data = new int[length]; | |
for (int i = 0, index = 0; i < bitmapLength; ++i) { | |
final int bitmapByte = enc[4 + i] & 0xff; | |
for (int k = 7; k >= 0 && index < length; --k) { | |
if ((bitmapByte & (1 << k)) != 0) { | |
data[index++] = values[valueIndex++]; | |
} else { | |
data[index++] = 0; | |
} | |
} | |
} | |
return data; | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Encode an Array of numbers using a bitmap + int-pack4 | |
import io | |
def int_bytes_width(v: int) -> int: | |
return (v.bit_length() + 7) // 8 if v != 0 else 1 | |
def _write_bitmap(stream: io.BytesIO, data: list[int]): | |
length = len(data) | |
index = 0 | |
while index < length: | |
bitmap_byte = 0 | |
bitmap_bit_index = 7 | |
while index < length and bitmap_bit_index >= 0: | |
if data[index] != 0: | |
bitmap_byte |= 1 << bitmap_bit_index | |
bitmap_bit_index -= 1 | |
index += 1 | |
stream.write(bytes([bitmap_byte])) | |
assert index == len(data) | |
def _write_int_pack4_block(stream: io.BytesIO, head: int, block: list[int]): | |
stream.write(bytes([head])) | |
for k in range(len(block)): | |
bytes_width = 1 + ((head >> (6 - (k << 1))) & 3) | |
stream.write(block[k].to_bytes(bytes_width, 'little')) | |
def _write_int_pack4(stream: io.BytesIO, data: list[int]): | |
# 6 4 2 0 | |
# | 11 | 11 | 11 | 11 | | |
head = 0 | |
block = [] | |
for v in data: | |
if v == 0: continue | |
size = int_bytes_width(v) | |
head |= (size - 1) << (6 - (len(block) << 1)) | |
block.append(v) | |
if len(block) == 4: | |
_write_int_pack4_block(stream, head, block) | |
block.clear() | |
head = 0 | |
if block: | |
_write_int_pack4_block(stream, head, block) | |
def data_encode(data: list[int]) -> bytes: | |
stream = io.BytesIO() | |
stream.write(len(data).to_bytes(4, 'little')) | |
_write_bitmap(stream, data) | |
_write_int_pack4(stream, data) | |
return stream.getvalue() | |
def data_decode(enc: bytes) -> list[int]: | |
length = int.from_bytes(enc[0:4], 'little') | |
if length == 0: return [] | |
bitmap_length = ((length + 7) >> 3) | |
non_zero = 0 | |
for i in range(bitmap_length): | |
non_zero += (enc[4 + i] & 0xff).bit_count() | |
values = [] | |
values_offset = 4 + bitmap_length | |
eof_offset = len(enc) | |
while values_offset < eof_offset: | |
head = enc[values_offset] & 0xff | |
values_offset += 1 | |
for k in range(4): | |
if len(values) >= non_zero: | |
break | |
bytes_width = 1 + ((head >> (6 - (k << 1))) & 3) | |
values.append(int.from_bytes(enc[values_offset:values_offset+bytes_width], 'little')) | |
values_offset += bytes_width | |
assert len(values) == non_zero | |
assert values_offset == eof_offset | |
data = [] | |
value_index = 0; | |
for i in range(bitmap_length): | |
bitmap_byte = enc[4 + i] & 0xff | |
for k in range(7, -1, -1): | |
if len(data) >= length: | |
break | |
if ((bitmap_byte & (1 << k)) != 0): | |
data.append(values[value_index]) | |
value_index += 1 | |
else: | |
data.append(0) | |
assert len(data) == length | |
assert value_index == non_zero | |
return data |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment