Last active
November 15, 2024 07:58
-
-
Save BarelyAliveMau5/000e7e453b6d4ebd0cb06f39bc2e7aec to your computer and use it in GitHub Desktop.
python's version of java's modified utf8
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# translated from: http://hg.openjdk.java.net/jdk8/jdk8/jdk/file/94cc251d0c45/src/share/npt/utf.c
def utf8s_to_utf8m(string):
    """
    Convert standard UTF-8 bytes to Java-style "modified UTF-8".

    Two things change relative to the input:

    * a NUL byte is written as the two-byte form ``C0 80``;
    * every 4-byte sequence is rewritten as two 3-byte sequences (the
      encodings of the high and low surrogate of that code point).

    1-, 2- and 3-byte sequences are copied through unchanged.  The input is
    not validated: a byte that matches no lead-byte pattern (for example a
    stray continuation byte) is skipped, and for ``bytes`` input a multi-byte
    sequence cut short by the end of the data raises ``IndexError``.

    :param string: utf8 encoded string (``bytes`` or another indexable
        sequence of integer byte values)
    :return: modified utf8 encoded string, as ``bytes``
    """
    out = bytearray()
    length = len(string)
    pos = 0
    while pos < length:
        lead = string[pos]
        if (lead & 0x80) == 0:
            # 0xxxxxxx: single byte.  NUL is the only value that changes.
            if lead == 0:
                # Bits out: 11000000 10000000
                out += b"\xc0\x80"
            else:
                out.append(lead)
            pos += 1
        elif (lead & 0xE0) == 0xC0:
            # 110xxxxx: 2-byte encoding, copied verbatim.
            out.append(lead)
            out.append(string[pos + 1])
            pos += 2
        elif (lead & 0xF0) == 0xE0:
            # 1110xxxx: 3-byte encoding, copied verbatim.
            out.append(lead)
            out.append(string[pos + 1])
            out.append(string[pos + 2])
            pos += 3
        elif (lead & 0xF8) == 0xF0:
            # 11110xxx: 4-byte encoding, turned into two 3-byte encodings.
            # Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            byte2 = string[pos + 1]
            byte3 = string[pos + 2]
            byte4 = string[pos + 3]
            # Reconstruct the full 21-bit value (the bit fields are disjoint).
            u21 = (
                ((lead & 0x07) << 18)
                | ((byte2 & 0x3F) << 12)
                | ((byte3 & 0x3F) << 6)
                | (byte4 & 0x3F)
            )
            # Bits out: 11101101 1010xxxx 10xxxxxx
            # The "- 1" removes the 0x10000 offset of supplementary planes.
            out.append(0xED)
            out.append(0xA0 + (((u21 >> 16) - 1) & 0x0F))
            out.append(0x80 + ((u21 >> 10) & 0x3F))
            # Bits out: 11101101 1011xxxx 10xxxxxx
            out.append(0xED)
            out.append(0xB0 + ((u21 >> 6) & 0x0F))
            # The low six bits already sit in the original last byte.
            out.append(byte4)
            pos += 4
        else:
            # No lead-byte pattern matched: skip the byte.
            pos += 1
    return bytes(out)
def utf8m_to_utf8s(string):
    """
    Convert Java-style "modified UTF-8" bytes to standard UTF-8.

    Two things change relative to the input:

    * the two-byte form ``C0 80`` becomes a single NUL byte;
    * a 3-byte high-surrogate encoding (``ED Ax xx``) immediately followed
      by a 3-byte low-surrogate encoding (``ED Bx xx``) is merged into one
      4-byte sequence.

    Everything else in the 1-, 2- and 3-byte ranges is copied through
    unchanged, including an unpaired surrogate encoding.  The input is not
    validated: a byte that matches no 1/2/3-byte lead pattern (a stray
    continuation byte or a 4-byte lead) is skipped, and for ``bytes`` input a
    sequence cut short by the end of the data raises ``IndexError``.

    Every input byte is consumed exactly once.  In particular the six bytes
    of a surrogate pair are all skipped after the merged sequence is
    written, so the pair's last byte is never scanned a second time.

    :param string: modified utf8 encoded string (``bytes`` or another
        indexable sequence of integer byte values)
    :return: utf8 encoded string, as ``bytes``
    """
    out = bytearray()
    length = len(string)
    pos = 0
    while pos < length:
        byte1 = string[pos]
        if (byte1 & 0x80) == 0:
            # 0xxxxxxx: 1-byte encoding.
            out.append(byte1)
            pos += 1
        elif (byte1 & 0xE0) == 0xC0:
            # 110xxxxx: 2-byte encoding; C0 80 is the encoded NUL.
            byte2 = string[pos + 1]
            if byte1 == 0xC0 and byte2 == 0x80:
                out.append(0)
            else:
                out.append(byte1)
                out.append(byte2)
            pos += 2
        elif (byte1 & 0xF0) == 0xE0:
            # 1110xxxx: 3-byte encoding, possibly the first half of a pair.
            byte2 = string[pos + 1]
            byte3 = string[pos + 2]
            # A pair needs three more bytes after this sequence.
            is_pair = (
                pos + 5 < length
                and byte1 == 0xED
                and (byte2 & 0xF0) == 0xA0
                and string[pos + 3] == 0xED
                and (string[pos + 4] & 0xF0) == 0xB0
            )
            if is_pair:
                # Bits in: 11101101 1010xxxx 10xxxxxx
                # Bits in: 11101101 1011xxxx 10xxxxxx
                byte5 = string[pos + 4]
                byte6 = string[pos + 5]
                # Reconstruct the 21-bit code; "+ 1" restores the 0x10000
                # offset that the surrogate encoding removes.
                u21 = ((byte2 & 0x0F) + 1) << 16
                u21 += (byte3 & 0x3F) << 10
                u21 += (byte5 & 0x0F) << 6
                u21 += byte6 & 0x3F
                # Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                out.append(0xF0 + ((u21 >> 18) & 0x07))
                out.append(0x80 + ((u21 >> 12) & 0x3F))
                out.append(0x80 + ((u21 >> 6) & 0x3F))
                out.append(0x80 + (u21 & 0x3F))
                # Skip both 3-byte halves, including the final byte.
                pos += 6
            else:
                # Normal 3-byte encoding.
                out.append(byte1)
                out.append(byte2)
                out.append(byte3)
                pos += 3
        else:
            # No 1/2/3-byte lead pattern matched: skip the byte.
            pos += 1
    return bytes(out)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment