Created
July 18, 2012 15:53
-
-
Save astaxie/3137091 to your computer and use it in GitHub Desktop.
golang IsTextUTF8
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// IsTextUTF8 reports whether inputStream is well-formed UTF-8 that contains
// at least one multi-byte (non-ASCII) character.
//
// Input consisting solely of ASCII bytes (including empty or nil input) is
// deliberately reported as false: although ASCII is a subset of UTF-8, such a
// stream is treated as being in the default encoding rather than UTF-8.
//
// Validation follows the well-formed byte sequence table of RFC 3629, so the
// following are rejected:
//   - stray continuation bytes and truncated sequences,
//   - overlong encodings (lead bytes 0xC0/0xC1, E0 80..9F, F0 80..8F),
//   - UTF-16 surrogate code points U+D800..U+DFFF (ED A0..BF),
//   - code points above U+10FFFF (F4 90.. and lead bytes 0xF5..0xFF).
func IsTextUTF8(inputStream []byte) bool {
	sawMultiByte := false

	for i := 0; i < len(inputStream); {
		lead := inputStream[i]
		if lead < 0x80 {
			// ASCII chars, from 0x00-0x7F.
			i++
			continue
		}

		// Work out the sequence length from the lead byte, together with the
		// permitted range of the first continuation byte. That range is
		// narrower than 0x80..0xBF for a few lead bytes; this is what rules
		// out overlong forms, surrogates and values beyond U+10FFFF.
		var size int
		lo, hi := byte(0x80), byte(0xBF)
		switch {
		case lead < 0xC2:
			// 0x80..0xBF is a continuation byte without a lead byte;
			// 0xC0 and 0xC1 could only start an overlong 2-byte form.
			return false
		case lead <= 0xDF:
			size = 2
		case lead == 0xE0:
			size, lo = 3, 0xA0 // below 0xA0 would be overlong
		case lead == 0xED:
			size, hi = 3, 0x9F // above 0x9F would be a surrogate
		case lead <= 0xEF:
			size = 3
		case lead == 0xF0:
			size, lo = 4, 0x90 // below 0x90 would be overlong
		case lead <= 0xF3:
			size = 4
		case lead == 0xF4:
			size, hi = 4, 0x8F // above 0x8F would exceed U+10FFFF
		default:
			// 0xF5..0xFF never appear in well-formed UTF-8.
			return false
		}

		if i+size > len(inputStream) {
			// Sequence is cut off by the end of the input.
			return false
		}
		if c := inputStream[i+1]; c < lo || c > hi {
			return false
		}
		// Remaining continuation bytes must start with bits 10.
		for _, c := range inputStream[i+2 : i+size] {
			if c&0xC0 != 0x80 {
				return false
			}
		}

		sawMultiByte = true
		i += size
	}

	// Valid UTF-8, but only counts as "UTF-8 text" if something non-ASCII
	// was actually present.
	return sawMultiByte
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment