Created
May 12, 2025 13:18
-
-
Save jsonzilla/881c012a0fce70c579d426c30a3a36ff to your computer and use it in GitHub Desktop.
What this can't do: it can't guarantee that a string is "ANSI" — it only tells you that the string is not valid UTF-8, in which case we assume ANSI. If the string contains only ASCII characters, it is valid as both UTF-8 and ANSI, so the two cannot be distinguished.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <string> | |
#include <locale> | |
#include <codecvt> | |
/// @brief Transcode a single-byte "ANSI" string (Windows-1252, a superset of
///        the printable part of Latin-1) into UTF-8.
///
/// The conversion is done byte by byte and does not depend on the global or
/// C locale, nor on the deprecated <codecvt> facets.
///
/// @param ansiStr Input bytes, interpreted as Windows-1252.
/// @return The same text encoded as UTF-8. ASCII bytes are copied unchanged;
///         every byte >= 0x80 becomes a two- or three-byte sequence.
std::string ansiToUtf8(const std::string& ansiStr) {
    // Windows-1252 code points for bytes 0x80..0x9F. The five bytes the code
    // page leaves unassigned (0x81, 0x8D, 0x8F, 0x90, 0x9D) map to the C1
    // control with the same value, so no input byte is ever rejected.
    static constexpr char16_t kCp1252High[32] = {
        0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
        0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
        0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
        0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
    };

    std::string utf8;
    utf8.reserve(ansiStr.size() * 2);  // most non-ASCII bytes expand to 2 bytes

    for (const char ch : ansiStr) {
        const auto byte = static_cast<unsigned char>(ch);

        // Bytes outside 0x80..0x9F have a code point equal to the byte value.
        char32_t codePoint = byte;
        if (byte >= 0x80 && byte <= 0x9F) {
            codePoint = kCp1252High[byte - 0x80];
        }

        if (codePoint < 0x80) {
            utf8.push_back(static_cast<char>(codePoint));
        } else if (codePoint < 0x800) {
            utf8.push_back(static_cast<char>(0xC0 | (codePoint >> 6)));
            utf8.push_back(static_cast<char>(0x80 | (codePoint & 0x3F)));
        } else {
            // Every entry in the table is below U+10000, so three bytes suffice.
            utf8.push_back(static_cast<char>(0xE0 | (codePoint >> 12)));
            utf8.push_back(static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F)));
            utf8.push_back(static_cast<char>(0x80 | (codePoint & 0x3F)));
        }
    }
    return utf8;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <string> | |
#include <locale> | |
#include <codecvt> | |
/// @brief Check whether a byte string is well-formed UTF-8.
///
/// Beyond the lead/continuation bit patterns, this rejects sequences that a
/// purely structural check lets through: overlong encodings (e.g. C0 80),
/// UTF-16 surrogates (U+D800..U+DFFF) and code points above U+10FFFF.
///
/// @param str Bytes to validate.
/// @return true if every byte belongs to a well-formed UTF-8 sequence
///         (an empty string is valid); false otherwise.
bool isValidUtf8(const std::string& str) {
    const std::size_t size = str.size();
    std::size_t i = 0;

    while (i < size) {
        const auto lead = static_cast<unsigned char>(str[i]);

        if (lead < 0x80) {  // ASCII
            ++i;
            continue;
        }

        std::size_t length = 0;
        char32_t codePoint = 0;
        char32_t minimum = 0;  // smallest code point that needs `length` bytes

        if ((lead & 0xE0) == 0xC0) {
            length = 2;
            codePoint = lead & 0x1F;
            minimum = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {
            length = 3;
            codePoint = lead & 0x0F;
            minimum = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {
            length = 4;
            codePoint = lead & 0x07;
            minimum = 0x10000;
        } else {
            return false;  // stray continuation byte or invalid lead byte
        }

        if (size - i < length) {
            return false;  // sequence truncated at end of input
        }

        for (std::size_t j = 1; j < length; ++j) {
            const auto cont = static_cast<unsigned char>(str[i + j]);
            if ((cont & 0xC0) != 0x80) {
                return false;  // invalid continuation byte
            }
            codePoint = (codePoint << 6) | (cont & 0x3F);
        }

        if (codePoint < minimum) {
            return false;  // overlong encoding
        }
        if (codePoint > 0x10FFFF) {
            return false;  // beyond the Unicode code space
        }
        if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
            return false;  // UTF-16 surrogate, not a scalar value
        }

        i += length;
    }
    return true;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Demo driver: feeds the single byte 0xE9 through ansiToUtf8 and prints
// the result after a "UTF-8: " label.
int main() {
    const std::string sampleBytes{"\xE9"};  // ANSI 'é'
    const std::string utf8Text = ansiToUtf8(sampleBytes);

    std::cout << "UTF-8: ";
    std::cout << utf8Text << std::endl;
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment