Last active
August 2, 2025 11:35
-
-
Save mdchaney/da83cb66d7825b68456901902fef9ea0 to your computer and use it in GitHub Desktop.
Fix encoding to deal with mixed UTF-8 / Latin-1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Normalize a string that mixes UTF-8 and Latin-1 bytes into valid UTF-8.
#
# The input is processed as raw bytes:
# * a leading UTF-8 byte order mark is removed;
# * ASCII bytes and well-formed UTF-8 sequences pass through unchanged;
# * a stray high byte that is one of the "smart quote" bytes handled below
#   becomes a plain ASCII quote;
# * any other stray high byte is assumed to be ISO-8859-1 and is
#   transcoded to UTF-8.
#
# str - the String to repair. It is not modified.
#
# Returns a new String tagged UTF-8 whose bytes are always valid UTF-8.
def fix_encoding(str)
  # String#b returns a copy tagged ASCII-8BIT, so everything below works
  # on raw bytes and never touches the caller's string.
  bytes = str.b

  # Strip the UTF-8 BOM if it is at the start of the data.
  bytes = bytes.byteslice(3..-1) if bytes.byteslice(0, 3) == "\xEF\xBB\xBF".b

  # Pure ASCII is already valid UTF-8; only the encoding tag changes.
  return bytes.force_encoding('UTF-8') if bytes.ascii_only?

  # Stray bytes that stand for curly or low quotes, single angle quotes and
  # the acute accent, mapped to their plain ASCII stand-ins.
  ascii_quotes = {
    0x82 => "'", 0x8b => "'", 0x91 => "'", 0x92 => "'", 0x9b => "'",
    0xb4 => "'", 0x84 => '"', 0x93 => '"', 0x94 => '"'
  }

  # Split the bytes into tokens: one ASCII byte, one well-formed UTF-8
  # sequence, or one stray high byte. The multi-byte alternatives follow
  # the well-formed ranges of the Unicode standard exactly, so overlong
  # forms, UTF-16 surrogates and values above U+10FFFF are NOT accepted as
  # UTF-8; their bytes fall through to the last alternative one at a time.
  # Accepting them would put invalid UTF-8 into the result.
  tokens = bytes.scan(/
      [\x00-\x7f]
    | [\xc2-\xdf][\x80-\xbf]
    | \xe0[\xa0-\xbf][\x80-\xbf]
    | [\xe1-\xec\xee\xef][\x80-\xbf]{2}
    | \xed[\x80-\x9f][\x80-\xbf]
    | \xf0[\x90-\xbf][\x80-\xbf]{2}
    | [\xf1-\xf3][\x80-\xbf]{3}
    | \xf4[\x80-\x8f][\x80-\xbf]{2}
    | [\x80-\xff]
  /nx)

  repaired = tokens.map do |token|
    first_byte = token.getbyte(0)
    if token.bytesize > 1
      # Well-formed UTF-8 sequence: keep the bytes, fix the tag.
      token.force_encoding('UTF-8')
    elsif ascii_quotes.key?(first_byte)
      ascii_quotes[first_byte]
    elsif first_byte > 127
      # Remaining stray high byte: treat as ISO-8859-1.
      token.force_encoding('ISO-8859-1').encode('UTF-8')
    else
      token.force_encoding('UTF-8')
    end
  end

  repaired.join.force_encoding('UTF-8')
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment