mdchaney · August 2, 2025 11:35
diff --git a/fix_encoding.rb b/fix_encoding.rb
 # Coerces a string of unknown or mixed encoding into valid UTF-8.
 #
 # Well-formed UTF-8 sequences and plain ASCII pass through unchanged. Every
 # byte that is not part of a well-formed UTF-8 sequence is handled on its
 # own: the "smart quote" bytes listed below become plain ASCII quotes, and
 # any other byte is assumed to be ISO-8859-1 and converted to UTF-8.
 # A leading UTF-8 byte order mark is removed.
 #
 # Validity is decided by Ruby's own UTF-8 checker, so overlong forms,
 # UTF-16 surrogates and lead bytes above 0xF4 are treated as stray bytes
 # rather than being passed through; the result always satisfies
 # String#valid_encoding?.
 #
 # @param str [String] raw text in any encoding; it is never mutated
 # @return [String] a new string tagged UTF-8 and containing valid UTF-8
 def fix_encoding(str)
   # String#b returns a copy tagged ASCII-8BIT, so the caller's string is
   # left untouched and the BOM check is a plain byte comparison.
   raw = str.b

   # Strip UTF-8 BOM if it's at the start of the data.
   raw = raw.byteslice(3..-1) if raw.start_with?("\xEF\xBB\xBF".b)

   # Fast path: pure ASCII and already-valid UTF-8 need no repair.
   text = raw.force_encoding('UTF-8')
   return text if text.valid_encoding?

   # Stray bytes that stand for typographic quotes, mapped to ASCII quotes.
   smart_quotes = {
     0x82 => "'", 0x8b => "'", 0x91 => "'", 0x92 => "'", 0x9b => "'", 0xb4 => "'",
     0x84 => '"', 0x93 => '"', 0x94 => '"'
   }

   # scrub yields each run of bytes that is not well-formed UTF-8. A run may
   # hold several bytes (e.g. a truncated sequence), so convert byte by byte.
   text.scrub do |invalid|
     invalid.each_byte.map do |byte|
       # ISO-8859-1 byte values equal their Unicode code points, so encoding
       # the byte value as a UTF-8 character is the ISO-8859-1 conversion.
       smart_quotes.fetch(byte) { byte.chr(Encoding::UTF_8) }
     end.join
   end
 end
	# Coerces a string of unknown or mixed encoding into valid UTF-8.
	#
	# Well-formed UTF-8 sequences and plain ASCII pass through unchanged. Every
	# byte that is not part of a well-formed UTF-8 sequence is handled on its
	# own: the "smart quote" bytes listed below become plain ASCII quotes, and
	# any other byte is assumed to be ISO-8859-1 and converted to UTF-8.
	# A leading UTF-8 byte order mark is removed.
	#
	# Validity is decided by Ruby's own UTF-8 checker, so overlong forms,
	# UTF-16 surrogates and lead bytes above 0xF4 are treated as stray bytes
	# rather than being passed through; the result always satisfies
	# String#valid_encoding?.
	#
	# @param str [String] raw text in any encoding; it is never mutated
	# @return [String] a new string tagged UTF-8 and containing valid UTF-8
	def fix_encoding(str)
	  # String#b returns a copy tagged ASCII-8BIT, so the caller's string is
	  # left untouched and the BOM check is a plain byte comparison.
	  raw = str.b

	  # Strip UTF-8 BOM if it's at the start of the data.
	  raw = raw.byteslice(3..-1) if raw.start_with?("\xEF\xBB\xBF".b)

	  # Fast path: pure ASCII and already-valid UTF-8 need no repair.
	  text = raw.force_encoding('UTF-8')
	  return text if text.valid_encoding?

	  # Stray bytes that stand for typographic quotes, mapped to ASCII quotes.
	  smart_quotes = {
	    0x82 => "'", 0x8b => "'", 0x91 => "'", 0x92 => "'", 0x9b => "'", 0xb4 => "'",
	    0x84 => '"', 0x93 => '"', 0x94 => '"'
	  }

	  # scrub yields each run of bytes that is not well-formed UTF-8. A run may
	  # hold several bytes (e.g. a truncated sequence), so convert byte by byte.
	  text.scrub do |invalid|
	    invalid.each_byte.map do |byte|
	      # ISO-8859-1 byte values equal their Unicode code points, so encoding
	      # the byte value as a UTF-8 character is the ISO-8859-1 conversion.
	      smart_quotes.fetch(byte) { byte.chr(Encoding::UTF_8) }
	    end.join
	  end
	end