Skip to content

Instantly share code, notes, and snippets.

@apeiros
Created February 18, 2011 19:13

Revisions

  1. apeiros revised this gist Nov 12, 2011. 1 changed file with 48 additions and 63 deletions.
    111 changes: 48 additions & 63 deletions transliteration.rb
    Original file line number Diff line number Diff line change
    @@ -16,6 +16,8 @@
    #
    # Beware, this module does not perform proper unicode collation based operations
    module Transliterate
    BiggestSortableNumber = ((128**128)-1)
    SingleDigitToNaturalKey = %W[\u000a \u000b \u000c \u000d \u000e \u000f \u0010 \u0011 \u0012 \u0013 \u0014]
    module_function
    UpperToLower = Hash[%W[
    A a
    @@ -75,64 +77,7 @@ module Transliterate
    Z z
    ].each_slice(2).to_a]

    LowerToUpper = Hash[%W[
    a A
    \u00e5 \u00c5
    \u00e4 \u00c4
    \u00e6 \u00c6
    \u00e2 \u00c2
    \u00e1 \u00c1
    \u00e0 \u00c0
    \u00e3 \u00c3
    b B
    c C
    \u00e7 \u00c7
    d D
    e E
    \u00eb \u00cb
    \u00e9 \u00c9
    \u00ea \u00ca
    \u00e8 \u00c8
    f F
    g G
    h H
    i I
    \u00ef \u00cf
    \u00ed \u00cd
    \u00ee \u00ce
    \u00ec \u00cc
    j J
    k K
    l L
    m M
    n N
    \u00f1 \u00d1
    o O
    \u00f6 \u00d6
    \u00f3 \u00d3
    \u00f4 \u00d4
    \u00f2 \u00d2
    \u00f5 \u00d5
    \u00f8 \u00d8
    p P
    q Q
    r R
    s S
    \u00df \u1e9e
    t T
    u U
    \u00fc \u00dc
    \u00fa \u00da
    \u00fb \u00db
    \u00f9 \u00d9
    v V
    w W
    x X
    y Y
    \u00ff \u0178
    z Z
    ].each_slice(2).to_a]

    LowerToUpper = UpperToLower.invert
    SwapCase = UpperToLower.merge(LowerToUpper)
    UpperCase = UpperToLower.keys.join('')
    LowerCase = LowerToUpper.keys.join('')
    @@ -155,7 +100,7 @@ module Transliterate
    value = Alphabet[index]
    equivalents.each_char { |char| CaseInsensitiveIndex[char] = value }
    end
    NaturalSortUnknown = /[^#{Regexp.escape(CaseSensitiveOrder)}]/u
    NaturalSortUnknown = /[\x00-\x19]|[^#{Regexp.escape(CaseSensitiveOrder)}]/u
    NaturalSortKnown = /[#{Regexp.escape(CaseSensitiveOrder)}]/u

    Transliterate = Hash[%W[
    @@ -282,7 +227,7 @@ def case_insensitive_natural_sort_key(string, stable=true)
    squeeze(' '). # remove duplicate whitespace
    gsub(NaturalSortUnknown, '') # remove unknown characters
    part1 = base.
    gsub(/\d+/) { |value| base128(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe)
    gsub(/[+-]?\d+/) { |value| digits_to_natural_key(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe)
    gsub(NaturalSortKnown, CaseInsensitiveIndex) # map characters

    if stable then
    @@ -299,17 +244,57 @@ def case_sensitive_natural_sort_key(string)
    strip. # remove leading & trailing whitespace
    squeeze(' '). # remove duplicate whitespace
    gsub(NaturalSortUnknown, ''). # remove unknown characters
    gsub(/\d+/) { |value| base128(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe)
    gsub(/[+-]?\d+/) { |value| digits_to_natural_key(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe)
    gsub(NaturalSortKnown, CaseSensitiveIndex) # map characters
    end

    # performs a binary encoding whose result can be compared on a binary level correctly.
    # To do that it uses a runtime length encoding of the number plus a base128 encoding.
    # It works with numbers up to 128 digits long (in base 128), that means the biggest
    # number correctly processed is 270 digits long in decimal:
    # 5282945311356652463523397849165166065188473260361215221279607090266739025567248594…
    # 7441725588765718789467439499325712867888234755950268553725053897846293957690838668399…
    # 9005084168731517676426441053024232908211188404148028292751561738838396898767036476489…
    # 538580897737998335 - a pretty friggin huge number. Larger numbers are truncated to
    # that.
    # The scheme used will never use more bytes than the ascii representation of the number
    # used. E.g. 0-9 will still use 1 byte only. Larger numbers even use less space, e.g.
    # 999_999_999_999 (12 bytes in decimal in ascii) can be represented by 8 bytes.
    # Theoretically the implementation could be improved to support numbers as big as
    # 128^(128^8).
    def digits_to_natural_key(value)
    # * 1-8 are for runtime length encoded negative numbers
    # * 9 is for 1 digit (in base 128) negative numbers
    # * 10-29 for single digits
    # * 21 for 1 digit (in base 128) positive numbers
    # * 22-29 for runtime length encoded positive numbers
    if value.between?(0,9) then
    SingleDigitToNaturalKey[value]
    else
    sign = value < 0
    value = value.abs
    value = BiggestSortableNumber if value > BiggestSortableNumber
    converted = base128(value)
    if converted.size == 1 then
    converted << (sign ? 10 : 21)
    else
    converted << converted.size
    converted << (sign ? 9 : 22) # simplified version
    # converted << (sign ? 10-converted.size : 21+converted.size) # this version would enable very large numbers
    end

    converted.reverse.pack("U*").encode(Encoding::UTF_8) # must be big-endian
    end
    end

    def base128(value)
    base128 = []
    begin
    value, digit = value.divmod(128)
    base128 << digit
    end until value.zero?
    base128.reverse.pack("xU*").encode(Encoding::UTF_8) # must be big-endian

    base128
    end
    end

    @@ -318,7 +303,7 @@ def base128(value)
    require 'test/unit'
    class TestNatcmp < Test::Unit::TestCase
    def test_natsortkey
    assert_equal(%w(hello3.jpg Hello5.jpg hello10.jpg), %w(hello3.jpg hello10.jpg Hello5.jpg).sort_by { |a| case_insensitive_natural_sort_key(a) })
    assert_equal(%w(hello3.jpg Hello5.jpg hello10.jpg Hello329.jpg hello292349282912405965338291184.jpg), %w(hello292349282912405965338291184.jpg hello3.jpg Hello329.jpg hello10.jpg Hello5.jpg).sort_by { |a| case_insensitive_natural_sort_key(a) })
    assert_equal(%w(Hello5.jpg hello3.jpg hello10.jpg), %w(hello3.jpg hello10.jpg Hello5.jpg).sort_by { |a| case_sensitive_natural_sort_key(a) })
    assert_equal(%w(Hagar Hägar Hugar), %w(Hugar Hägar Hagar).sort_by { |a| case_sensitive_natural_sort_key(a) })
    assert_equal(%w(Hagar Hugar Hägar), %w(Hugar Hägar Hagar).sort_by { |a| a })
  2. apeiros revised this gist Nov 12, 2011. 1 changed file with 5 additions and 0 deletions.
    5 changes: 5 additions & 0 deletions transliteration.rb
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,10 @@
    # Encoding: utf-8

    # The following code is under BSD 2-clause license
    # -> http://en.wikipedia.org/wiki/BSD_licenses#2-clause_license_.28.22Simplified_BSD_License.22_or_.22FreeBSD_License.22.29
    #
    # Author: Stefan Rusterholz <stefan.rusterholz@gmail.com> - https://github.com/apeiros/
    #
    # A module to help with transliteration issues.
    # Provides methods for:
    # * Changing the case of characters/strings, mapping most latin characters
  3. apeiros revised this gist Nov 12, 2011. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion transliteration.rb
    Original file line number Diff line number Diff line change
    @@ -309,7 +309,7 @@ def base128(value)
    end

    if __FILE__ == $0 then
    include Transliteration
    include Transliterate
    require 'test/unit'
    class TestNatcmp < Test::Unit::TestCase
    def test_natsortkey
  4. apeiros revised this gist Nov 12, 2011. 1 changed file with 18 additions and 15 deletions.
    33 changes: 18 additions & 15 deletions transliteration.rb
    Original file line number Diff line number Diff line change
    @@ -10,14 +10,15 @@
    # All methods output valid utf-8 strings
    #
    # Beware, this module does not perform proper unicode collation based operations
    module Transliteration
    module Transliterate
    module_function
    UpperToLower = Hash[%W[
    A a
    \u00c5 \u00e5
    \u00c4 \u00e4
    \u00c6 \u00e6
    \u00c2 \u00e2
    \u00c1 \u00e1
    \u00c0 \u00e0
    \u00c3 \u00e3
    B b
    @@ -34,6 +35,7 @@ module Transliteration
    H h
    I i
    \u00cf \u00ef
    \u00cd \u00ed
    \u00ce \u00ee
    \u00cc \u00ec
    J j
    @@ -44,6 +46,7 @@ module Transliteration
    \u00d1 \u00f1
    O o
    \u00d6 \u00f6
    \u00d3 \u00f3
    \u00d4 \u00f4
    \u00d2 \u00f2
    \u00d5 \u00f5
    @@ -56,6 +59,7 @@ module Transliteration
    T t
    U u
    \u00dc \u00fc
    \u00da \u00fa
    \u00db \u00fb
    \u00d9 \u00f9
    V v
    @@ -72,7 +76,7 @@ module Transliteration
    \u00e4 \u00c4
    \u00e6 \u00c6
    \u00e2 \u00c2
    \u00e1 A
    \u00e1 \u00c1
    \u00e0 \u00c0
    \u00e3 \u00c3
    b B
    @@ -89,7 +93,7 @@ module Transliteration
    h H
    i I
    \u00ef \u00cf
    \u00ed I
    \u00ed \u00cd
    \u00ee \u00ce
    \u00ec \u00cc
    j J
    @@ -100,7 +104,7 @@ module Transliteration
    \u00f1 \u00d1
    o O
    \u00f6 \u00d6
    \u00f3 O
    \u00f3 \u00d3
    \u00f4 \u00d4
    \u00f2 \u00d2
    \u00f5 \u00d5
    @@ -113,7 +117,7 @@ module Transliteration
    t T
    u U
    \u00fc \u00dc
    \u00fa U
    \u00fa \u00da
    \u00fb \u00db
    \u00f9 \u00d9
    v V
    @@ -127,10 +131,10 @@ module Transliteration
    SwapCase = UpperToLower.merge(LowerToUpper)
    UpperCase = UpperToLower.keys.join('')
    LowerCase = LowerToUpper.keys.join('')
    MatchUpper = Regexp.new(/[#{Regexp.escape(UpperCase)}]/u)
    MatchLower = Regexp.new(/[#{Regexp.escape(LowerCase)}]/u)
    MatchChar = Regexp.new(/[#{Regexp.escape(SwapCase.keys.join(''))}]/u)
    MatchNoChar = Regexp.new(/[^#{Regexp.escape(SwapCase.keys.join(''))}]/u)
    MatchUpper = /[#{Regexp.escape(UpperCase)}]/u
    MatchLower = /[#{Regexp.escape(LowerCase)}]/u
    MatchChar = /[#{Regexp.escape(SwapCase.keys.join(''))}]/u
    MatchNoChar = /[^#{Regexp.escape(SwapCase.keys.join(''))}]/u

    mixed_case = UpperToLower.dup
    LowerToUpper.each do |key, value|
    @@ -165,7 +169,7 @@ module Transliteration
    \u00ce I I
    \u00cc I I
    \u00d1 N N
    \u00d6 O OE
    \u00d6 O Oe
    \u00d4 O O
    \u00d2 O O
    \u00d5 O O
    @@ -191,7 +195,7 @@ module Transliteration
    \u00ee i i
    \u00ec i i
    \u00f1 n n
    \u00f6 o o
    \u00f6 o oe
    \u00f3 o o
    \u00f4 o o
    \u00f2 o o
    @@ -213,7 +217,7 @@ module Transliteration
    Transliterate[' '] = [' ', ' ']
    Transliterate1 = Hash[Transliterate.map { |key, (map1, map2)| [key, map1] }]
    Transliterate2 = Hash[Transliterate.map { |key, (map1, map2)| [key, map2] }]
    TransliterateMatch = Regexp.new("["+Regexp.escape(Transliterate.keys.join(''))+"]")
    TransliterateMatch = /[#{Regexp.escape(Transliterate.keys.join(''))}]/u

    # Maps non-7bit characters to a single 7 bit character, e.g. "ä" is mapped
    # to "a"
    @@ -224,7 +228,7 @@ module Transliteration
    # Also see transliterate2, which tries to retain the meaning
    def transliterate(string, replace_unknown="", extend_transliteration=nil)
    if extend_transliteration then
    match = Regexp.union(TransliterateMatch, Regexp.new("["+Regexp.escape(extend_transliteration.keys.join(''))+"]"))
    match = Regexp.union(TransliterateMatch, /[#{Regexp.escape(extend_transliteration.keys.join(''))}]/u)
    replace = Transliterate1.merge(extend_transliteration)
    else
    match = TransliterateMatch
    @@ -243,7 +247,7 @@ def transliterate(string, replace_unknown="", extend_transliteration=nil)
    # Also see transliterate, which maps single characters to single characters
    def transliterate2(string, replace_unknown="", extend_transliteration=nil)
    if extend_transliteration then
    match = Regexp.union(TransliterateMatch, Regexp.new("["+Regexp.escape(extend_transliteration.keys.join(''))+"]"))
    match = Regexp.union(TransliterateMatch, /[#{Regexp.escape(extend_transliteration.keys.join(''))}]/u)
    replace = Transliterate2.merge(extend_transliteration)
    else
    match = TransliterateMatch
    @@ -303,7 +307,6 @@ def base128(value)
    base128.reverse.pack("xU*").encode(Encoding::UTF_8) # must be big-endian
    end
    end
    include Transliteration

    if __FILE__ == $0 then
    include Transliteration
  5. apeiros revised this gist Feb 18, 2011. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions transliteration.rb
    Original file line number Diff line number Diff line change
    @@ -254,15 +254,15 @@ def transliterate2(string, replace_unknown="", extend_transliteration=nil)
    end

    def downcase(string)
    string.encode(Encoding::UTF_8).downcase.gsub(MatchUpper, UpperToLower)
    string.encode(Encoding::UTF_8).gsub(MatchUpper, UpperToLower)
    end

    def upcase(string)
    string.encode(Encoding::UTF_8).upcase.gsub(MatchLower, LowerToUpper)
    string.encode(Encoding::UTF_8).gsub(MatchLower, LowerToUpper)
    end

    def swapcase(string)
    string.encode(Encoding::UTF_8).downcase.gsub(MatchChar, SwapCase)
    string.encode(Encoding::UTF_8).gsub(MatchChar, SwapCase)
    end

    # stable makes that differently cased letters don't
  6. apeiros revised this gist Feb 18, 2011. 1 changed file with 15 additions and 2 deletions.
    17 changes: 15 additions & 2 deletions transliteration.rb
    Original file line number Diff line number Diff line change
    @@ -5,6 +5,7 @@
    # * Changing the case of characters/strings, mapping most latin characters
    # * Generate a value to perform natural sorting of strings (e.g. "10" > "2",
    # "ä" < "z" etc.)
    # * Transliterate a utf-8 string to 7bit chars only (e.g. map "ä" to "a" or "ae")
    #
    # All methods output valid utf-8 strings
    #
    @@ -129,6 +130,7 @@ module Transliteration
    MatchUpper = Regexp.new(/[#{Regexp.escape(UpperCase)}]/u)
    MatchLower = Regexp.new(/[#{Regexp.escape(LowerCase)}]/u)
    MatchChar = Regexp.new(/[#{Regexp.escape(SwapCase.keys.join(''))}]/u)
    MatchNoChar = Regexp.new(/[^#{Regexp.escape(SwapCase.keys.join(''))}]/u)

    mixed_case = UpperToLower.dup
    LowerToUpper.each do |key, value|
    @@ -265,13 +267,21 @@ def swapcase(string)

    # stable makes that differently cased letters don't
    def case_insensitive_natural_sort_key(string, stable=true)
    string.encode(Encoding::UTF_8).
    base = string.encode(Encoding::UTF_8).
    tr("\t\n\r\v\f",' '). # convert whitespace to space
    strip. # remove leading & trailing whitespace
    squeeze(' '). # remove duplicate whitespace
    gsub(NaturalSortUnknown, ''). # remove unknown characters
    gsub(NaturalSortUnknown, '') # remove unknown characters
    part1 = base.
    gsub(/\d+/) { |value| base128(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe)
    gsub(NaturalSortKnown, CaseInsensitiveIndex) # map characters

    if stable then
    part2 = case_sensitive_natural_sort_key(base.gsub(MatchNoChar, ''))
    part1 + part2
    else
    part1
    end
    end

    def case_sensitive_natural_sort_key(string)
    @@ -306,6 +316,9 @@ def test_natsortkey
    assert_equal(%w(Hagar Hugar Hägar), %w(Hugar Hägar Hagar).sort_by { |a| a })
    assert(case_sensitive_natural_sort_key("Hägar20").valid_encoding?)
    assert(case_insensitive_natural_sort_key("Hägar20").valid_encoding?)

    assert_equal(%w(Ä ä), %w(ä Ä).sort_by { |a| case_insensitive_natural_sort_key(a, true) })
    assert_equal(case_insensitive_natural_sort_key("Ä", false), case_insensitive_natural_sort_key("ä", false))
    end
    end
    end
  7. apeiros created this gist Feb 18, 2011.
    311 changes: 311 additions & 0 deletions transliteration.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,311 @@
    # Encoding: utf-8

    # A module to help with transliteration issues.
    # Provides methods for:
    # * Changing the case of characters/strings, mapping most latin characters
    # * Generate a value to perform natural sorting of strings (e.g. "10" > "2",
    # "ä" < "z" etc.)
    #
    # All methods output valid utf-8 strings
    #
    # Beware, this module does not perform proper unicode collation based operations
    module Transliteration
    module_function
    UpperToLower = Hash[%W[
    A a
    \u00c5 \u00e5
    \u00c4 \u00e4
    \u00c6 \u00e6
    \u00c2 \u00e2
    \u00c0 \u00e0
    \u00c3 \u00e3
    B b
    C c
    \u00c7 \u00e7
    D d
    E e
    \u00cb \u00eb
    \u00c9 \u00e9
    \u00ca \u00ea
    \u00c8 \u00e8
    F f
    G g
    H h
    I i
    \u00cf \u00ef
    \u00ce \u00ee
    \u00cc \u00ec
    J j
    K k
    L l
    M m
    N n
    \u00d1 \u00f1
    O o
    \u00d6 \u00f6
    \u00d4 \u00f4
    \u00d2 \u00f2
    \u00d5 \u00f5
    \u00d8 \u00f8
    P p
    Q q
    R r
    S s
    \u1e9e \u00df
    T t
    U u
    \u00dc \u00fc
    \u00db \u00fb
    \u00d9 \u00f9
    V v
    W w
    X x
    Y y
    \u0178 \u00ff
    Z z
    ].each_slice(2).to_a]

    LowerToUpper = Hash[%W[
    a A
    \u00e5 \u00c5
    \u00e4 \u00c4
    \u00e6 \u00c6
    \u00e2 \u00c2
    \u00e1 A
    \u00e0 \u00c0
    \u00e3 \u00c3
    b B
    c C
    \u00e7 \u00c7
    d D
    e E
    \u00eb \u00cb
    \u00e9 \u00c9
    \u00ea \u00ca
    \u00e8 \u00c8
    f F
    g G
    h H
    i I
    \u00ef \u00cf
    \u00ed I
    \u00ee \u00ce
    \u00ec \u00cc
    j J
    k K
    l L
    m M
    n N
    \u00f1 \u00d1
    o O
    \u00f6 \u00d6
    \u00f3 O
    \u00f4 \u00d4
    \u00f2 \u00d2
    \u00f5 \u00d5
    \u00f8 \u00d8
    p P
    q Q
    r R
    s S
    \u00df \u1e9e
    t T
    u U
    \u00fc \u00dc
    \u00fa U
    \u00fb \u00db
    \u00f9 \u00d9
    v V
    w W
    x X
    y Y
    \u00ff \u0178
    z Z
    ].each_slice(2).to_a]

    SwapCase = UpperToLower.merge(LowerToUpper)
    UpperCase = UpperToLower.keys.join('')
    LowerCase = LowerToUpper.keys.join('')
    MatchUpper = Regexp.new(/[#{Regexp.escape(UpperCase)}]/u)
    MatchLower = Regexp.new(/[#{Regexp.escape(LowerCase)}]/u)
    MatchChar = Regexp.new(/[#{Regexp.escape(SwapCase.keys.join(''))}]/u)

    mixed_case = UpperToLower.dup
    LowerToUpper.each do |key, value|
    mixed_case[value] += key unless UpperToLower[value] == key
    end
    mixed_case = mixed_case.map { |upper,lower| upper+lower }
    CaseSensitiveOrder = "!\"\#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~\u00a7\u00b2\u00bf#{UpperCase}#{LowerCase}"
    CaseInsensitiveOrder = "!\"\#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~\u00a7\u00b2\u00bf".chars.to_a+mixed_case
    Alphabet = (32...([CaseSensitiveOrder.length+32, (1<<15)+22528].max)).to_a.pack("U*")
    CaseSensitiveIndex = Hash[CaseSensitiveOrder.chars.with_index.to_a.map { |char, index| [char, Alphabet[index]] }]
    CaseInsensitiveIndex = {}
    CaseInsensitiveOrder.each_with_index do |equivalents, index|
    value = Alphabet[index]
    equivalents.each_char { |char| CaseInsensitiveIndex[char] = value }
    end
    NaturalSortUnknown = /[^#{Regexp.escape(CaseSensitiveOrder)}]/u
    NaturalSortKnown = /[#{Regexp.escape(CaseSensitiveOrder)}]/u

    Transliterate = Hash[%W[
    \u00c5 A A
    \u00c4 A Ae
    \u00c6 A AE
    \u00c2 A A
    \u00c0 A A
    \u00c3 A A
    \u00c7 C C
    \u00cb E E
    \u00c9 E E
    \u00ca E E
    \u00c8 E E
    \u00cf I I
    \u00ce I I
    \u00cc I I
    \u00d1 N N
    \u00d6 O OE
    \u00d4 O O
    \u00d2 O O
    \u00d5 O O
    \u00d8 O O
    \u1e9e S SS
    \u00dc U U
    \u00db U U
    \u00d9 U U
    \u0178 Y Y
    \u00e5 a a
    \u00e4 a ae
    \u00e6 a ae
    \u00e1 a a
    \u00e2 a a
    \u00e0 a a
    \u00e3 a a
    \u00e7 c c
    \u00eb e e
    \u00ea e e
    \u00e8 e e
    \u00ef i i
    \u00ed i i
    \u00ee i i
    \u00ec i i
    \u00f1 n n
    \u00f6 o o
    \u00f3 o o
    \u00f4 o o
    \u00f2 o o
    \u00f5 o o
    \u00f8 o o
    \u00df s ss
    \u00fc u u
    \u00fa u u
    \u00fb u u
    \u00f9 u u
    \u00ff y y
    ].each_slice(3).map { |char, map1, map2| [char, [map1, map2]] }]

    # 7bit ascii characters
    ("A".."Z").each do |char| Transliterate[char] = [char, char] end
    ("a".."z").each do |char| Transliterate[char] = [char, char] end
    ("0".."9").each do |char| Transliterate[char] = [char, char] end
    %W[! \" # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \\ ] ^ _ ` { | } ~].each do |char| Transliterate[char] = [char, char] end
    Transliterate[' '] = [' ', ' ']
    Transliterate1 = Hash[Transliterate.map { |key, (map1, map2)| [key, map1] }]
    Transliterate2 = Hash[Transliterate.map { |key, (map1, map2)| [key, map2] }]
    TransliterateMatch = Regexp.new("["+Regexp.escape(Transliterate.keys.join(''))+"]")

    # Maps non-7bit characters to a single 7 bit character, e.g. "ä" is mapped
    # to "a"
    #
    # Example:
    # Transliterate.transliterate "Größe" # => "Grose"
    #
    # Also see transliterate2, which tries to retain the meaning
    def transliterate(string, replace_unknown="", extend_transliteration=nil)
    if extend_transliteration then
    match = Regexp.union(TransliterateMatch, Regexp.new("["+Regexp.escape(extend_transliteration.keys.join(''))+"]"))
    replace = Transliterate1.merge(extend_transliteration)
    else
    match = TransliterateMatch
    replace = Transliterate1
    end

    string.encode(Encoding::UTF_8).gsub(match, replace)
    end

    # Maps non-7bit characters to a sequence of 7 bit characters which convey
    # the same meaning, e.g. "ä" is mapped to "ae".
    #
    # Example:
    # Transliterate.transliterate2 "Größe" # => "Groesse"
    #
    # Also see transliterate, which maps single characters to single characters
    def transliterate2(string, replace_unknown="", extend_transliteration=nil)
    if extend_transliteration then
    match = Regexp.union(TransliterateMatch, Regexp.new("["+Regexp.escape(extend_transliteration.keys.join(''))+"]"))
    replace = Transliterate2.merge(extend_transliteration)
    else
    match = TransliterateMatch
    replace = Transliterate2
    end

    string.encode(Encoding::UTF_8).gsub(match, replace)
    end

    def downcase(string)
    string.encode(Encoding::UTF_8).downcase.gsub(MatchUpper, UpperToLower)
    end

    def upcase(string)
    string.encode(Encoding::UTF_8).upcase.gsub(MatchLower, LowerToUpper)
    end

    def swapcase(string)
    string.encode(Encoding::UTF_8).downcase.gsub(MatchChar, SwapCase)
    end

    # stable makes that differently cased letters don't
    def case_insensitive_natural_sort_key(string, stable=true)
    string.encode(Encoding::UTF_8).
    tr("\t\n\r\v\f",' '). # convert whitespace to space
    strip. # remove leading & trailing whitespace
    squeeze(' '). # remove duplicate whitespace
    gsub(NaturalSortUnknown, ''). # remove unknown characters
    gsub(/\d+/) { |value| base128(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe)
    gsub(NaturalSortKnown, CaseInsensitiveIndex) # map characters
    end

    def case_sensitive_natural_sort_key(string)
    string.encode(Encoding::UTF_8).
    tr("\t\n\r\v\f",' '). # convert whitespace to space
    strip. # remove leading & trailing whitespace
    squeeze(' '). # remove duplicate whitespace
    gsub(NaturalSortUnknown, ''). # remove unknown characters
    gsub(/\d+/) { |value| base128(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe)
    gsub(NaturalSortKnown, CaseSensitiveIndex) # map characters
    end

    def base128(value)
    base128 = []
    begin
    value, digit = value.divmod(128)
    base128 << digit
    end until value.zero?
    base128.reverse.pack("xU*").encode(Encoding::UTF_8) # must be big-endian
    end
    end
    include Transliteration

    if __FILE__ == $0 then
    include Transliteration
    require 'test/unit'
    class TestNatcmp < Test::Unit::TestCase
    def test_natsortkey
    assert_equal(%w(hello3.jpg Hello5.jpg hello10.jpg), %w(hello3.jpg hello10.jpg Hello5.jpg).sort_by { |a| case_insensitive_natural_sort_key(a) })
    assert_equal(%w(Hello5.jpg hello3.jpg hello10.jpg), %w(hello3.jpg hello10.jpg Hello5.jpg).sort_by { |a| case_sensitive_natural_sort_key(a) })
    assert_equal(%w(Hagar Hägar Hugar), %w(Hugar Hägar Hagar).sort_by { |a| case_sensitive_natural_sort_key(a) })
    assert_equal(%w(Hagar Hugar Hägar), %w(Hugar Hägar Hagar).sort_by { |a| a })
    assert(case_sensitive_natural_sort_key("Hägar20").valid_encoding?)
    assert(case_insensitive_natural_sort_key("Hägar20").valid_encoding?)
    end
    end
    end