Created
February 18, 2011 19:13
Revisions
-
apeiros revised this gist
Nov 12, 2011 . 1 changed file with 48 additions and 63 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -16,6 +16,8 @@ # # Beware, this module does not perform proper unicode collation based operations module Transliterate BiggestSortableNumber = ((128**128)-1) SingleDigitToNaturalKey = %W[\u000a \u000b \u000c \u000d \u000e \u000f \u0010 \u0011 \u0012 \u0013 \u0014] module_function UpperToLower = Hash[%W[ A a @@ -75,64 +77,7 @@ module Transliterate Z z ].each_slice(2).to_a] LowerToUpper = UpperToLower.invert SwapCase = UpperToLower.merge(LowerToUpper) UpperCase = UpperToLower.keys.join('') LowerCase = LowerToUpper.keys.join('') @@ -155,7 +100,7 @@ module Transliterate value = Alphabet[index] equivalents.each_char { |char| CaseInsensitiveIndex[char] = value } end NaturalSortUnknown = /[\x00-\x19]|[^#{Regexp.escape(CaseSensitiveOrder)}]/u NaturalSortKnown = /[#{Regexp.escape(CaseSensitiveOrder)}]/u Transliterate = Hash[%W[ @@ -282,7 +227,7 @@ def case_insensitive_natural_sort_key(string, stable=true) squeeze(' '). # remove duplicate whitespace gsub(NaturalSortUnknown, '') # remove unknown characters part1 = base. gsub(/[+-]?\d+/) { |value| digits_to_natural_key(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe) gsub(NaturalSortKnown, CaseInsensitiveIndex) # map characters if stable then @@ -299,17 +244,57 @@ def case_sensitive_natural_sort_key(string) strip. # remove leading & trailing whitespace squeeze(' '). # remove duplicate whitespace gsub(NaturalSortUnknown, ''). # remove unknown characters gsub(/[+-]?\d+/) { |value| digits_to_natural_key(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe) gsub(NaturalSortKnown, CaseSensitiveIndex) # map characters end # performs a binary encoding whose result can be compared on a binary level correctly. # To do that it uses a runtime length encoding of the number plus a base128 encoding. # It works with numbers up to 128 digits long (in base 128), that means the biggest # number correctly processed is 270 digits long in decimal: # 5282945311356652463523397849165166065188473260361215221279607090266739025567248594… # 7441725588765718789467439499325712867888234755950268553725053897846293957690838668399… # 9005084168731517676426441053024232908211188404148028292751561738838396898767036476489… # 538580897737998335 - a pretty friggin huge number. Larger numbers are truncated to # that. # The scheme used will never use more bytes than the ascii representation of the number # used. E.g. 0-9 will still use 1 byte only. Larger numbers even use less space, e.g. # 999_999_999_999 (12 bytes in decimal in ascii) can be represented by 8 bytes. # Theoretically the implementation could be improved to support numbers as big as # 128^(128^8). def digits_to_natural_key(value) # * 1-8 are for runtime length encoded negative numbers # * 9 is for 1 digit (in base 128) negative numbers # * 10-29 for single digits # * 21 for 1 digit (in base 128) positive numbers # * 22-29 for runtime length encoded positive numbers if value.between?(0,9) then SingleDigitToNaturalKey[value] else sign = value < 0 value = value.abs value = BiggestSortableNumber if value > BiggestSortableNumber converted = base128(value) if converted.size == 1 then converted << (sign ? 10 : 21) else converted << converted.size converted << (sign ? 9 : 22) # simplified version # converted << (sign ? 10-converted.size : 21+converted.size) # this version would enable very large numbers end converted.reverse.pack("U*").encode(Encoding::UTF_8) # must be big-endian end end def base128(value) base128 = [] begin value, digit = value.divmod(128) base128 << digit end until value.zero? base128 end end @@ -318,7 +303,7 @@ def base128(value) require 'test/unit' class TestNatcmp < Test::Unit::TestCase def test_natsortkey assert_equal(%w(hello3.jpg Hello5.jpg hello10.jpg Hello329.jpg hello292349282912405965338291184.jpg), %w(hello292349282912405965338291184.jpg hello3.jpg Hello329.jpg hello10.jpg Hello5.jpg).sort_by { |a| case_insensitive_natural_sort_key(a) }) assert_equal(%w(Hello5.jpg hello3.jpg hello10.jpg), %w(hello3.jpg hello10.jpg Hello5.jpg).sort_by { |a| case_sensitive_natural_sort_key(a) }) assert_equal(%w(Hagar Hägar Hugar), %w(Hugar Hägar Hagar).sort_by { |a| case_sensitive_natural_sort_key(a) }) assert_equal(%w(Hagar Hugar Hägar), %w(Hugar Hägar Hagar).sort_by { |a| a }) -
apeiros revised this gist
Nov 12, 2011 . 1 changed file with 5 additions and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,5 +1,10 @@ # Encoding: utf-8 # The following code is under BSD 2-clause license # -> http://en.wikipedia.org/wiki/BSD_licenses#2-clause_license_.28.22Simplified_BSD_License.22_or_.22FreeBSD_License.22.29 # # Author: Stefan Rusterholz <stefan.rusterholz@gmail.com> - https://github.com/apeiros/ # # A module to help with transliteration issues. # Provides methods for: # * Changing the case of characters/strings, mapping most latin characters -
apeiros revised this gist
Nov 12, 2011 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -309,7 +309,7 @@ def base128(value) end if __FILE__ == $0 then include Transliterate require 'test/unit' class TestNatcmp < Test::Unit::TestCase def test_natsortkey -
apeiros revised this gist
Nov 12, 2011 . 1 changed file with 18 additions and 15 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -10,14 +10,15 @@ # All methods output valid utf-8 strings # # Beware, this module does not perform proper unicode collation based operations module Transliterate module_function UpperToLower = Hash[%W[ A a \u00c5 \u00e5 \u00c4 \u00e4 \u00c6 \u00e6 \u00c2 \u00e2 \u00c1 \u00e1 \u00c0 \u00e0 \u00c3 \u00e3 B b @@ -34,6 +35,7 @@ module Transliteration H h I i \u00cf \u00ef \u00cd \u00ed \u00ce \u00ee \u00cc \u00ec J j @@ -44,6 +46,7 @@ module Transliteration \u00d1 \u00f1 O o \u00d6 \u00f6 \u00d3 \u00f3 \u00d4 \u00f4 \u00d2 \u00f2 \u00d5 \u00f5 @@ -56,6 +59,7 @@ module Transliteration T t U u \u00dc \u00fc \u00da \u00fa \u00db \u00fb \u00d9 \u00f9 V v @@ -72,7 +76,7 @@ module Transliteration \u00e4 \u00c4 \u00e6 \u00c6 \u00e2 \u00c2 \u00e1 \u00c1 \u00e0 \u00c0 \u00e3 \u00c3 b B @@ -89,7 +93,7 @@ module Transliteration h H i I \u00ef \u00cf \u00ed \u00cd \u00ee \u00ce \u00ec \u00cc j J @@ -100,7 +104,7 @@ module Transliteration \u00f1 \u00d1 o O \u00f6 \u00d6 \u00f3 \u00d3 \u00f4 \u00d4 \u00f2 \u00d2 \u00f5 \u00d5 @@ -113,7 +117,7 @@ module Transliteration t T u U \u00fc \u00dc \u00fa \u00da \u00fb \u00db \u00f9 \u00d9 v V @@ -127,10 +131,10 @@ module Transliteration SwapCase = UpperToLower.merge(LowerToUpper) UpperCase = UpperToLower.keys.join('') LowerCase = LowerToUpper.keys.join('') MatchUpper = /[#{Regexp.escape(UpperCase)}]/u MatchLower = /[#{Regexp.escape(LowerCase)}]/u MatchChar = /[#{Regexp.escape(SwapCase.keys.join(''))}]/u MatchNoChar = /[^#{Regexp.escape(SwapCase.keys.join(''))}]/u mixed_case = UpperToLower.dup LowerToUpper.each do |key, value| @@ -165,7 +169,7 @@ module Transliteration \u00ce I I \u00cc I I \u00d1 N N \u00d6 O Oe \u00d4 O O \u00d2 O O \u00d5 O O @@ -191,7 +195,7 @@ module Transliteration \u00ee i i \u00ec i i \u00f1 n n \u00f6 o oe \u00f3 o o \u00f4 o o \u00f2 o o @@ -213,7 +217,7 @@ module Transliteration Transliterate[' '] = [' ', ' '] Transliterate1 = Hash[Transliterate.map { |key, (map1, map2)| [key, map1] }] Transliterate2 = Hash[Transliterate.map { |key, (map1, map2)| [key, map2] }] TransliterateMatch = /[#{Regexp.escape(Transliterate.keys.join(''))}]/u # Maps non-7bit characters to a single 7 bit character, e.g. "ä" is mapped # to "a" @@ -224,7 +228,7 @@ module Transliteration # Also see transliterate2, which tries to retain the meaning def transliterate(string, replace_unknown="", extend_transliteration=nil) if extend_transliteration then match = Regexp.union(TransliterateMatch, /[#{Regexp.escape(extend_transliteration.keys.join(''))}]/u) replace = Transliterate1.merge(extend_transliteration) else match = TransliterateMatch @@ -243,7 +247,7 @@ def transliterate(string, replace_unknown="", extend_transliteration=nil) # Also see transliterate, which maps single characters to single characters def transliterate2(string, replace_unknown="", extend_transliteration=nil) if extend_transliteration then match = Regexp.union(TransliterateMatch, /[#{Regexp.escape(extend_transliteration.keys.join(''))}]/u) replace = Transliterate2.merge(extend_transliteration) else match = TransliterateMatch @@ -303,7 +307,6 @@ def base128(value) base128.reverse.pack("xU*").encode(Encoding::UTF_8) # must be big-endian end end if __FILE__ == $0 then include Transliteration -
apeiros revised this gist
Feb 18, 2011 . 1 changed file with 3 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -254,15 +254,15 @@ def transliterate2(string, replace_unknown="", extend_transliteration=nil) end def downcase(string) string.encode(Encoding::UTF_8).gsub(MatchUpper, UpperToLower) end def upcase(string) string.encode(Encoding::UTF_8).gsub(MatchLower, LowerToUpper) end def swapcase(string) string.encode(Encoding::UTF_8).gsub(MatchChar, SwapCase) end # stable makes that differently cased letters don't -
apeiros revised this gist
Feb 18, 2011 . 1 changed file with 15 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -5,6 +5,7 @@ # * Changing the case of characters/strings, mapping most latin characters # * Generate a value to perform natural sorting of strings (e.g. "10" > "2", # "ä" < "z" etc.) # * Transliterate a utf-8 string to 7bit chars only (e.g. map "ä" to "a" or "ae") # # All methods output valid utf-8 strings # @@ -129,6 +130,7 @@ module Transliteration MatchUpper = Regexp.new(/[#{Regexp.escape(UpperCase)}]/u) MatchLower = Regexp.new(/[#{Regexp.escape(LowerCase)}]/u) MatchChar = Regexp.new(/[#{Regexp.escape(SwapCase.keys.join(''))}]/u) MatchNoChar = Regexp.new(/[^#{Regexp.escape(SwapCase.keys.join(''))}]/u) mixed_case = UpperToLower.dup LowerToUpper.each do |key, value| @@ -265,13 +267,21 @@ def swapcase(string) # stable makes that differently cased letters don't def case_insensitive_natural_sort_key(string, stable=true) base = string.encode(Encoding::UTF_8). tr("\t\n\r\v\f",' '). # convert whitespace to space strip. # remove leading & trailing whitespace squeeze(' '). # remove duplicate whitespace gsub(NaturalSortUnknown, '') # remove unknown characters part1 = base. gsub(/\d+/) { |value| base128(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe) gsub(NaturalSortKnown, CaseInsensitiveIndex) # map characters if stable then part2 = case_sensitive_natural_sort_key(base.gsub(MatchNoChar, '')) part1 + part2 else part1 end end def case_sensitive_natural_sort_key(string) @@ -306,6 +316,9 @@ def test_natsortkey assert_equal(%w(Hagar Hugar Hägar), %w(Hugar Hägar Hagar).sort_by { |a| a }) assert(case_sensitive_natural_sort_key("Hägar20").valid_encoding?) assert(case_insensitive_natural_sort_key("Hägar20").valid_encoding?) assert_equal(%w(Ä ä), %w(ä Ä).sort_by { |a| case_insensitive_natural_sort_key(a, true) }) assert_equal(case_insensitive_natural_sort_key("Ä", false), case_insensitive_natural_sort_key("ä", false)) end end end -
apeiros created this gist
Feb 18, 2011 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,311 @@ # Encoding: utf-8 # A module to help with transliteration issues. # Provides methods for: # * Changing the case of characters/strings, mapping most latin characters # * Generate a value to perform natural sorting of strings (e.g. "10" > "2", # "ä" < "z" etc.) # # All methods output valid utf-8 strings # # Beware, this module does not perform proper unicode collation based operations module Transliteration module_function UpperToLower = Hash[%W[ A a \u00c5 \u00e5 \u00c4 \u00e4 \u00c6 \u00e6 \u00c2 \u00e2 \u00c0 \u00e0 \u00c3 \u00e3 B b C c \u00c7 \u00e7 D d E e \u00cb \u00eb \u00c9 \u00e9 \u00ca \u00ea \u00c8 \u00e8 F f G g H h I i \u00cf \u00ef \u00ce \u00ee \u00cc \u00ec J j K k L l M m N n \u00d1 \u00f1 O o \u00d6 \u00f6 \u00d4 \u00f4 \u00d2 \u00f2 \u00d5 \u00f5 \u00d8 \u00f8 P p Q q R r S s \u1e9e \u00df T t U u \u00dc \u00fc \u00db \u00fb \u00d9 \u00f9 V v W w X x Y y \u0178 \u00ff Z z ].each_slice(2).to_a] LowerToUpper = Hash[%W[ a A \u00e5 \u00c5 \u00e4 \u00c4 \u00e6 \u00c6 \u00e2 \u00c2 \u00e1 A \u00e0 \u00c0 \u00e3 \u00c3 b B c C \u00e7 \u00c7 d D e E \u00eb \u00cb \u00e9 \u00c9 \u00ea \u00ca \u00e8 \u00c8 f F g G h H i I \u00ef \u00cf \u00ed I \u00ee \u00ce \u00ec \u00cc j J k K l L m M n N \u00f1 \u00d1 o O \u00f6 \u00d6 \u00f3 O \u00f4 \u00d4 \u00f2 \u00d2 \u00f5 \u00d5 \u00f8 \u00d8 p P q Q r R s S \u00df \u1e9e t T u U \u00fc \u00dc \u00fa U \u00fb \u00db \u00f9 \u00d9 v V w W x X y Y \u00ff \u0178 z Z ].each_slice(2).to_a] SwapCase = UpperToLower.merge(LowerToUpper) UpperCase = UpperToLower.keys.join('') LowerCase = LowerToUpper.keys.join('') MatchUpper = Regexp.new(/[#{Regexp.escape(UpperCase)}]/u) MatchLower = Regexp.new(/[#{Regexp.escape(LowerCase)}]/u) MatchChar = Regexp.new(/[#{Regexp.escape(SwapCase.keys.join(''))}]/u) mixed_case = UpperToLower.dup LowerToUpper.each do |key, value| mixed_case[value] += key unless UpperToLower[value] == key end mixed_case = mixed_case.map { |upper,lower| upper+lower } CaseSensitiveOrder = "!\"\#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~\u00a7\u00b2\u00bf#{UpperCase}#{LowerCase}" CaseInsensitiveOrder = "!\"\#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~\u00a7\u00b2\u00bf".chars.to_a+mixed_case Alphabet = (32...([CaseSensitiveOrder.length+32, (1<<15)+22528].max)).to_a.pack("U*") CaseSensitiveIndex = Hash[CaseSensitiveOrder.chars.with_index.to_a.map { |char, index| [char, Alphabet[index]] }] CaseInsensitiveIndex = {} CaseInsensitiveOrder.each_with_index do |equivalents, index| value = Alphabet[index] equivalents.each_char { |char| CaseInsensitiveIndex[char] = value } end NaturalSortUnknown = /[^#{Regexp.escape(CaseSensitiveOrder)}]/u NaturalSortKnown = /[#{Regexp.escape(CaseSensitiveOrder)}]/u Transliterate = Hash[%W[ \u00c5 A A \u00c4 A Ae \u00c6 A AE \u00c2 A A \u00c0 A A \u00c3 A A \u00c7 C C \u00cb E E \u00c9 E E \u00ca E E \u00c8 E E \u00cf I I \u00ce I I \u00cc I I \u00d1 N N \u00d6 O OE \u00d4 O O \u00d2 O O \u00d5 O O \u00d8 O O \u1e9e S SS \u00dc U U \u00db U U \u00d9 U U \u0178 Y Y \u00e5 a a \u00e4 a ae \u00e6 a ae \u00e1 a a \u00e2 a a \u00e0 a a \u00e3 a a \u00e7 c c \u00eb e e \u00ea e e \u00e8 e e \u00ef i i \u00ed i i \u00ee i i \u00ec i i \u00f1 n n \u00f6 o o \u00f3 o o \u00f4 o o \u00f2 o o \u00f5 o o \u00f8 o o \u00df s ss \u00fc u u \u00fa u u \u00fb u u \u00f9 u u \u00ff y y ].each_slice(3).map { |char, map1, map2| [char, [map1, map2]] }] # 7bit ascii characters ("A".."Z").each do |char| Transliterate[char] = [char, char] end ("a".."z").each do |char| Transliterate[char] = [char, char] end ("0".."9").each do |char| Transliterate[char] = [char, char] end %W[! \" # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \\ ] ^ _ ` { | } ~].each do |char| Transliterate[char] = [char, char] end Transliterate[' '] = [' ', ' '] Transliterate1 = Hash[Transliterate.map { |key, (map1, map2)| [key, map1] }] Transliterate2 = Hash[Transliterate.map { |key, (map1, map2)| [key, map2] }] TransliterateMatch = Regexp.new("["+Regexp.escape(Transliterate.keys.join(''))+"]") # Maps non-7bit characters to a single 7 bit character, e.g. "ä" is mapped # to "a" # # Example: # Transliterate.transliterate "Größe" # => "Grose" # # Also see transliterate2, which tries to retain the meaning def transliterate(string, replace_unknown="", extend_transliteration=nil) if extend_transliteration then match = Regexp.union(TransliterateMatch, Regexp.new("["+Regexp.escape(extend_transliteration.keys.join(''))+"]")) replace = Transliterate1.merge(extend_transliteration) else match = TransliterateMatch replace = Transliterate1 end string.encode(Encoding::UTF_8).gsub(match, replace) end # Maps non-7bit characters to a sequence of 7 bit characters which convey # the same meaning, e.g. "ä" is mapped to "ae". # # Example: # Transliterate.transliterate2 "Größe" # => "Groesse" # # Also see transliterate, which maps single characters to single characters def transliterate2(string, replace_unknown="", extend_transliteration=nil) if extend_transliteration then match = Regexp.union(TransliterateMatch, Regexp.new("["+Regexp.escape(extend_transliteration.keys.join(''))+"]")) replace = Transliterate2.merge(extend_transliteration) else match = TransliterateMatch replace = Transliterate2 end string.encode(Encoding::UTF_8).gsub(match, replace) end def downcase(string) string.encode(Encoding::UTF_8).downcase.gsub(MatchUpper, UpperToLower) end def upcase(string) string.encode(Encoding::UTF_8).upcase.gsub(MatchLower, LowerToUpper) end def swapcase(string) string.encode(Encoding::UTF_8).downcase.gsub(MatchChar, SwapCase) end # stable makes that differently cased letters don't def case_insensitive_natural_sort_key(string, stable=true) string.encode(Encoding::UTF_8). tr("\t\n\r\v\f",' '). # convert whitespace to space strip. # remove leading & trailing whitespace squeeze(' '). # remove duplicate whitespace gsub(NaturalSortUnknown, ''). # remove unknown characters gsub(/\d+/) { |value| base128(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe) gsub(NaturalSortKnown, CaseInsensitiveIndex) # map characters end def case_sensitive_natural_sort_key(string) string.encode(Encoding::UTF_8). tr("\t\n\r\v\f",' '). # convert whitespace to space strip. # remove leading & trailing whitespace squeeze(' '). # remove duplicate whitespace gsub(NaturalSortUnknown, ''). # remove unknown characters gsub(/\d+/) { |value| base128(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe) gsub(NaturalSortKnown, CaseSensitiveIndex) # map characters end def base128(value) base128 = [] begin value, digit = value.divmod(128) base128 << digit end until value.zero? base128.reverse.pack("xU*").encode(Encoding::UTF_8) # must be big-endian end end include Transliteration if __FILE__ == $0 then include Transliteration require 'test/unit' class TestNatcmp < Test::Unit::TestCase def test_natsortkey assert_equal(%w(hello3.jpg Hello5.jpg hello10.jpg), %w(hello3.jpg hello10.jpg Hello5.jpg).sort_by { |a| case_insensitive_natural_sort_key(a) }) assert_equal(%w(Hello5.jpg hello3.jpg hello10.jpg), %w(hello3.jpg hello10.jpg Hello5.jpg).sort_by { |a| case_sensitive_natural_sort_key(a) }) assert_equal(%w(Hagar Hägar Hugar), %w(Hugar Hägar Hagar).sort_by { |a| case_sensitive_natural_sort_key(a) }) assert_equal(%w(Hagar Hugar Hägar), %w(Hugar Hägar Hagar).sort_by { |a| a }) assert(case_sensitive_natural_sort_key("Hägar20").valid_encoding?) assert(case_insensitive_natural_sort_key("Hägar20").valid_encoding?) end end end