Revisions
-
hinrik revised this gist
Jun 18, 2012 . 2 changed files with 57 additions and 69 deletions. There are no files selected for viewing
-- [gist page banner] This file contains hidden or bidirectional Unicode text that may be
-- interpreted or compiled differently than what appears below. To review, open the file in an
-- editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
-- Original file line number | Diff line number | Diff line change
-- @@ -1,69 +0,0 @@
-- [gist page banner] This file contains hidden or bidirectional Unicode text that may be
-- interpreted or compiled differently than what appears below. To review, open the file in an
-- editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
-- Original file line number | Diff line number | Diff line change
-- @@ -0,0 +1,57 @@

-- Module returning a table of LPeg patterns, one per character class
-- (alnum, alpha, cntrl, digit, graph, lower, print, punct, space, upper,
-- xdigit). Each pattern consumes exactly one UTF-8 encoded character and
-- succeeds only if that character matches the corresponding \p{...} regex.
local lpeg = require 'lpeg'
local U = require 'icu.ustring'
local re = require 'icu.regex'

-- utf8_codepoint matches one UTF-8 sequence (1 to 4 bytes) and produces the
-- decoded code point as a number capture.
-- NOTE(review): only the lead-byte range and the number of continuation bytes
-- are checked, so overlong forms (e.g. "\224\128\128") and encoded surrogates
-- are accepted and decoded as-is.
local utf8_codepoint
do
    -- decode a two-byte UTF-8 sequence
    -- (12416 = 0xC0 * 64 + 0x80: removes the marker bits of both bytes)
    local function f2 (s)
        local c1, c2 = string.byte(s, 1, 2)
        return c1 * 64 + c2 - 12416
    end

    -- decode a three-byte UTF-8 sequence
    -- (925824 = (0xE0 * 64 + 0x80) * 64 + 0x80)
    local function f3 (s)
        local c1, c2, c3 = string.byte(s, 1, 3)
        return (c1 * 64 + c2) * 64 + c3 - 925824
    end

    -- decode a four-byte UTF-8 sequence
    -- (63447168 = ((0xF0 * 64 + 0x80) * 64 + 0x80) * 64 + 0x80)
    local function f4 (s)
        local c1, c2, c3, c4 = string.byte(s, 1, 4)
        return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
    end

    local cont = lpeg.R("\128\191") -- continuation byte

    utf8_codepoint = lpeg.R("\0\127") / string.byte
        + lpeg.R("\194\223") * cont / f2
        + lpeg.R("\224\239") * cont * cont / f3
        + lpeg.R("\240\244") * cont * cont * cont / f4
end

-- Names of the exported classes; each one is also the property name used
-- inside \p{...}. Generating every entry from this list keeps the table key
-- and the regex in sync: xdigit previously tested against the digit regex,
-- which rejected the hexadecimal letters.
local class_names = {
    'alnum', 'alpha', 'cntrl', 'digit', 'graph', 'lower',
    'print', 'punct', 'space', 'upper', 'xdigit',
}

local locale = {}
for _, name in ipairs(class_names) do
    -- Anchored so the regex must match the whole one-character string.
    local class_re = re.compile('^\\p{' .. name .. '}$')

    locale[name] = lpeg.Cmt ( utf8_codepoint , function (s, i, c)
        -- Cmt treats a returned number as a new subject position, so the
        -- result of re.match is coerced to a strict boolean: true succeeds
        -- at the position reached by utf8_codepoint, false fails the match.
        return not not re.match(class_re, U.char(c))
    end )
end

return locale
-- - (separator before the next entry of the gist revision list)
daurnimator revised this gist
Jun 17, 2012 . 1 changed file with 40 additions and 32 deletions. There are no files selected for viewing
-- [gist page banner] This file contains hidden or bidirectional Unicode text that may be
-- interpreted or compiled differently than what appears below. To review, open the file in an
-- editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
-- Original file line number | Diff line number | Diff line change
-- @@ -1,29 +1,32 @@
local lpeg = require "lpeg"

-- utf8_codepoint: matches one UTF-8 sequence of 1 to 4 bytes and captures the
-- decoded code point as a number.
local utf8_codepoint
do
    local byte = string.byte

    -- Two-byte form: lead byte 0xC2..0xDF followed by one continuation byte.
    local function decode2 ( seq )
        local lead , t1 = byte ( seq , 1 , 2 )
        return ( lead - 192 ) * 64 + ( t1 - 128 )
    end

    -- Three-byte form: lead byte 0xE0..0xEF followed by two continuation bytes.
    local function decode3 ( seq )
        local lead , t1 , t2 = byte ( seq , 1 , 3 )
        return ( lead - 224 ) * 4096 + ( t1 - 128 ) * 64 + ( t2 - 128 )
    end

    -- Four-byte form: lead byte 0xF0..0xF4 followed by three continuation bytes.
    local function decode4 ( seq )
        local lead , t1 , t2 , t3 = byte ( seq , 1 , 4 )
        return ( lead - 240 ) * 262144 + ( t1 - 128 ) * 4096 + ( t2 - 128 ) * 64 + ( t3 - 128 )
    end

    local tail = lpeg.R("\128\191") -- any continuation byte

    local one_byte   = lpeg.R("\0\127") / string.byte
    local two_byte   = lpeg.R("\194\223") * tail / decode2
    local three_byte = lpeg.R("\224\239") * tail * tail / decode3
    local four_byte  = lpeg.R("\240\244") * tail * tail * tail / decode4

    utf8_codepoint = one_byte + two_byte + three_byte + four_byte
end

local ffi = require "ffi"

-- @@ -46,16 +49,21 @@ UBool u_isprint_49(UChar32 c);
-- (the lines between the two diff hunks are not shown on the page)

local ICU = ffi.load ( "icuuc" )

-- One pattern per character class: consume a code point with utf8_codepoint,
-- then let the matching ICU predicate decide whether the match succeeds.
-- The "_49" suffix is presumably ICU's versioned symbol naming — verify
-- against the installed library.
local utf8_locale = { }
for _ , class in ipairs { "alnum" , "alpha" , "cntrl" , "digit" , "graph" , "lower" ,
                          "print" , "punct" , "space" , "upper" , "xdigit" } do
    local symbol = "u_is" .. class .. "_49"
    -- The symbol is looked up on each call, not when the table is built.
    utf8_locale [ class ] = lpeg.Cmt ( utf8_codepoint , function ( subject , pos , codepoint )
        return ICU [ symbol ] ( codepoint )
    end )
end

return {
    utf8_codepoint = utf8_codepoint ;
    utf8_locale = utf8_locale ;
}
-- - (separator before the next entry of the gist revision list)
daurnimator revised this gist
Jun 17, 2012 . 1 changed file with 12 additions and 19 deletions. There are no files selected for viewing
-- [gist page banner] This file contains hidden or bidirectional Unicode text that may be
-- interpreted or compiled differently than what appears below. To review, open the file in an
-- editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
-- Original file line number | Diff line number | Diff line change
-- @@ -28,41 +28,34 @@ local utf8 = lpeg.R("\0\127") / string.byte
-- (this hunk starts at line 28; `utf8` is defined above it, as the hunk
-- header shows, and `lpeg` is presumably required there too)

local ffi = require "ffi"

-- C declarations for the ICU predicates used below. UBool is declared as
-- `bool` here, so the FFI hands the results back as Lua booleans.
ffi.cdef [[
typedef bool UBool;
typedef int32_t UChar32;
UBool u_islower_49(UChar32 c);
UBool u_isupper_49(UChar32 c);
UBool u_isdigit_49(UChar32 c);
UBool u_isalpha_49(UChar32 c);
UBool u_isalnum_49(UChar32 c);
UBool u_isxdigit_49(UChar32 c);
UBool u_ispunct_49(UChar32 c);
UBool u_isgraph_49(UChar32 c);
UBool u_isspace_49(UChar32 c);
UBool u_iscntrl_49(UChar32 c);
UBool u_isprint_49(UChar32 c);
]]

local ICU = ffi.load ( "icuuc" )

-- One pattern per character class: match a code point with `utf8`, then ask
-- the corresponding ICU predicate whether the match should succeed.
local unicode_locale = { }
for _ , class in ipairs { "alnum" , "alpha" , "cntrl" , "digit" , "graph" , "lower" ,
                          "print" , "punct" , "space" , "upper" , "xdigit" } do
    local symbol = "u_is" .. class .. "_49"
    -- The symbol is looked up on each call, not when the table is built.
    unicode_locale [ class ] = lpeg.Cmt ( utf8 , function ( subject , pos , codepoint )
        return ICU [ symbol ] ( codepoint )
    end )
end
-- - (separator before the next entry of the gist revision list)
daurnimator created this gist
Jun 17, 2012 . There are no files selected for viewing
-- [gist page banner] This file contains hidden or bidirectional Unicode text that may be
-- interpreted or compiled differently than what appears below. To review, open the file in an
-- editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
-- Original file line number | Diff line number | Diff line change
-- @@ -0,0 +1,68 @@

-- LPeg patterns that match a single UTF-8 encoded character of a given
-- character class, classified by ICU through the LuaJIT FFI.
local lpeg = require "lpeg"

-- decode a two-byte UTF-8 sequence
-- (12416 = 0xC0 * 64 + 0x80: removes the marker bits of both bytes)
local function f2 (s)
    local c1, c2 = string.byte(s, 1, 2)
    return c1 * 64 + c2 - 12416
end

-- decode a three-byte UTF-8 sequence
-- (925824 = (0xE0 * 64 + 0x80) * 64 + 0x80)
local function f3 (s)
    local c1, c2, c3 = string.byte(s, 1, 3)
    return (c1 * 64 + c2) * 64 + c3 - 925824
end

-- decode a four-byte UTF-8 sequence
-- (63447168 = ((0xF0 * 64 + 0x80) * 64 + 0x80) * 64 + 0x80)
local function f4 (s)
    local c1, c2, c3, c4 = string.byte(s, 1, 4)
    return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
end

local cont = lpeg.R("\128\191") -- continuation byte

-- Matches one UTF-8 sequence (1 to 4 bytes) and captures its code point.
local utf8 = lpeg.R("\0\127") / string.byte
    + lpeg.R("\194\223") * cont / f2
    + lpeg.R("\224\239") * cont * cont / f3
    + lpeg.R("\240\244") * cont * cont * cont / f4

local ffi = require "ffi"

-- Declarations for the ICU character predicates. Only a subset is used by
-- the table below; the rest are declared but not called in this file.
-- The "_49" suffix is presumably ICU's versioned symbol naming — verify
-- against the installed library.
ffi.cdef [[
typedef int8_t UBool;
typedef int32_t UChar32;
UBool u_islower_49(UChar32 c);
UBool u_isupper_49(UChar32 c);
UBool u_istitle_49(UChar32 c);
UBool u_isdigit_49(UChar32 c);
UBool u_isalpha_49(UChar32 c);
UBool u_isalnum_49(UChar32 c);
UBool u_isxdigit_49(UChar32 c);
UBool u_ispunct_49(UChar32 c);
UBool u_isgraph_49(UChar32 c);
UBool u_isblank_49(UChar32 c);
UBool u_isdefined_49(UChar32 c);
UBool u_isspace_49(UChar32 c);
UBool u_isJavaSpaceChar_49(UChar32 c);
UBool u_isWhitespace_49(UChar32 c);
UBool u_iscntrl_49(UChar32 c);
UBool u_isISOControl_49(UChar32 c);
UBool u_isprint_49(UChar32 c);
UBool u_isbase_49(UChar32 c);
]]

local ICU = ffi.load ( "icuuc" )

-- One pattern per character class. UBool is declared as int8_t, so each call
-- yields a number; the "~=0" turns it into the boolean that lpeg.Cmt needs
-- (a returned number would be taken as a new subject position).
local unicode_locale = {
    alnum = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalnum_49(c)~=0 end ) ;
    alpha = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalpha_49(c)~=0 end ) ;
    cntrl = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_iscntrl_49(c)~=0 end ) ;
    digit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isdigit_49(c)~=0 end ) ;
    graph = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isgraph_49(c)~=0 end ) ;
    lower = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_islower_49(c)~=0 end ) ;
    print = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isprint_49(c)~=0 end ) ;
    punct = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_ispunct_49(c)~=0 end ) ;
    space = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isspace_49(c)~=0 end ) ;
    upper = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isupper_49(c)~=0 end ) ;
    -- Must use the declared name u_isxdigit_49; a misspelled symbol fails
    -- at lookup time as soon as the pattern is exercised.
    xdigit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isxdigit_49(c)~=0 end ) ;
}