Skip to content

Instantly share code, notes, and snippets.

@hinrik
Forked from daurnimator/lpeg_unicode.lua
Created June 18, 2012 18:18

Revisions

  1. hinrik revised this gist Jun 18, 2012. 2 changed files with 57 additions and 69 deletions.
    69 changes: 0 additions & 69 deletions lpeg_unicode.lua
    Original file line number Diff line number Diff line change
    @@ -1,69 +0,0 @@
    local lpeg = require "lpeg"

    -- LPeg pattern that matches exactly one well-formed UTF-8 sequence and
    -- produces its Unicode code point (a number) as the capture value.
    local utf8_codepoint
    do
        -- Each decoder receives the whole matched sequence as a string. The
        -- constant subtracted at the end strips the marker bits of the lead
        -- byte and of every continuation byte in a single step.

        -- decode a two-byte UTF-8 sequence (12416 = 0xC0 * 64 + 0x80)
        local function f2 (s)
            local c1, c2 = string.byte(s, 1, 2)
            return c1 * 64 + c2 - 12416
        end

        -- decode a three-byte UTF-8 sequence
        -- (925824 = (0xE0 * 64 + 0x80) * 64 + 0x80)
        local function f3 (s)
            local c1, c2, c3 = string.byte(s, 1, 3)
            return (c1 * 64 + c2) * 64 + c3 - 925824
        end

        -- decode a four-byte UTF-8 sequence
        -- (63447168 = ((0xF0 * 64 + 0x80) * 64 + 0x80) * 64 + 0x80)
        local function f4 (s)
            local c1, c2, c3, c4 = string.byte(s, 1, 4)
            return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
        end

        local cont = lpeg.R("\128\191") -- continuation byte

        -- Lead byte plus a *restricted* second byte, following the UTF-8
        -- syntax of RFC 3629. A plain `cont` in second position would also
        -- accept overlong encodings, UTF-16 surrogates and values above
        -- U+10FFFF, all of which are ill-formed.
        local head3 = lpeg.P("\224") * lpeg.R("\160\191")      -- E0: reject overlong forms
                    + lpeg.R("\225\236", "\238\239") * cont    -- E1..EC, EE..EF
                    + lpeg.P("\237") * lpeg.R("\128\159")      -- ED: reject U+D800..U+DFFF
        local head4 = lpeg.P("\240") * lpeg.R("\144\191")      -- F0: reject overlong forms
                    + lpeg.R("\241\243") * cont                -- F1..F3
                    + lpeg.P("\244") * lpeg.R("\128\143")      -- F4: nothing above U+10FFFF

        utf8_codepoint = lpeg.R("\0\127") / string.byte
            + lpeg.R("\194\223") * cont / f2
            + head3 * cont / f3
            + head4 * cont * cont / f4
    end

    local ffi = require "ffi"

    -- Declare the ICU character-classification predicates that are called
    -- through the FFI. UBool is declared as bool here, and the lpeg.Cmt
    -- callbacks further down return each predicate's result directly.
    -- NOTE(review): the `_49` suffix looks like ICU's versioned symbol naming;
    -- confirm it matches the ICU library actually installed.
    ffi.cdef [[
    typedef bool UBool;
    typedef int32_t UChar32;

    UBool u_islower_49(UChar32 c);
    UBool u_isupper_49(UChar32 c);
    UBool u_isdigit_49(UChar32 c);
    UBool u_isalpha_49(UChar32 c);
    UBool u_isalnum_49(UChar32 c);
    UBool u_isxdigit_49(UChar32 c);
    UBool u_ispunct_49(UChar32 c);
    UBool u_isgraph_49(UChar32 c);
    UBool u_isspace_49(UChar32 c);
    UBool u_iscntrl_49(UChar32 c);
    UBool u_isprint_49(UChar32 c);
    ]]

    -- Load the shared library named "icuuc"; the functions declared above are
    -- looked up in it when accessed as fields of ICU.
    local ICU = ffi.load ( "icuuc" )

    -- Character-class matchers keyed by class name. Each entry matches one
    -- UTF-8 encoded code point (via utf8_codepoint) and then lets the ICU
    -- predicate u_is<class>_49 decide at match time whether the match stands.
    local utf8_locale = {}
    for _, class in ipairs { "alnum", "alpha", "cntrl", "digit", "graph", "lower",
                             "print", "punct", "space", "upper", "xdigit" } do
        local symbol = "u_is" .. class .. "_49"
        utf8_locale[class] = lpeg.Cmt ( utf8_codepoint , function ( subject, pos, codepoint )
            -- The symbol is looked up on every call, exactly as a direct
            -- ICU.u_is<class>_49 field access would be.
            return ICU[symbol] ( codepoint )
        end )
    end

    -- Module exports: the raw code point pattern and the class table.
    local exports = {}
    exports.utf8_codepoint = utf8_codepoint
    exports.utf8_locale = utf8_locale
    return exports
    57 changes: 57 additions & 0 deletions lpeg_utf8_locale.lua
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,57 @@
    local lpeg = require 'lpeg'
    local U = require 'icu.ustring'
    local re = require 'icu.regex'

    -- LPeg pattern that matches exactly one well-formed UTF-8 sequence and
    -- produces its Unicode code point (a number) as the capture value.
    local utf8_codepoint
    do
        -- Each decoder receives the whole matched sequence as a string. The
        -- constant subtracted at the end strips the marker bits of the lead
        -- byte and of every continuation byte in a single step.

        -- decode a two-byte UTF-8 sequence (12416 = 0xC0 * 64 + 0x80)
        local function f2 (s)
            local c1, c2 = string.byte(s, 1, 2)
            return c1 * 64 + c2 - 12416
        end

        -- decode a three-byte UTF-8 sequence
        -- (925824 = (0xE0 * 64 + 0x80) * 64 + 0x80)
        local function f3 (s)
            local c1, c2, c3 = string.byte(s, 1, 3)
            return (c1 * 64 + c2) * 64 + c3 - 925824
        end

        -- decode a four-byte UTF-8 sequence
        -- (63447168 = ((0xF0 * 64 + 0x80) * 64 + 0x80) * 64 + 0x80)
        local function f4 (s)
            local c1, c2, c3, c4 = string.byte(s, 1, 4)
            return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
        end

        local cont = lpeg.R("\128\191") -- continuation byte

        -- Lead byte plus a *restricted* second byte, following the UTF-8
        -- syntax of RFC 3629. A plain `cont` in second position would also
        -- accept overlong encodings, UTF-16 surrogates and values above
        -- U+10FFFF; those are ill-formed and should not reach U.char().
        local head3 = lpeg.P("\224") * lpeg.R("\160\191")      -- E0: reject overlong forms
                    + lpeg.R("\225\236", "\238\239") * cont    -- E1..EC, EE..EF
                    + lpeg.P("\237") * lpeg.R("\128\159")      -- ED: reject U+D800..U+DFFF
        local head4 = lpeg.P("\240") * lpeg.R("\144\191")      -- F0: reject overlong forms
                    + lpeg.R("\241\243") * cont                -- F1..F3
                    + lpeg.P("\244") * lpeg.R("\128\143")      -- F4: nothing above U+10FFFF

        utf8_codepoint = lpeg.R("\0\127") / string.byte
            + lpeg.R("\194\223") * cont / f2
            + head3 * cont / f3
            + head4 * cont * cont / f4
    end

    -- Names of the character classes exported by this module. Each name is
    -- used both as the key of the returned table and as the Unicode property
    -- name inside the regex \p{<name>}.
    local class_names = {
        'alnum', 'alpha', 'cntrl', 'digit', 'graph', 'lower',
        'print', 'punct', 'space', 'upper', 'xdigit',
    }

    -- Build one matcher per class. Deriving the regex and the table key from
    -- the same name guarantees every class is tested against its own regex
    -- (previously `xdigit` was matched against the `digit` regex), and avoids
    -- a local called `print` shadowing the global function.
    local locale = {}
    for _, name in ipairs(class_names) do
        -- Anchored at both ends: the regex must match the entire
        -- one-character string built from the code point.
        local class_re = re.compile('^\\p{' .. name .. '}$')
        locale[name] = lpeg.Cmt ( utf8_codepoint , function (s, i, c)
            -- `not not` coerces re.match's result to a plain boolean, which
            -- is what lpeg.Cmt uses to accept or reject the match.
            return not not re.match(class_re, U.char(c))
        end )
    end

    return locale
  2. @daurnimator daurnimator revised this gist Jun 17, 2012. 1 changed file with 40 additions and 32 deletions.
    72 changes: 40 additions & 32 deletions lpeg_unicode.lua
    Original file line number Diff line number Diff line change
    @@ -1,29 +1,32 @@
    local lpeg = require "lpeg"

    -- decode a two-byte UTF-8 sequence
    local function f2 (s)
    local c1, c2 = string.byte(s, 1, 2)
    return c1 * 64 + c2 - 12416
    end
    local utf8_codepoint
    do
    -- decode a two-byte UTF-8 sequence
    local function f2 (s)
    local c1, c2 = string.byte(s, 1, 2)
    return c1 * 64 + c2 - 12416
    end

    -- decode a three-byte UTF-8 sequence
    local function f3 (s)
    local c1, c2, c3 = string.byte(s, 1, 3)
    return (c1 * 64 + c2) * 64 + c3 - 925824
    end
    -- decode a three-byte UTF-8 sequence
    local function f3 (s)
    local c1, c2, c3 = string.byte(s, 1, 3)
    return (c1 * 64 + c2) * 64 + c3 - 925824
    end

    -- decode a four-byte UTF-8 sequence
    local function f4 (s)
    local c1, c2, c3, c4 = string.byte(s, 1, 4)
    return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
    end
    -- decode a four-byte UTF-8 sequence
    local function f4 (s)
    local c1, c2, c3, c4 = string.byte(s, 1, 4)
    return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
    end

    local cont = lpeg.R("\128\191") -- continuation byte
    local cont = lpeg.R("\128\191") -- continuation byte

    local utf8 = lpeg.R("\0\127") / string.byte
    + lpeg.R("\194\223") * cont / f2
    + lpeg.R("\224\239") * cont * cont / f3
    + lpeg.R("\240\244") * cont * cont * cont / f4
    utf8_codepoint = lpeg.R("\0\127") / string.byte
    + lpeg.R("\194\223") * cont / f2
    + lpeg.R("\224\239") * cont * cont / f3
    + lpeg.R("\240\244") * cont * cont * cont / f4
    end

    local ffi = require "ffi"

    @@ -46,16 +49,21 @@ UBool u_isprint_49(UChar32 c);

    local ICU = ffi.load ( "icuuc" )

    local unicode_locale = {
    alnum = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalnum_49(c) end ) ;
    alpha = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalpha_49(c) end ) ;
    cntrl = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_iscntrl_49(c) end ) ;
    digit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isdigit_49(c) end ) ;
    graph = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isgraph_49(c) end ) ;
    lower = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_islower_49(c) end ) ;
    print = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isprint_49(c) end ) ;
    punct = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_ispunct_49(c) end ) ;
    space = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isspace_49(c) end ) ;
    upper = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isupper_49(c) end ) ;
    xdigit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isxdigit_49(c) end ) ;
    local utf8_locale = {
    alnum = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isalnum_49(c) end ) ;
    alpha = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isalpha_49(c) end ) ;
    cntrl = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_iscntrl_49(c) end ) ;
    digit = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isdigit_49(c) end ) ;
    graph = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isgraph_49(c) end ) ;
    lower = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_islower_49(c) end ) ;
    print = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isprint_49(c) end ) ;
    punct = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_ispunct_49(c) end ) ;
    space = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isspace_49(c) end ) ;
    upper = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isupper_49(c) end ) ;
    xdigit = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isxdigit_49(c) end ) ;
    }

    return {
    utf8_codepoint = utf8_codepoint ;
    utf8_locale = utf8_locale ;
    }
  3. @daurnimator daurnimator revised this gist Jun 17, 2012. 1 changed file with 12 additions and 19 deletions.
    31 changes: 12 additions & 19 deletions lpeg_unicode.lua
    Original file line number Diff line number Diff line change
    @@ -28,41 +28,34 @@ local utf8 = lpeg.R("\0\127") / string.byte
    local ffi = require "ffi"

    ffi.cdef [[
    typedef int8_t UBool;
    typedef bool UBool;
    typedef int32_t UChar32;

    UBool u_islower_49(UChar32 c);
    UBool u_isupper_49(UChar32 c);
    UBool u_istitle_49(UChar32 c);
    UBool u_isdigit_49(UChar32 c);
    UBool u_isalpha_49(UChar32 c);
    UBool u_isalnum_49(UChar32 c);
    UBool u_isxdigit_49(UChar32 c);
    UBool u_ispunct_49(UChar32 c);
    UBool u_isgraph_49(UChar32 c);
    UBool u_isblank_49(UChar32 c);
    UBool u_isdefined_49(UChar32 c);
    UBool u_isspace_49(UChar32 c);
    UBool u_isJavaSpaceChar_49(UChar32 c);
    UBool u_isWhitespace_49(UChar32 c);
    UBool u_iscntrl_49(UChar32 c);
    UBool u_isISOControl_49(UChar32 c);
    UBool u_isprint_49(UChar32 c);
    UBool u_isbase_49(UChar32 c);
    ]]

    local ICU = ffi.load ( "icuuc" )

    local unicode_locale = {
    alnum = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalnum_49(c)~=0 end ) ;
    alpha = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalpha_49(c)~=0 end ) ;
    cntrl = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_iscntrl_49(c)~=0 end ) ;
    digit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isdigit_49(c)~=0 end ) ;
    graph = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isgraph_49(c)~=0 end ) ;
    lower = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_islower_49(c)~=0 end ) ;
    print = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isprint_49(c)~=0 end ) ;
    punct = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_ispunct_49(c)~=0 end ) ;
    space = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isspace_49(c)~=0 end ) ;
    upper = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isupper_49(c)~=0 end ) ;
    xdigit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isxdigi_49t(c)~=0 end ) ;
    alnum = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalnum_49(c) end ) ;
    alpha = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalpha_49(c) end ) ;
    cntrl = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_iscntrl_49(c) end ) ;
    digit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isdigit_49(c) end ) ;
    graph = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isgraph_49(c) end ) ;
    lower = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_islower_49(c) end ) ;
    print = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isprint_49(c) end ) ;
    punct = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_ispunct_49(c) end ) ;
    space = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isspace_49(c) end ) ;
    upper = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isupper_49(c) end ) ;
    xdigit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isxdigit_49(c) end ) ;
    }
  4. @daurnimator daurnimator created this gist Jun 17, 2012.
    68 changes: 68 additions & 0 deletions lpeg_unicode.lua
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,68 @@
    local lpeg = require "lpeg"

    -- Decoders for multi-byte UTF-8 sequences. Each receives the whole matched
    -- sequence as a string; the constant subtracted at the end strips the
    -- marker bits of the lead byte and of every continuation byte at once.

    -- decode a two-byte UTF-8 sequence (12416 = 0xC0 * 64 + 0x80)
    local function f2 (s)
        local c1, c2 = string.byte(s, 1, 2)
        return c1 * 64 + c2 - 12416
    end

    -- decode a three-byte UTF-8 sequence
    -- (925824 = (0xE0 * 64 + 0x80) * 64 + 0x80)
    local function f3 (s)
        local c1, c2, c3 = string.byte(s, 1, 3)
        return (c1 * 64 + c2) * 64 + c3 - 925824
    end

    -- decode a four-byte UTF-8 sequence
    -- (63447168 = ((0xF0 * 64 + 0x80) * 64 + 0x80) * 64 + 0x80)
    local function f4 (s)
        local c1, c2, c3, c4 = string.byte(s, 1, 4)
        return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
    end

    local cont = lpeg.R("\128\191") -- continuation byte

    -- Lead byte plus a *restricted* second byte, following the UTF-8 syntax of
    -- RFC 3629. A plain `cont` in second position would also accept overlong
    -- encodings, UTF-16 surrogates and values above U+10FFFF, all ill-formed.
    local head3 = lpeg.P("\224") * lpeg.R("\160\191")      -- E0: reject overlong forms
                + lpeg.R("\225\236", "\238\239") * cont    -- E1..EC, EE..EF
                + lpeg.P("\237") * lpeg.R("\128\159")      -- ED: reject U+D800..U+DFFF
    local head4 = lpeg.P("\240") * lpeg.R("\144\191")      -- F0: reject overlong forms
                + lpeg.R("\241\243") * cont                -- F1..F3
                + lpeg.P("\244") * lpeg.R("\128\143")      -- F4: nothing above U+10FFFF

    -- Matches one well-formed UTF-8 sequence; the capture is its code point.
    local utf8 = lpeg.R("\0\127") / string.byte
        + lpeg.R("\194\223") * cont / f2
        + head3 * cont / f3
        + head4 * cont * cont / f4

    local ffi = require "ffi"

    -- Declare ICU character-classification predicates for use through the FFI.
    -- UBool is declared as int8_t, so the callers further down compare each
    -- result against 0 explicitly (in Lua the number 0 is truthy).
    -- Only some of these declarations are used by the class table that follows.
    -- NOTE(review): the `_49` suffix looks like ICU's versioned symbol naming;
    -- confirm it matches the ICU library actually installed.
    ffi.cdef [[
    typedef int8_t UBool;
    typedef int32_t UChar32;

    UBool u_islower_49(UChar32 c);
    UBool u_isupper_49(UChar32 c);
    UBool u_istitle_49(UChar32 c);
    UBool u_isdigit_49(UChar32 c);
    UBool u_isalpha_49(UChar32 c);
    UBool u_isalnum_49(UChar32 c);
    UBool u_isxdigit_49(UChar32 c);
    UBool u_ispunct_49(UChar32 c);
    UBool u_isgraph_49(UChar32 c);
    UBool u_isblank_49(UChar32 c);
    UBool u_isdefined_49(UChar32 c);
    UBool u_isspace_49(UChar32 c);
    UBool u_isJavaSpaceChar_49(UChar32 c);
    UBool u_isWhitespace_49(UChar32 c);
    UBool u_iscntrl_49(UChar32 c);
    UBool u_isISOControl_49(UChar32 c);
    UBool u_isprint_49(UChar32 c);
    UBool u_isbase_49(UChar32 c);
    ]]

    -- Load the shared library named "icuuc"; the functions declared above are
    -- looked up in it when accessed as fields of ICU.
    local ICU = ffi.load ( "icuuc" )

    -- Character-class matchers keyed by class name. Each entry matches one
    -- UTF-8 encoded code point (via `utf8`) and accepts it only when the
    -- corresponding ICU predicate returns a non-zero UBool. The explicit
    -- `~= 0` is required because the number 0 is truthy in Lua.
    local unicode_locale = {
        alnum  = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalnum_49(c)  ~= 0 end ) ;
        alpha  = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalpha_49(c)  ~= 0 end ) ;
        cntrl  = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_iscntrl_49(c)  ~= 0 end ) ;
        digit  = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isdigit_49(c)  ~= 0 end ) ;
        graph  = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isgraph_49(c)  ~= 0 end ) ;
        lower  = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_islower_49(c)  ~= 0 end ) ;
        print  = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isprint_49(c)  ~= 0 end ) ;
        punct  = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_ispunct_49(c)  ~= 0 end ) ;
        space  = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isspace_49(c)  ~= 0 end ) ;
        upper  = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isupper_49(c)  ~= 0 end ) ;
        -- Fixed symbol name: was misspelled `u_isxdigi_49t`, which is not
        -- declared in the cdef block and fails when the matcher runs.
        xdigit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isxdigit_49(c) ~= 0 end ) ;
    }