hinrik · June 18, 2012 18:18 · Jun 18, 2012 · Jun 17, 2012 · Jun 17, 2012 · Jun 17, 2012
diff --git a/lpeg_unicode.lua b/lpeg_unicode.lua
@@ -1,69 +0,0 @@
-local lpeg = require "lpeg"
-
-local utf8_codepoint
-do
-	-- decode a two-byte UTF-8 sequence
-	local function f2 (s)
-	  local c1, c2 = string.byte(s, 1, 2)
-	  return c1 * 64 + c2 - 12416
-	end
-
-	-- decode a three-byte UTF-8 sequence
-	local function f3 (s)
-	  local c1, c2, c3 = string.byte(s, 1, 3)
-	  return (c1 * 64 + c2) * 64 + c3 - 925824
-	end
-
-	-- decode a four-byte UTF-8 sequence
-	local function f4 (s)
-	  local c1, c2, c3, c4 = string.byte(s, 1, 4)
-	  return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
-	end
-
-	local cont = lpeg.R("\128\191")   -- continuation byte
-
-	utf8_codepoint = lpeg.R("\0\127") / string.byte
- 		+ lpeg.R("\194\223") * cont / f2
-		+ lpeg.R("\224\239") * cont * cont / f3
-		+ lpeg.R("\240\244") * cont * cont * cont / f4
-end
-
-local ffi = require "ffi"
-
-ffi.cdef [[
-typedef bool UBool;
-typedef int32_t UChar32;
-
-UBool u_islower_49(UChar32 c);
-UBool u_isupper_49(UChar32 c);
-UBool u_isdigit_49(UChar32 c);
-UBool u_isalpha_49(UChar32 c);
-UBool u_isalnum_49(UChar32 c);
-UBool u_isxdigit_49(UChar32 c);
-UBool u_ispunct_49(UChar32 c);
-UBool u_isgraph_49(UChar32 c);
-UBool u_isspace_49(UChar32 c);
-UBool u_iscntrl_49(UChar32 c);
-UBool u_isprint_49(UChar32 c);
-]]
-
-local ICU = ffi.load ( "icuuc" )
-
-local utf8_locale = {
-	alnum = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isalnum_49(c) end ) ;
-	alpha = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isalpha_49(c) end ) ;
-	cntrl = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_iscntrl_49(c) end ) ;
-	digit = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isdigit_49(c) end ) ;
-	graph = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isgraph_49(c) end ) ;
-	lower = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_islower_49(c) end ) ;
-	print = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isprint_49(c) end ) ;
-	punct = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_ispunct_49(c) end ) ;
-	space = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isspace_49(c) end ) ;
-	upper = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isupper_49(c) end ) ;
-	xdigit = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isxdigit_49(c) end ) ;
-}
-
-return {
-	utf8_codepoint = utf8_codepoint ;
-	utf8_locale    = utf8_locale ;
-}

diff --git a/lpeg_utf8_locale.lua b/lpeg_utf8_locale.lua
@@ -0,0 +1,67 @@
+--- UTF-8 aware versions of the POSIX character classes for LPeg.
+-- The module returns a table with one pattern per class name (alnum, alpha,
+-- cntrl, digit, graph, lower, print, punct, space, upper, xdigit). Each
+-- pattern matches one UTF-8 encoded code point that ICU's regex engine
+-- accepts for \p{<name>}.
+
+local lpeg = require 'lpeg'
+local U    = require 'icu.ustring'
+local re   = require 'icu.regex'
+
+-- Matches one well-formed UTF-8 sequence and captures its code point.
+local utf8_codepoint
+do
+  -- decode a two-byte UTF-8 sequence
+  local function f2 (s)
+    local c1, c2 = string.byte(s, 1, 2)
+    return c1 * 64 + c2 - 12416
+  end
+
+  -- decode a three-byte UTF-8 sequence
+  local function f3 (s)
+    local c1, c2, c3 = string.byte(s, 1, 3)
+    return (c1 * 64 + c2) * 64 + c3 - 925824
+  end
+
+  -- decode a four-byte UTF-8 sequence
+  local function f4 (s)
+    local c1, c2, c3, c4 = string.byte(s, 1, 4)
+    return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
+  end
+
+  local cont = lpeg.R("\128\191")   -- continuation byte
+
+  -- The second byte is narrowed after lead bytes 224, 237, 240 and 244 so
+  -- that overlong forms, surrogates (U+D800..U+DFFF) and values above
+  -- U+10FFFF fail to match instead of being decoded.
+  local three = lpeg.P("\224") * lpeg.R("\160\191") * cont
+    + lpeg.R("\225\236", "\238\239") * cont * cont
+    + lpeg.P("\237") * lpeg.R("\128\159") * cont
+  local four = lpeg.P("\240") * lpeg.R("\144\191") * cont * cont
+    + lpeg.R("\241\243") * cont * cont * cont
+    + lpeg.P("\244") * lpeg.R("\128\143") * cont * cont
+
+  utf8_codepoint = lpeg.R("\0\127") / string.byte
+    + lpeg.R("\194\223") * cont / f2
+    + three / f3
+    + four / f4
+end
+
+local class_names = {
+  'alnum', 'alpha', 'cntrl', 'digit', 'graph', 'lower',
+  'print', 'punct', 'space', 'upper', 'xdigit',
+}
+
+-- Build every pattern from its class name so that a class can never be
+-- paired with another class's regex.
+local utf8_locale = {}
+for _, name in ipairs(class_names) do
+  -- one \p{name} property test, anchored at both ends
+  local class_re = re.compile('^\\p{' .. name .. '}$')
+  utf8_locale[name] = lpeg.Cmt ( utf8_codepoint , function (s,i,c)
+    -- lpeg.Cmt wants a boolean (or a position), so coerce re.match's result
+    return not not re.match(class_re, U.char(c))
+  end )
+end
+
+return utf8_locale
diff --git a/lpeg_unicode.lua b/lpeg_unicode.lua
@@ -1,29 +1,32 @@
 local lpeg = require "lpeg"
 
--- decode a two-byte UTF-8 sequence
-local function f2 (s)
-  local c1, c2 = string.byte(s, 1, 2)
-  return c1 * 64 + c2 - 12416
-end
+local utf8_codepoint
+do
+	-- decode a two-byte UTF-8 sequence
+	local function f2 (s)
+	  local c1, c2 = string.byte(s, 1, 2)
+	  return c1 * 64 + c2 - 12416
+	end
 
--- decode a three-byte UTF-8 sequence
-local function f3 (s)
-  local c1, c2, c3 = string.byte(s, 1, 3)
-  return (c1 * 64 + c2) * 64 + c3 - 925824
-end
+	-- decode a three-byte UTF-8 sequence
+	local function f3 (s)
+	  local c1, c2, c3 = string.byte(s, 1, 3)
+	  return (c1 * 64 + c2) * 64 + c3 - 925824
+	end
 
--- decode a four-byte UTF-8 sequence
-local function f4 (s)
-  local c1, c2, c3, c4 = string.byte(s, 1, 4)
-  return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
-end
+	-- decode a four-byte UTF-8 sequence
+	local function f4 (s)
+	  local c1, c2, c3, c4 = string.byte(s, 1, 4)
+	  return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
+	end
 
-local cont = lpeg.R("\128\191")   -- continuation byte
+	local cont = lpeg.R("\128\191")   -- continuation byte
 
-local utf8 = lpeg.R("\0\127") / string.byte
-		   + lpeg.R("\194\223") * cont / f2
-		   + lpeg.R("\224\239") * cont * cont / f3
-		   + lpeg.R("\240\244") * cont * cont * cont / f4
+	utf8_codepoint = lpeg.R("\0\127") / string.byte
+ 		+ lpeg.R("\194\223") * cont / f2
+		+ lpeg.R("\224\239") * cont * cont / f3
+		+ lpeg.R("\240\244") * cont * cont * cont / f4
+end
 
 local ffi = require "ffi"
 
@@ -46,16 +49,21 @@ UBool u_isprint_49(UChar32 c);
 
 local ICU = ffi.load ( "icuuc" )
 
-local unicode_locale = {
-	alnum = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalnum_49(c) end ) ;
-	alpha = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalpha_49(c) end ) ;
-	cntrl = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_iscntrl_49(c) end ) ;
-	digit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isdigit_49(c) end ) ;
-	graph = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isgraph_49(c) end ) ;
-	lower = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_islower_49(c) end ) ;
-	print = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isprint_49(c) end ) ;
-	punct = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_ispunct_49(c) end ) ;
-	space = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isspace_49(c) end ) ;
-	upper = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isupper_49(c) end ) ;
-	xdigit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isxdigit_49(c) end ) ;
+local utf8_locale = {
+	alnum = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isalnum_49(c) end ) ;
+	alpha = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isalpha_49(c) end ) ;
+	cntrl = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_iscntrl_49(c) end ) ;
+	digit = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isdigit_49(c) end ) ;
+	graph = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isgraph_49(c) end ) ;
+	lower = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_islower_49(c) end ) ;
+	print = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isprint_49(c) end ) ;
+	punct = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_ispunct_49(c) end ) ;
+	space = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isspace_49(c) end ) ;
+	upper = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isupper_49(c) end ) ;
+	xdigit = lpeg.Cmt ( utf8_codepoint , function (s,i,c) return ICU.u_isxdigit_49(c) end ) ;
+}
+
+return {
+	utf8_codepoint = utf8_codepoint ;
+	utf8_locale    = utf8_locale ;
 }
diff --git a/lpeg_unicode.lua b/lpeg_unicode.lua
@@ -28,41 +28,34 @@ local utf8 = lpeg.R("\0\127") / string.byte
 local ffi = require "ffi"
 
 ffi.cdef [[
-typedef int8_t UBool;
+typedef bool UBool;
 typedef int32_t UChar32;
 
 UBool u_islower_49(UChar32 c);
 UBool u_isupper_49(UChar32 c);
-UBool u_istitle_49(UChar32 c);
 UBool u_isdigit_49(UChar32 c);
 UBool u_isalpha_49(UChar32 c);
 UBool u_isalnum_49(UChar32 c);
 UBool u_isxdigit_49(UChar32 c);
 UBool u_ispunct_49(UChar32 c);
 UBool u_isgraph_49(UChar32 c);
-UBool u_isblank_49(UChar32 c);
-UBool u_isdefined_49(UChar32 c);
 UBool u_isspace_49(UChar32 c);
-UBool u_isJavaSpaceChar_49(UChar32 c);
-UBool u_isWhitespace_49(UChar32 c);
 UBool u_iscntrl_49(UChar32 c);
-UBool u_isISOControl_49(UChar32 c);
 UBool u_isprint_49(UChar32 c);
-UBool u_isbase_49(UChar32 c);
 ]]
 
 local ICU = ffi.load ( "icuuc" )
 
 local unicode_locale = {
-	alnum = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalnum_49(c)~=0 end ) ;
-	alpha = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalpha_49(c)~=0 end ) ;
-	cntrl = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_iscntrl_49(c)~=0 end ) ;
-	digit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isdigit_49(c)~=0 end ) ;
-	graph = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isgraph_49(c)~=0 end ) ;
-	lower = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_islower_49(c)~=0 end ) ;
-	print = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isprint_49(c)~=0 end ) ;
-	punct = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_ispunct_49(c)~=0 end ) ;
-	space = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isspace_49(c)~=0 end ) ;
-	upper = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isupper_49(c)~=0 end ) ;
-	xdigit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isxdigi_49t(c)~=0 end ) ;
+	alnum = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalnum_49(c) end ) ;
+	alpha = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalpha_49(c) end ) ;
+	cntrl = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_iscntrl_49(c) end ) ;
+	digit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isdigit_49(c) end ) ;
+	graph = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isgraph_49(c) end ) ;
+	lower = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_islower_49(c) end ) ;
+	print = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isprint_49(c) end ) ;
+	punct = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_ispunct_49(c) end ) ;
+	space = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isspace_49(c) end ) ;
+	upper = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isupper_49(c) end ) ;
+	xdigit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isxdigit_49(c) end ) ;
 }
diff --git a/lpeg_unicode.lua b/lpeg_unicode.lua
@@ -0,0 +1,68 @@
+local lpeg = require "lpeg"
+
+-- decode a two-byte UTF-8 sequence
+local function f2 (s)
+  local c1, c2 = string.byte(s, 1, 2)
+  return c1 * 64 + c2 - 12416
+end
+
+-- decode a three-byte UTF-8 sequence
+local function f3 (s)
+  local c1, c2, c3 = string.byte(s, 1, 3)
+  return (c1 * 64 + c2) * 64 + c3 - 925824
+end
+
+-- decode a four-byte UTF-8 sequence
+local function f4 (s)
+  local c1, c2, c3, c4 = string.byte(s, 1, 4)
+  return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
+end
+
+local cont = lpeg.R("\128\191")   -- continuation byte
+
+local utf8 = lpeg.R("\0\127") / string.byte
+		   + lpeg.R("\194\223") * cont / f2
+		   + lpeg.R("\224\239") * cont * cont / f3
+		   + lpeg.R("\240\244") * cont * cont * cont / f4
+
+local ffi = require "ffi"
+
+ffi.cdef [[
+typedef int8_t UBool;
+typedef int32_t UChar32;
+
+UBool u_islower_49(UChar32 c);
+UBool u_isupper_49(UChar32 c);
+UBool u_istitle_49(UChar32 c);
+UBool u_isdigit_49(UChar32 c);
+UBool u_isalpha_49(UChar32 c);
+UBool u_isalnum_49(UChar32 c);
+UBool u_isxdigit_49(UChar32 c);
+UBool u_ispunct_49(UChar32 c);
+UBool u_isgraph_49(UChar32 c);
+UBool u_isblank_49(UChar32 c);
+UBool u_isdefined_49(UChar32 c);
+UBool u_isspace_49(UChar32 c);
+UBool u_isJavaSpaceChar_49(UChar32 c);
+UBool u_isWhitespace_49(UChar32 c);
+UBool u_iscntrl_49(UChar32 c);
+UBool u_isISOControl_49(UChar32 c);
+UBool u_isprint_49(UChar32 c);
+UBool u_isbase_49(UChar32 c);
+]]
+
+local ICU = ffi.load ( "icuuc" )
+
+local unicode_locale = {
+	alnum = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalnum_49(c)~=0 end ) ;
+	alpha = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isalpha_49(c)~=0 end ) ;
+	cntrl = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_iscntrl_49(c)~=0 end ) ;
+	digit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isdigit_49(c)~=0 end ) ;
+	graph = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isgraph_49(c)~=0 end ) ;
+	lower = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_islower_49(c)~=0 end ) ;
+	print = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isprint_49(c)~=0 end ) ;
+	punct = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_ispunct_49(c)~=0 end ) ;
+	space = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isspace_49(c)~=0 end ) ;
+	upper = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isupper_49(c)~=0 end ) ;
+	xdigit = lpeg.Cmt ( utf8 , function (s,i,c) return ICU.u_isxdigi_49t(c)~=0 end ) ;
+}