Created
April 27, 2011 04:01
-
-
Save lrz/943692 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/re.c b/re.c | |
index 523ba17..0df6349 100644 | |
--- a/re.c | |
+++ b/re.c | |
@@ -696,34 +696,56 @@ rb_reg_matcher_new(VALUE re, VALUE str) | |
u_errorName(status)); | |
} | |
- long chars_len = 0; | |
- UChar *chars = rb_str_xcopy_uchars(str, &chars_len); | |
- | |
- if (chars_len == 0) { | |
- // uregex_setText() will complain if we pass a NULL pattern or a | |
- // pattern length of 0, so we do pass an empty pattern with a length | |
- // of -1 which indicates it's terminated by \0. | |
- chars = (UChar *)xmalloc(sizeof(UChar)); | |
- *chars = '\0'; | |
- chars_len = -1; | |
+ // Fast path when applying a regexp on a UTF-8 (or ASCII) encoded string. | |
+ // (ICU 4.6 or higher only. FIXME: check below wrongly excludes major > 4 with minor < 6.) | |
+ bool need_uchars = true; | |
+#if U_ICU_VERSION_MAJOR_NUM >= 4 && U_ICU_VERSION_MINOR_NUM >= 6 | |
+ if (IS_RSTR(str) | |
+ && (IS_UTF8_ENC(RSTR(str)->encoding) | |
+ || IS_ASCII_ENC(RSTR(str)->encoding))) { | |
+ UText *text = utext_openUTF8(NULL, RSTR(str)->bytes, | |
+ RSTR(str)->length_in_bytes, &status); | |
+ if (status == U_ZERO_ERROR) { | |
+ uregex_setUText(match_pattern, text, &status); | |
+ utext_close(text); | |
+ if (status == U_ZERO_ERROR) { | |
+ need_uchars = false; | |
+ } | |
+ } | |
+ status = U_ZERO_ERROR; | |
} | |
+#endif | |
+ | |
+ // Slow path, converting the text string into a buffer of uchars. | |
+ if (need_uchars) { | |
+ long chars_len = 0; | |
+ UChar *chars = rb_str_xcopy_uchars(str, &chars_len); | |
+ | |
+ if (chars_len == 0) { | |
+ // uregex_setText() will complain if we pass a NULL text or a | |
+ // text length of 0, so we pass an empty text buffer with a | |
+ // length of -1, which indicates that it is terminated by \0. | |
+ chars = (UChar *)xmalloc(sizeof(UChar)); | |
+ *chars = '\0'; | |
+ chars_len = -1; | |
+ } | |
- uregex_setText(match_pattern, chars, chars_len, &status); | |
+ uregex_setText(match_pattern, chars, chars_len, &status); | |
- if (status != U_ZERO_ERROR) { | |
- uregex_close(match_pattern); | |
- rb_raise(rb_eRegexpError, "can't set pattern text: %s", | |
- u_errorName(status)); | |
+ if (status != U_ZERO_ERROR) { | |
+ uregex_close(match_pattern); | |
+ rb_raise(rb_eRegexpError, "can't set pattern text: %s", | |
+ u_errorName(status)); | |
+ } | |
+ // Apparently uregex_setText doesn't copy the given string, so we need | |
+ // to keep it around until we finally destroy the matcher object. | |
+ GC_WB(&matcher->text_chars, chars); | |
} | |
matcher->pattern = match_pattern; | |
matcher->frozen_str = 0; // set lazily | |
GC_WB(&matcher->orig_str, str); | |
- // Apparently uregex_setText doesn't copy the given string, so we need | |
- // to keep it around until we finally destroy the matcher object. | |
- GC_WB(&matcher->text_chars, chars); | |
- | |
return (VALUE)matcher; | |
} | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ time ./miniruby -e "s=File.read('parse.c'); 1000.times { s.match(/./) }" | |
real 0m3.868s | |
user 0m3.863s | |
sys 0m0.197s | |
$ time macruby -e "s=File.read('parse.c'); 1000.times { s.match(/./) }" | |
real 0m3.269s | |
user 0m2.907s | |
sys 0m0.550s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment