Last active
December 2, 2024 19:27
-
-
Save snippins/2d9a66e6708bc258f662babe0a1c1009 to your computer and use it in GitHub Desktop.
Emacs 30 patch for regex lookaround - rev a8169bee2064282a40214ef65ef0493233ed4669 - 2024-12-01 master
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/src/regex-emacs.c b/src/regex-emacs.c | |
index 3bf3ad9..8bbfb78 100644 | |
--- a/src/regex-emacs.c | |
+++ b/src/regex-emacs.c | |
@@ -44,6 +44,9 @@ | |
#endif | |
#define RE_DUP_MAX (0xffff) | |
+#define PREV_CHAR_BOUNDARY(p, limit) ((p)--) | |
+#define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p)) | |
+ | |
/* Make syntax table lookup grant data in gl_state. */ | |
#define SYNTAX(c) syntax_property (c, 1) | |
@@ -323,7 +326,16 @@ #define BYTEWIDTH 8 /* In bits. */ | |
/* Matches any character whose syntax is not that specified. */ | |
notsyntaxspec, | |
+ lookahead, | |
+ lookahead_not, | |
+ lookbehind, | |
+ lookbehind_not, | |
+ lookaround_succeed, | |
+ lookaround_fail, | |
+ | |
+ before_dot, /* Succeeds if before point. */ | |
at_dot, /* Succeeds if at point. */ | |
+ after_dot, /* Succeeds if after point. */ | |
/* Matches any character whose category-set contains the specified | |
category. The operator is followed by a byte which contains a | |
@@ -540,6 +552,31 @@ print_partial_compiled_pattern (FILE *dest, re_char *start, re_char *end) | |
fprintf (dest, "/stop_memory/%d", *p++); | |
break; | |
+ case lookahead: | |
+ extract_number_and_incr (&mcnt, &p); | |
+ fprintf (dest, "/lookahead/%d", mcnt); | |
+ break; | |
+ case lookahead_not: | |
+ extract_number_and_incr (&mcnt, &p); | |
+ fprintf (dest, "/lookahead_not/%d", mcnt); | |
+ break; | |
+ case lookbehind: | |
+ extract_number_and_incr (&mcnt, &p); | |
+ extract_number_and_incr (&mcnt2, &p); | |
+ fprintf (dest, "/lookbehind/%d/%d", mcnt, mcnt2); | |
+ break; | |
+ case lookbehind_not: | |
+ extract_number_and_incr (&mcnt, &p); | |
+ extract_number_and_incr (&mcnt2, &p); | |
+ fprintf (dest, "/lookbehind_not/%d/%d", mcnt, mcnt2); | |
+ break; | |
+ case lookaround_succeed: | |
+ fprintf (dest, "/lookaround_succeed"); | |
+ break; | |
+ case lookaround_fail: | |
+ fprintf (dest, "/lookaround_fail"); | |
+ break; | |
+ | |
case duplicate: | |
fprintf (dest, "/duplicate/%d", *p++); | |
break; | |
@@ -704,10 +741,18 @@ print_partial_compiled_pattern (FILE *dest, re_char *start, re_char *end) | |
fprintf (dest, "/%d", mcnt); | |
break; | |
+ case before_dot: | |
+ fprintf (dest, "/before_dot"); | |
+ break; | |
+ | |
case at_dot: | |
fputs ("/at_dot", dest); | |
break; | |
+ case after_dot: | |
+ fprintf (dest, "/after_dot"); | |
+ break; | |
+ | |
case categoryspec: | |
fputs ("/categoryspec", dest); | |
mcnt = *p++; | |
@@ -1159,6 +1204,28 @@ #define POP_FAILURE_POINT(str, pat) \ | |
DEBUG_STATEMENT (nfailure_points_popped++); \ | |
} while (false) /* POP_FAILURE_POINT */ | |
+ #define FINISH_LOOKAROUND() \ | |
+ do { \ | |
+ re_char *str, *pat; \ | |
+ re_opcode_t op; \ | |
+ discard_saved_regs = 1; \ | |
+ while (!FAIL_STACK_EMPTY ()) \ | |
+ { \ | |
+ POP_FAILURE_POINT (str, pat); \ | |
+ op = (re_opcode_t) *pat; \ | |
+ if (op == lookahead \ | |
+ || op == lookahead_not \ | |
+ || op == lookbehind \ | |
+ || op == lookbehind_not) \ | |
+ { \ | |
+ d = str; \ | |
+ dend = ((d >= string1 && d <= end1) \ | |
+ ? end_match_1 : end_match_2); \ | |
+ break; \ | |
+ } \ | |
+ } \ | |
+ discard_saved_regs = 0; \ | |
+ } while (0); | |
/* Registers are set to a sentinel when they haven't yet matched. */ | |
@@ -1298,6 +1365,7 @@ static_assert (LONG_MIN <= -(MAX_BUF_SIZE - 1) && MAX_BUF_SIZE - 1 <= LONG_MAX); | |
pattern_offset_t fixup_alt_jump; | |
pattern_offset_t laststart_offset; | |
regnum_t regnum; | |
+ int lookaround; | |
} compile_stack_elt_t; | |
@@ -1669,6 +1737,7 @@ extend_range_table_work_area (struct range_table_work_area *work_area) | |
/* regex_compile and helpers. */ | |
static bool group_in_compile_stack (compile_stack_type, regnum_t); | |
+static int exact_chars_in_pattern_buffer (struct re_pattern_buffer *bufp, re_char *p, re_char *pend); | |
/* Insert the 'jump' from the end of last alternative to "here". | |
The space for the jump has already been allocated. */ | |
@@ -2209,6 +2278,7 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |
case '(': | |
{ | |
bool shy = false; | |
+ int lookaround = 0; | |
regnum_t regnum = 0; | |
if (p+1 < pend) | |
{ | |
@@ -2234,6 +2304,27 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |
|| ckd_add (®num, regnum, c - '0')) | |
FREE_STACK_RETURN (REG_ESIZE); | |
break; | |
+ case '=': | |
+ /* Positive lookahead assertion. */ | |
+ shy = lookaround = 1; | |
+ break; | |
+ case '!': | |
+ /* Negative lookahead assertion. */ | |
+ shy = lookaround = 2; | |
+ break; | |
+ case '<': | |
+ { | |
+ PATFETCH (c); | |
+ if (c == '=') | |
+ /* Positive lookbehind assertion. */ | |
+ shy = lookaround = -1; | |
+ else if (c == '!') | |
+ /* Negative lookbehind assertion. */ | |
+ shy = lookaround = -2; | |
+ else | |
+ FREE_STACK_RETURN (REG_BADPAT); | |
+ } | |
+ break; | |
default: | |
/* Only (?:...) is supported right now. */ | |
FREE_STACK_RETURN (REG_BADPAT); | |
@@ -2276,6 +2367,7 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |
= fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; | |
COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; | |
COMPILE_STACK_TOP.regnum = regnum; | |
+ COMPILE_STACK_TOP.lookaround = lookaround; | |
/* Do not push a start_memory for groups beyond the last one | |
we can represent in the compiled pattern. */ | |
@@ -2312,6 +2404,7 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |
later groups should continue to be numbered higher, | |
as in '(ab)c(de)' -- the second group is #2. */ | |
regnum_t regnum; | |
+ int lookaround; | |
compile_stack.avail--; | |
begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; | |
@@ -2324,13 +2417,38 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |
/* If we've reached MAX_REGNUM groups, then this open | |
won't actually generate any code, so we'll have to | |
clear pending_exact explicitly. */ | |
+ lookaround = COMPILE_STACK_TOP.lookaround; | |
pending_exact = 0; | |
/* We're at the end of the group, so now we know how many | |
groups were inside this one. */ | |
if (regnum <= MAX_REGNUM && regnum > 0) | |
BUF_PUSH_2 (stop_memory, regnum); | |
- } | |
+ else if (lookaround) | |
+ { | |
+ if (lookaround > 0) | |
+ { | |
+ /* Positive/negative lookahead assertion. */ | |
+ GET_BUFFER_SPACE (3); | |
+ INSERT_JUMP (lookaround == 1 ? lookahead : lookahead_not, laststart, b + 4); | |
+ b += 3; | |
+ } | |
+ else | |
+ { | |
+ /* Positive/negative lookbehind assertion. */ | |
+ int count = exact_chars_in_pattern_buffer (bufp, laststart, b); | |
+ if (count == -1) /* variable length */ | |
+ FREE_STACK_RETURN (REG_BADPAT); | |
+ GET_BUFFER_SPACE (5); | |
+ INSERT_JUMP2 (lookaround == -1 ? lookbehind : lookbehind_not, laststart, b + 6, count); | |
+ b += 5; | |
+ } | |
+ /* Negative form. */ | |
+ if (lookaround > 1 || lookaround < -1) | |
+ BUF_PUSH (lookaround_fail); | |
+ BUF_PUSH (lookaround_succeed); | |
+ } | |
+ } | |
break; | |
@@ -3208,7 +3326,15 @@ analyze_first_fastmap (const re_char *p, void *arg) | |
} | |
return true; | |
+ case lookahead: | |
+ case lookahead_not: | |
+ case lookbehind: | |
+ case lookbehind_not: | |
+ return -1; | |
+ | |
+ case before_dot: | |
case at_dot: | |
+ case after_dot: | |
case begbuf: | |
case wordbound: | |
case notwordbound: | |
@@ -3256,7 +3382,15 @@ analyze_first_null (const re_char *p, void *arg) | |
case notcategoryspec: | |
return true; | |
+ case lookahead: | |
+ case lookahead_not: | |
+ case lookbehind: | |
+ case lookbehind_not: | |
+ return -1; | |
+ | |
+ case before_dot: | |
case at_dot: | |
+ case after_dot: | |
case begbuf: | |
case wordbound: | |
case notwordbound: | |
@@ -3302,6 +3436,93 @@ analyze_first (struct re_pattern_buffer *bufp, | |
return !safe; | |
} | |
+static int | |
+exact_chars_in_pattern_buffer (bufp, p, pend) | |
+ struct re_pattern_buffer *bufp; | |
+ re_char *p, *pend; | |
+{ | |
+ int count = 0; | |
+ while (p < pend) | |
+ { | |
+ switch (*p++) | |
+ { | |
+ case exactn: | |
+ { | |
+ int mcnt = *p++; | |
+ int buf_charlen; | |
+ while (mcnt > 0) { | |
+ STRING_CHAR_AND_LENGTH (p, buf_charlen); | |
+ p += buf_charlen; | |
+ mcnt -= buf_charlen; | |
+ count++; | |
+ } | |
+ } | |
+ break; | |
+ case start_memory: | |
+ case stop_memory: | |
+ p++; | |
+ break; | |
+#ifdef emacs | |
+ case categoryspec: | |
+ case notcategoryspec: | |
+#endif /* emacs */ | |
+ case syntaxspec: | |
+ case notsyntaxspec: | |
+ p++; | |
+ case anychar: | |
+ count++; | |
+ break; | |
+ | |
+ case charset: | |
+ case charset_not: | |
+ if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1)) | |
+ { | |
+ int mcnt; | |
+ p = CHARSET_RANGE_TABLE (p - 1); | |
+ EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
+ p = CHARSET_RANGE_TABLE_END (p, mcnt); | |
+ } | |
+ else | |
+ p += 1 + CHARSET_BITMAP_SIZE (p - 1); | |
+ count++; | |
+ break; | |
+ | |
+#ifdef emacs | |
+ case before_dot: | |
+ case at_dot: | |
+ case after_dot: | |
+#endif /* emacs */ | |
+ case no_op: | |
+ case begline: | |
+ case endline: | |
+ case begbuf: | |
+ case endbuf: | |
+ case wordbound: | |
+ case notwordbound: | |
+ case wordbeg: | |
+ case wordend: | |
+ case symbeg: | |
+ case symend: | |
+ /* Zero width. */ | |
+ continue; | |
+ case lookahead: | |
+ case lookahead_not: | |
+ case lookbehind: | |
+ case lookbehind_not: | |
+ /* Skip to lookaround_success. */ | |
+ while (p < pend) | |
+ { | |
+ if ((re_opcode_t) *p++ == lookaround_succeed) | |
+ break; | |
+ } | |
+ break; | |
+ default: | |
+ return -1; | |
+ } | |
+ } | |
+ return count; | |
+} | |
+ | |
/* Compute a fastmap for the compiled pattern in BUFP. | |
A fastmap records which of the (1 << BYTEWIDTH) possible | |
@@ -4143,6 +4364,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, | |
bool best_regs_set = false; | |
re_char **best_regstart UNINIT, **best_regend UNINIT; | |
+ /* Discard a saved register from the stack. */ | |
+ bool discard_saved_regs = 0; | |
+ | |
/* Logically, this is 'best_regend[0]'. But we don't want to have to | |
allocate space for that if we're not allocating space for anything | |
else (see below). Also, we never need info about register 0 for | |
@@ -4633,6 +4857,77 @@ re_match_2_internal (struct re_pattern_buffer *bufp, | |
p += 1; | |
break; | |
+ case lookahead: | |
+ case lookahead_not: | |
+ DEBUG_PRINT ((re_opcode_t) *(p - 1) == lookahead ? "EXECUTING lookahead.\n" : "EXECUTING lookahead_not.\n"); | |
+ | |
+ p += 2; | |
+ PUSH_FAILURE_POINT (p - 3, d); | |
+ break; | |
+ | |
+ case lookbehind: | |
+ case lookbehind_not: | |
+ { | |
+ int mcnt, count_local; | |
+ bool not = (re_opcode_t) *(p - 1) != lookbehind; | |
+ | |
+ EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
+ EXTRACT_NUMBER_AND_INCR (count_local, p); | |
+ | |
+ DEBUG_PRINT (not | |
+ ? "EXECUTING lookbehind_not %d.\n" | |
+ : "EXECUTING lookbehind %d.\n", count); | |
+ | |
+ dfail = d; | |
+ while (d != string1 && count_local > 0) | |
+ { | |
+ if (d == string2) | |
+ { | |
+ if (!string1) | |
+ break; | |
+ d = end1; | |
+ dend = end_match_1; | |
+ } | |
+ | |
+ if (target_multibyte) | |
+ { | |
+ re_char *dhead = (d >= string1 && d <= end1) ? string1 : string2; | |
+ PREV_CHAR_BOUNDARY (d, dhead); | |
+ } | |
+ else | |
+ d--; | |
+ count_local--; | |
+ } | |
+ | |
+ if (count_local > 0) | |
+ { | |
+ if (not) | |
+ { | |
+ /* There is no enough string to match. | |
+ So just make it succeeded here. */ | |
+ d = dfail; | |
+ p = p - 2 + mcnt; | |
+ break; | |
+ } | |
+ else | |
+ goto fail; | |
+ } | |
+ | |
+ PUSH_FAILURE_POINT (p - 5, dfail); | |
+ } | |
+ break; | |
+ | |
+ case lookaround_succeed: | |
+ DEBUG_PRINT ("EXECUTING lookaround_succeed.\n"); | |
+ | |
+ FINISH_LOOKAROUND(); | |
+ break; | |
+ | |
+ case lookaround_fail: | |
+ DEBUG_PRINT ("EXECUTING lookaround_fail.\n"); | |
+ | |
+ FINISH_LOOKAROUND(); | |
+ goto fail; | |
/* \<digit> has been turned into a 'duplicate' command which is | |
followed by the numeric value of <digit> as the register number. */ | |
@@ -5198,12 +5493,25 @@ re_match_2_internal (struct re_pattern_buffer *bufp, | |
} | |
break; | |
+ case before_dot: | |
+ DEBUG_PRINT ("EXECUTING before_dot.\n"); | |
+ if (PTR_BYTE_POS (d) >= PT_BYTE) | |
+ goto fail; | |
+ break; | |
+ | |
+ | |
case at_dot: | |
DEBUG_PRINT ("EXECUTING at_dot.\n"); | |
if (PTR_BYTE_POS (d) != PT_BYTE) | |
goto fail; | |
break; | |
+ case after_dot: | |
+ DEBUG_PRINT ("EXECUTING after_dot.\n"); | |
+ if (PTR_BYTE_POS (d) <= PT_BYTE) | |
+ goto fail; | |
+ break; | |
+ | |
case categoryspec: | |
case notcategoryspec: | |
{ | |
@@ -5253,11 +5561,15 @@ re_match_2_internal (struct re_pattern_buffer *bufp, | |
case on_failure_jump_loop: | |
case on_failure_jump: | |
case succeed_n: | |
+ case lookahead_not: | |
+ case lookbehind_not: | |
d = str; | |
continue_failure_jump: | |
p = extract_address (pat); | |
break; | |
+ case lookahead: | |
+ case lookbehind: | |
case no_op: | |
/* A special frame used for nastyloops. */ | |
goto fail; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Adapted for Emacs 30 from this:
https://lists.gnu.org/archive/html/emacs-devel/2009-06/msg00094.html
Spec:
[x] (?<=[0-9]+)MB
[o] (?<=[0-9][0-9][0-9][0-9])MB
(re-search-backward "(?<=a)b") for buffer "abca_|_b"
will seek to first "ab".