Skip to content

Instantly share code, notes, and snippets.

@snippins
Last active December 2, 2024 19:27
Show Gist options
  • Save snippins/2d9a66e6708bc258f662babe0a1c1009 to your computer and use it in GitHub Desktop.
Save snippins/2d9a66e6708bc258f662babe0a1c1009 to your computer and use it in GitHub Desktop.
Emacs 30 patch for regex lookaround - rev a8169bee2064282a40214ef65ef0493233ed4669 - 2024-12-01 master
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index 3bf3ad9..8bbfb78 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -44,6 +44,9 @@
#endif
#define RE_DUP_MAX (0xffff)
+#define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
+#define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
+
/* Make syntax table lookup grant data in gl_state. */
#define SYNTAX(c) syntax_property (c, 1)
@@ -323,7 +326,16 @@ #define BYTEWIDTH 8 /* In bits. */
/* Matches any character whose syntax is not that specified. */
notsyntaxspec,
+ lookahead,
+ lookahead_not,
+ lookbehind,
+ lookbehind_not,
+ lookaround_succeed,
+ lookaround_fail,
+
+ before_dot, /* Succeeds if before point. */
at_dot, /* Succeeds if at point. */
+ after_dot, /* Succeeds if after point. */
/* Matches any character whose category-set contains the specified
category. The operator is followed by a byte which contains a
@@ -540,6 +552,31 @@ print_partial_compiled_pattern (FILE *dest, re_char *start, re_char *end)
fprintf (dest, "/stop_memory/%d", *p++);
break;
+ case lookahead:
+ extract_number_and_incr (&mcnt, &p);
+ fprintf (dest, "/lookahead/%d", mcnt);
+ break;
+ case lookahead_not:
+ extract_number_and_incr (&mcnt, &p);
+ fprintf (dest, "/lookahead_not/%d", mcnt);
+ break;
+ case lookbehind:
+ extract_number_and_incr (&mcnt, &p);
+ extract_number_and_incr (&mcnt2, &p);
+ fprintf (dest, "/lookbehind/%d/%d", mcnt, mcnt2);
+ break;
+ case lookbehind_not:
+ extract_number_and_incr (&mcnt, &p);
+ extract_number_and_incr (&mcnt2, &p);
+ fprintf (dest, "/lookbehind_not/%d/%d", mcnt, mcnt2);
+ break;
+ case lookaround_succeed:
+ fprintf (dest, "/lookaround_succeed");
+ break;
+ case lookaround_fail:
+ fprintf (dest, "/lookaround_fail");
+ break;
+
case duplicate:
fprintf (dest, "/duplicate/%d", *p++);
break;
@@ -704,10 +741,18 @@ print_partial_compiled_pattern (FILE *dest, re_char *start, re_char *end)
fprintf (dest, "/%d", mcnt);
break;
+ case before_dot:
+ fprintf (dest, "/before_dot");
+ break;
+
case at_dot:
fputs ("/at_dot", dest);
break;
+ case after_dot:
+ fprintf (dest, "/after_dot");
+ break;
+
case categoryspec:
fputs ("/categoryspec", dest);
mcnt = *p++;
@@ -1159,6 +1204,28 @@ #define POP_FAILURE_POINT(str, pat) \
DEBUG_STATEMENT (nfailure_points_popped++); \
} while (false) /* POP_FAILURE_POINT */
+ #define FINISH_LOOKAROUND() \
+ do { \
+ re_char *str, *pat; \
+ re_opcode_t op; \
+ discard_saved_regs = 1; \
+ while (!FAIL_STACK_EMPTY ()) \
+ { \
+ POP_FAILURE_POINT (str, pat); \
+ op = (re_opcode_t) *pat; \
+ if (op == lookahead \
+ || op == lookahead_not \
+ || op == lookbehind \
+ || op == lookbehind_not) \
+ { \
+ d = str; \
+ dend = ((d >= string1 && d <= end1) \
+ ? end_match_1 : end_match_2); \
+ break; \
+ } \
+ } \
+ discard_saved_regs = 0; \
+ } while (0);
/* Registers are set to a sentinel when they haven't yet matched. */
@@ -1298,6 +1365,7 @@ static_assert (LONG_MIN <= -(MAX_BUF_SIZE - 1) && MAX_BUF_SIZE - 1 <= LONG_MAX);
pattern_offset_t fixup_alt_jump;
pattern_offset_t laststart_offset;
regnum_t regnum;
+ int lookaround;
} compile_stack_elt_t;
@@ -1669,6 +1737,7 @@ extend_range_table_work_area (struct range_table_work_area *work_area)
/* regex_compile and helpers. */
static bool group_in_compile_stack (compile_stack_type, regnum_t);
+static int exact_chars_in_pattern_buffer (struct re_pattern_buffer *bufp, re_char *p, re_char *pend);
/* Insert the 'jump' from the end of last alternative to "here".
The space for the jump has already been allocated. */
@@ -2209,6 +2278,7 @@ regex_compile (re_char *pattern, ptrdiff_t size,
case '(':
{
bool shy = false;
+ int lookaround = 0;
regnum_t regnum = 0;
if (p+1 < pend)
{
@@ -2234,6 +2304,27 @@ regex_compile (re_char *pattern, ptrdiff_t size,
|| ckd_add (&regnum, regnum, c - '0'))
FREE_STACK_RETURN (REG_ESIZE);
break;
+ case '=':
+ /* Positive lookahead assertion. */
+ shy = lookaround = 1;
+ break;
+ case '!':
+ /* Negative lookahead assertion. */
+ shy = lookaround = 2;
+ break;
+ case '<':
+ {
+ PATFETCH (c);
+ if (c == '=')
+ /* Positive lookbehind assertion. */
+ shy = lookaround = -1;
+ else if (c == '!')
+ /* Negative lookbehind assertion. */
+ shy = lookaround = -2;
+ else
+ FREE_STACK_RETURN (REG_BADPAT);
+ }
+ break;
default:
/* Only (?:...) is supported right now. */
FREE_STACK_RETURN (REG_BADPAT);
@@ -2276,6 +2367,7 @@ regex_compile (re_char *pattern, ptrdiff_t size,
= fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
COMPILE_STACK_TOP.regnum = regnum;
+ COMPILE_STACK_TOP.lookaround = lookaround;
/* Do not push a start_memory for groups beyond the last one
we can represent in the compiled pattern. */
@@ -2312,6 +2404,7 @@ regex_compile (re_char *pattern, ptrdiff_t size,
later groups should continue to be numbered higher,
as in '(ab)c(de)' -- the second group is #2. */
regnum_t regnum;
+ int lookaround;
compile_stack.avail--;
begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
@@ -2324,13 +2417,38 @@ regex_compile (re_char *pattern, ptrdiff_t size,
/* If we've reached MAX_REGNUM groups, then this open
won't actually generate any code, so we'll have to
clear pending_exact explicitly. */
+ lookaround = COMPILE_STACK_TOP.lookaround;
pending_exact = 0;
/* We're at the end of the group, so now we know how many
groups were inside this one. */
if (regnum <= MAX_REGNUM && regnum > 0)
BUF_PUSH_2 (stop_memory, regnum);
- }
+ else if (lookaround)
+ {
+ if (lookaround > 0)
+ {
+ /* Positive/negative lookahead assertion. */
+ GET_BUFFER_SPACE (3);
+ INSERT_JUMP (lookaround == 1 ? lookahead : lookahead_not, laststart, b + 4);
+ b += 3;
+ }
+ else
+ {
+ /* Positive/negative lookbehind assertion. */
+ int count = exact_chars_in_pattern_buffer (bufp, laststart, b);
+ if (count == -1) /* variable length */
+ FREE_STACK_RETURN (REG_BADPAT);
+ GET_BUFFER_SPACE (5);
+ INSERT_JUMP2 (lookaround == -1 ? lookbehind : lookbehind_not, laststart, b + 6, count);
+ b += 5;
+ }
+ /* Negative form. */
+ if (lookaround > 1 || lookaround < -1)
+ BUF_PUSH (lookaround_fail);
+ BUF_PUSH (lookaround_succeed);
+ }
+ }
break;
@@ -3208,7 +3326,15 @@ analyze_first_fastmap (const re_char *p, void *arg)
}
return true;
+ case lookahead:
+ case lookahead_not:
+ case lookbehind:
+ case lookbehind_not:
+ return -1;
+
+ case before_dot:
case at_dot:
+ case after_dot:
case begbuf:
case wordbound:
case notwordbound:
@@ -3256,7 +3382,15 @@ analyze_first_null (const re_char *p, void *arg)
case notcategoryspec:
return true;
+ case lookahead:
+ case lookahead_not:
+ case lookbehind:
+ case lookbehind_not:
+ return -1;
+
+ case before_dot:
case at_dot:
+ case after_dot:
case begbuf:
case wordbound:
case notwordbound:
@@ -3302,6 +3436,93 @@ analyze_first (struct re_pattern_buffer *bufp,
return !safe;
}
+static int
+exact_chars_in_pattern_buffer (bufp, p, pend)
+ struct re_pattern_buffer *bufp;
+ re_char *p, *pend;
+{
+ int count = 0;
+ while (p < pend)
+ {
+ switch (*p++)
+ {
+ case exactn:
+ {
+ int mcnt = *p++;
+ int buf_charlen;
+ while (mcnt > 0) {
+ STRING_CHAR_AND_LENGTH (p, buf_charlen);
+ p += buf_charlen;
+ mcnt -= buf_charlen;
+ count++;
+ }
+ }
+ break;
+ case start_memory:
+ case stop_memory:
+ p++;
+ break;
+#ifdef emacs
+ case categoryspec:
+ case notcategoryspec:
+#endif /* emacs */
+ case syntaxspec:
+ case notsyntaxspec:
+ p++;
+ case anychar:
+ count++;
+ break;
+
+ case charset:
+ case charset_not:
+ if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
+ {
+ int mcnt;
+ p = CHARSET_RANGE_TABLE (p - 1);
+ EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ p = CHARSET_RANGE_TABLE_END (p, mcnt);
+ }
+ else
+ p += 1 + CHARSET_BITMAP_SIZE (p - 1);
+ count++;
+ break;
+
+#ifdef emacs
+ case before_dot:
+ case at_dot:
+ case after_dot:
+#endif /* emacs */
+ case no_op:
+ case begline:
+ case endline:
+ case begbuf:
+ case endbuf:
+ case wordbound:
+ case notwordbound:
+ case wordbeg:
+ case wordend:
+ case symbeg:
+ case symend:
+ /* Zero width. */
+ continue;
+ case lookahead:
+ case lookahead_not:
+ case lookbehind:
+ case lookbehind_not:
+ /* Skip to lookaround_success. */
+ while (p < pend)
+ {
+ if ((re_opcode_t) *p++ == lookaround_succeed)
+ break;
+ }
+ break;
+ default:
+ return -1;
+ }
+ }
+ return count;
+}
+
/* Compute a fastmap for the compiled pattern in BUFP.
A fastmap records which of the (1 << BYTEWIDTH) possible
@@ -4143,6 +4364,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
bool best_regs_set = false;
re_char **best_regstart UNINIT, **best_regend UNINIT;
+ /* Discard a saved register from the stack. */
+ bool discard_saved_regs = 0;
+
/* Logically, this is 'best_regend[0]'. But we don't want to have to
allocate space for that if we're not allocating space for anything
else (see below). Also, we never need info about register 0 for
@@ -4633,6 +4857,77 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
p += 1;
break;
+ case lookahead:
+ case lookahead_not:
+ DEBUG_PRINT ((re_opcode_t) *(p - 1) == lookahead ? "EXECUTING lookahead.\n" : "EXECUTING lookahead_not.\n");
+
+ p += 2;
+ PUSH_FAILURE_POINT (p - 3, d);
+ break;
+
+ case lookbehind:
+ case lookbehind_not:
+ {
+ int mcnt, count_local;
+ bool not = (re_opcode_t) *(p - 1) != lookbehind;
+
+ EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ EXTRACT_NUMBER_AND_INCR (count_local, p);
+
+ DEBUG_PRINT (not
+ ? "EXECUTING lookbehind_not %d.\n"
+ : "EXECUTING lookbehind %d.\n", count);
+
+ dfail = d;
+ while (d != string1 && count_local > 0)
+ {
+ if (d == string2)
+ {
+ if (!string1)
+ break;
+ d = end1;
+ dend = end_match_1;
+ }
+
+ if (target_multibyte)
+ {
+ re_char *dhead = (d >= string1 && d <= end1) ? string1 : string2;
+ PREV_CHAR_BOUNDARY (d, dhead);
+ }
+ else
+ d--;
+ count_local--;
+ }
+
+ if (count_local > 0)
+ {
+ if (not)
+ {
+ /* There is no enough string to match.
+ So just make it succeeded here. */
+ d = dfail;
+ p = p - 2 + mcnt;
+ break;
+ }
+ else
+ goto fail;
+ }
+
+ PUSH_FAILURE_POINT (p - 5, dfail);
+ }
+ break;
+
+ case lookaround_succeed:
+ DEBUG_PRINT ("EXECUTING lookaround_succeed.\n");
+
+ FINISH_LOOKAROUND();
+ break;
+
+ case lookaround_fail:
+ DEBUG_PRINT ("EXECUTING lookaround_fail.\n");
+
+ FINISH_LOOKAROUND();
+ goto fail;
/* \<digit> has been turned into a 'duplicate' command which is
followed by the numeric value of <digit> as the register number. */
@@ -5198,12 +5493,25 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
}
break;
+ case before_dot:
+ DEBUG_PRINT ("EXECUTING before_dot.\n");
+ if (PTR_BYTE_POS (d) >= PT_BYTE)
+ goto fail;
+ break;
+
+
case at_dot:
DEBUG_PRINT ("EXECUTING at_dot.\n");
if (PTR_BYTE_POS (d) != PT_BYTE)
goto fail;
break;
+ case after_dot:
+ DEBUG_PRINT ("EXECUTING after_dot.\n");
+ if (PTR_BYTE_POS (d) <= PT_BYTE)
+ goto fail;
+ break;
+
case categoryspec:
case notcategoryspec:
{
@@ -5253,11 +5561,15 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
case on_failure_jump_loop:
case on_failure_jump:
case succeed_n:
+ case lookahead_not:
+ case lookbehind_not:
d = str;
continue_failure_jump:
p = extract_address (pat);
break;
+ case lookahead:
+ case lookbehind:
case no_op:
/* A special frame used for nastyloops. */
goto fail;
@snippins
Copy link
Author

snippins commented Aug 8, 2021

Adapted for Emacs 30 from this:

https://lists.gnu.org/archive/html/emacs-devel/2009-06/msg00094.html

Spec:

  • Any pattern is allowed in lookahead assertion.
  • Nested lookaround assertion is allowed.
  • Capturing is allowed only in positive lookahead/lookbehind assertion.
  • Duplication is allowed after such assertion.
  • Variable length pattern is NOT yet allowed in lookbehind assertion.
    [x] (?<=[0-9]+)MB
    [o] (?<=[0-9][0-9][0-9][0-9])MB
  • Lookahead assertion over start bound is not allowed in re-search-backward.
    (re-search-backward "(?<=a)b") for buffer "abca_|_b"
    will seek to first "ab".

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment