Redesign interrupt/cancel API for regex engine.
authorThomas Munro <[email protected]>
Sat, 8 Apr 2023 09:57:46 +0000 (21:57 +1200)
committerThomas Munro <[email protected]>
Sat, 8 Apr 2023 10:10:39 +0000 (22:10 +1200)
Previously, a PostgreSQL-specific callback checked by the regex engine
had a way to trigger a special error code REG_CANCEL if it detected that
the next call to CHECK_FOR_INTERRUPTS() would certainly throw via
ereport().

A later proposed bugfix aims to move some complex logic out of signal
handlers, so that it won't run until the next CHECK_FOR_INTERRUPTS(),
which makes the above design impossible unless we split
CHECK_FOR_INTERRUPTS() into two phases, one to run logic and another to
ereport().  We may develop such a system in the future, but for the
regex code it is no longer necessary.

An earlier commit moved regex memory management over to our
MemoryContext system.  Given that the purpose of the two-phase interrupt
checking was to free memory before throwing, something we don't need to
worry about anymore, it seems simpler to inject CHECK_FOR_INTERRUPTS()
directly into cancelation points, and just let it throw.

Since the plan is to keep PostgreSQL-specific concerns separate from the
main regex engine code (with a view to bein able to stay in sync with
other projects), do this with a new macro INTERRUPT(), customizable in
regcustom.h and defaulting to nothing.

Reviewed-by: Tom Lane <[email protected]>
Discussion: https://postgr.es/m/CA%2BhUKGK3PGKwcKqzoosamn36YW-fsuTdOPPF1i_rtEO%3DnEYKSg%40mail.gmail.com

13 files changed:
src/backend/regex/regc_locale.c
src/backend/regex/regc_nfa.c
src/backend/regex/regcomp.c
src/backend/regex/rege_dfa.c
src/backend/regex/regexec.c
src/backend/utils/adt/jsonpath_gram.y
src/backend/utils/adt/regexp.c
src/backend/utils/adt/varlena.c
src/include/regex/regcustom.h
src/include/regex/regerrs.h
src/include/regex/regex.h
src/include/regex/regguts.h
src/test/modules/test_regex/test_regex.c

index b5f3a73b1bb29aa81d4fd0fc662a4f86b54fb2ab..77d1ce28168b20ff2775024925139fe2e060f4ea 100644 (file)
@@ -475,11 +475,7 @@ range(struct vars *v,                      /* context */
                        }
                        addchr(cv, cc);
                }
-               if (CANCEL_REQUESTED(v->re))
-               {
-                       ERR(REG_CANCEL);
-                       return NULL;
-               }
+               INTERRUPT(v->re);
        }
 
        return cv;
index 60fb0bec5d7baf20f7a316a46468dcfdfc38bbbd..f1819a24f6d32f2df28833fef9ddb68f06d5c8a2 100644 (file)
@@ -143,11 +143,7 @@ newstate(struct nfa *nfa)
         * compilation, since no code path will go very long without making a new
         * state or arc.
         */
-       if (CANCEL_REQUESTED(nfa->v->re))
-       {
-               NERR(REG_CANCEL);
-               return NULL;
-       }
+       INTERRUPT(nfa->v->re);
 
        /* first, recycle anything that's on the freelist */
        if (nfa->freestates != NULL)
@@ -297,11 +293,7 @@ newarc(struct nfa *nfa,
         * compilation, since no code path will go very long without making a new
         * state or arc.
         */
-       if (CANCEL_REQUESTED(nfa->v->re))
-       {
-               NERR(REG_CANCEL);
-               return;
-       }
+       INTERRUPT(nfa->v->re);
 
        /* check for duplicate arc, using whichever chain is shorter */
        if (from->nouts <= to->nins)
@@ -825,11 +817,7 @@ moveins(struct nfa *nfa,
                 * Because we bypass newarc() in this code path, we'd better include a
                 * cancel check.
                 */
-               if (CANCEL_REQUESTED(nfa->v->re))
-               {
-                       NERR(REG_CANCEL);
-                       return;
-               }
+               INTERRUPT(nfa->v->re);
 
                sortins(nfa, oldState);
                sortins(nfa, newState);
@@ -929,11 +917,7 @@ copyins(struct nfa *nfa,
                 * Because we bypass newarc() in this code path, we'd better include a
                 * cancel check.
                 */
-               if (CANCEL_REQUESTED(nfa->v->re))
-               {
-                       NERR(REG_CANCEL);
-                       return;
-               }
+               INTERRUPT(nfa->v->re);
 
                sortins(nfa, oldState);
                sortins(nfa, newState);
@@ -1000,11 +984,7 @@ mergeins(struct nfa *nfa,
         * Because we bypass newarc() in this code path, we'd better include a
         * cancel check.
         */
-       if (CANCEL_REQUESTED(nfa->v->re))
-       {
-               NERR(REG_CANCEL);
-               return;
-       }
+       INTERRUPT(nfa->v->re);
 
        /* Sort existing inarcs as well as proposed new ones */
        sortins(nfa, s);
@@ -1125,11 +1105,7 @@ moveouts(struct nfa *nfa,
                 * Because we bypass newarc() in this code path, we'd better include a
                 * cancel check.
                 */
-               if (CANCEL_REQUESTED(nfa->v->re))
-               {
-                       NERR(REG_CANCEL);
-                       return;
-               }
+               INTERRUPT(nfa->v->re);
 
                sortouts(nfa, oldState);
                sortouts(nfa, newState);
@@ -1226,11 +1202,7 @@ copyouts(struct nfa *nfa,
                 * Because we bypass newarc() in this code path, we'd better include a
                 * cancel check.
                 */
-               if (CANCEL_REQUESTED(nfa->v->re))
-               {
-                       NERR(REG_CANCEL);
-                       return;
-               }
+               INTERRUPT(nfa->v->re);
 
                sortouts(nfa, oldState);
                sortouts(nfa, newState);
@@ -3282,11 +3254,7 @@ checkmatchall_recurse(struct nfa *nfa, struct state *s, bool **haspaths)
                return false;
 
        /* In case the search takes a long time, check for cancel */
-       if (CANCEL_REQUESTED(nfa->v->re))
-       {
-               NERR(REG_CANCEL);
-               return false;
-       }
+       INTERRUPT(nfa->v->re);
 
        /* Create a haspath array for this state */
        haspath = (bool *) MALLOC((DUPINF + 2) * sizeof(bool));
index bb8c240598968946b5d95f74c1d43cebd2aa3761..8a6cfb2973d8c28129e42904aee0ef1894d05af2 100644 (file)
@@ -86,7 +86,6 @@ static int    newlacon(struct vars *v, struct state *begin, struct state *end,
                                         int latype);
 static void freelacons(struct subre *subs, int n);
 static void rfree(regex_t *re);
-static int     rcancelrequested(void);
 static int     rstacktoodeep(void);
 
 #ifdef REG_DEBUG
@@ -356,7 +355,6 @@ struct vars
 /* static function list */
 static const struct fns functions = {
        rfree,                                          /* regfree insides */
-       rcancelrequested,                       /* check for cancel request */
        rstacktoodeep                           /* check for stack getting dangerously deep */
 };
 
@@ -2468,22 +2466,6 @@ rfree(regex_t *re)
        }
 }
 
-/*
- * rcancelrequested - check for external request to cancel regex operation
- *
- * Return nonzero to fail the operation with error code REG_CANCEL,
- * zero to keep going
- *
- * The current implementation is Postgres-specific.  If we ever get around
- * to splitting the regex code out as a standalone library, there will need
- * to be some API to let applications define a callback function for this.
- */
-static int
-rcancelrequested(void)
-{
-       return InterruptPending && (QueryCancelPending || ProcDiePending);
-}
-
 /*
  * rstacktoodeep - check for stack getting dangerously deep
  *
index ba1289c64a9553d622068046c0f2037ca6ecc486..1f8f2ab1441ff1faba5871ac75f07df61d616618 100644 (file)
@@ -805,11 +805,7 @@ miss(struct vars *v,
         * Checking for operation cancel in the inner text search loop seems
         * unduly expensive.  As a compromise, check during cache misses.
         */
-       if (CANCEL_REQUESTED(v->re))
-       {
-               ERR(REG_CANCEL);
-               return NULL;
-       }
+       INTERRUPT(v->re);
 
        /*
         * What set of states would we end up in after consuming the co character?
index 3d9ff2e60790eaeee1c968e34a9b04f5eedb0948..2a1d5bebda30f9f616cb0b473b4eaba9bd25c67d 100644 (file)
@@ -764,8 +764,7 @@ cdissect(struct vars *v,
        MDEBUG(("%d: cdissect %c %ld-%ld\n", t->id, t->op, LOFF(begin), LOFF(end)));
 
        /* handy place to check for operation cancel */
-       if (CANCEL_REQUESTED(v->re))
-               return REG_CANCEL;
+       INTERRUPT(v->re);
        /* ... and stack overrun */
        if (STACK_TOO_DEEP(v->re))
                return REG_ETOOBIG;
index d34ad6b80da1e41cb3815ab345f5bceca0ae16c5..adc259d5bf88c9b41028d9b82136e69ba10b72dd 100644 (file)
@@ -553,8 +553,6 @@ makeItemLikeRegex(JsonPathParseItem *expr, JsonPathString *pattern,
                {
                        char        errMsg[100];
 
-                       /* See regexp.c for explanation */
-                       CHECK_FOR_INTERRUPTS();
                        pg_regerror(re_result, &re_tmp, errMsg, sizeof(errMsg));
                        ereturn(escontext, false,
                                        (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
index 45280902d6e56cfaf13bee3995a9b7dc37c2f13b..702cd52b6d429018e51c3167407c0128a2962fac 100644 (file)
@@ -218,15 +218,6 @@ RE_compile_and_cache(text *text_re, int cflags, Oid collation)
        if (regcomp_result != REG_OKAY)
        {
                /* re didn't compile (no need for pg_regfree, if so) */
-
-               /*
-                * Here and in other places in this file, do CHECK_FOR_INTERRUPTS
-                * before reporting a regex error.  This is so that if the regex
-                * library aborts and returns REG_CANCEL, we don't print an error
-                * message that implies the regex was invalid.
-                */
-               CHECK_FOR_INTERRUPTS();
-
                pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
                ereport(ERROR,
                                (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
@@ -308,7 +299,6 @@ RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
        if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
        {
                /* re failed??? */
-               CHECK_FOR_INTERRUPTS();
                pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
                ereport(ERROR,
                                (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
@@ -2001,7 +1991,6 @@ regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
 
                default:
                        /* re failed??? */
-                       CHECK_FOR_INTERRUPTS();
                        pg_regerror(re_result, re, errMsg, sizeof(errMsg));
                        ereport(ERROR,
                                        (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
index f9a607adaff1755bbe9427f2779d1fee8d508d86..b5718764684879ba079be8e65c41647008356a66 100644 (file)
@@ -4265,7 +4265,6 @@ replace_text_regexp(text *src_text, text *pattern_text,
                {
                        char            errMsg[100];
 
-                       CHECK_FOR_INTERRUPTS();
                        pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
                        ereport(ERROR,
                                        (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
index 8f4025128ec9555eae8d1606ef5f94f8d7c8be3e..bedee1e9cacaf1f1498a147b5164b8a6c4e4bbde 100644 (file)
@@ -44,7 +44,7 @@
 
 #include "mb/pg_wchar.h"
 
-#include "miscadmin.h"                 /* needed by rcancelrequested/rstacktoodeep */
+#include "miscadmin.h"                 /* needed by stacktoodeep */
 
 
 /* overrides for regguts.h definitions, if any */
@@ -52,6 +52,7 @@
 #define MALLOC(n)              palloc_extended((n), MCXT_ALLOC_NO_OOM)
 #define FREE(p)                        pfree(VS(p))
 #define REALLOC(p,n)   repalloc_extended(VS(p),(n), MCXT_ALLOC_NO_OOM)
+#define INTERRUPT(re)  CHECK_FOR_INTERRUPTS()
 #define assert(x)              Assert(x)
 
 /* internal character type and related */
index 41e25f7ff00e5cae1e2f07555653ce6db017dead..2c8873eb81038a4e02d18215d6509271d5e27747 100644 (file)
@@ -81,7 +81,3 @@
 {
        REG_ECOLORS, "REG_ECOLORS", "too many colors"
 },
-
-{
-       REG_CANCEL, "REG_CANCEL", "operation cancelled"
-},
index 1297abec62f6c85f142f1d6bbee298055726e780..d08113724f6c90b99ea7f3ac7563ece483cb27a1 100644 (file)
@@ -156,7 +156,6 @@ typedef struct
 #define REG_BADOPT     18                      /* invalid embedded option */
 #define REG_ETOOBIG 19                 /* regular expression is too complex */
 #define REG_ECOLORS 20                 /* too many colors */
-#define REG_CANCEL     21                      /* operation cancelled */
 /* two specials for debugging and testing */
 #define REG_ATOI       101                     /* convert error-code name to number */
 #define REG_ITOA       102                     /* convert error-code number to name */
index 91a52840c4743ec34bd41044a822f01d58af1fea..3ca3647e118ea7ace436e3d39273fbda05f69236 100644 (file)
 #define FREE(p)                free(VS(p))
 #endif
 
+/* interruption */
+#ifndef INTERRUPT
+#define INTERRUPT(re)
+#endif
+
 /* want size of a char in bits, and max value in bounded quantifiers */
 #ifndef _POSIX2_RE_DUP_MAX
 #define _POSIX2_RE_DUP_MAX     255 /* normally from <limits.h> */
@@ -510,13 +515,9 @@ struct subre
 struct fns
 {
        void            FUNCPTR(free, (regex_t *));
-       int                     FUNCPTR(cancel_requested, (void));
        int                     FUNCPTR(stack_too_deep, (void));
 };
 
-#define CANCEL_REQUESTED(re)  \
-       ((*((struct fns *) (re)->re_fns)->cancel_requested) ())
-
 #define STACK_TOO_DEEP(re)     \
        ((*((struct fns *) (re)->re_fns)->stack_too_deep) ())
 
index 1d4f79c9d317b2565f091b939190683978c733db..d1dd48a993bf3cc3ff1e62a4cfdb9499882f05a6 100644 (file)
@@ -185,15 +185,6 @@ test_re_compile(text *text_re, int cflags, Oid collation,
        if (regcomp_result != REG_OKAY)
        {
                /* re didn't compile (no need for pg_regfree, if so) */
-
-               /*
-                * Here and in other places in this file, do CHECK_FOR_INTERRUPTS
-                * before reporting a regex error.  This is so that if the regex
-                * library aborts and returns REG_CANCEL, we don't print an error
-                * message that implies the regex was invalid.
-                */
-               CHECK_FOR_INTERRUPTS();
-
                pg_regerror(regcomp_result, result_re, errMsg, sizeof(errMsg));
                ereport(ERROR,
                                (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
@@ -239,7 +230,6 @@ test_re_execute(regex_t *re, pg_wchar *data, int data_len,
        if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
        {
                /* re failed??? */
-               CHECK_FOR_INTERRUPTS();
                pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
                ereport(ERROR,
                                (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),