Add websearch_to_tsquery
authorTeodor Sigaev <[email protected]>
Thu, 5 Apr 2018 16:55:11 +0000 (19:55 +0300)
committerTeodor Sigaev <[email protected]>
Thu, 5 Apr 2018 16:55:11 +0000 (19:55 +0300)
Error-tolerant conversion function with web-like syntax for search queries;
it simplifies building a search engine with an interface close to what users
are accustomed to.

Bump catalog version

Authors: Victor Drobny, Dmitry Ivanov, with editing by me
Reviewed by: Aleksander Alekseev, Tomas Vondra, Thomas Munro, Aleksandr Parfenov
Discussion: https://www.postgresql.org/message-id/flat/fe931111ff7e9ad79196486ada79e268@postgrespro.ru

doc/src/sgml/func.sgml
doc/src/sgml/textsearch.sgml
src/backend/tsearch/to_tsany.c
src/backend/utils/adt/tsquery.c
src/backend/utils/adt/tsvector.c
src/backend/utils/adt/tsvector_parser.c
src/include/catalog/catversion.h
src/include/catalog/pg_proc.h
src/include/tsearch/ts_utils.h
src/test/regress/expected/tsearch.out
src/test/regress/sql/tsearch.sql

index 9a1efc14cf76c2325a1be9150501cb1c5e347780..122f034f17763c2f90a3a0b44ecf2ee13cc365e5 100644 (file)
@@ -9630,6 +9630,18 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
         <entry><literal>phraseto_tsquery('english', 'The Fat Rats')</literal></entry>
         <entry><literal>'fat' &lt;-&gt; 'rat'</literal></entry>
        </row>
+       <row>
+        <entry>
+         <indexterm>
+          <primary>websearch_to_tsquery</primary>
+         </indexterm>
+          <literal><function>websearch_to_tsquery(<optional> <replaceable class="parameter">config</replaceable> <type>regconfig</type> , </optional> <replaceable class="parameter">query</replaceable> <type>text</type>)</function></literal>
+         </entry>
+        <entry><type>tsquery</type></entry>
+        <entry>produce <type>tsquery</type> from a web search style query</entry>
+        <entry><literal>websearch_to_tsquery('english', '"fat rat" or rat')</literal></entry>
+        <entry><literal>'fat' &lt;-&gt; 'rat' | 'rat'</literal></entry>
+       </row>
        <row>
         <entry>
          <indexterm>
index 610b7bf03374f70a887ad552614f59c4d25d268e..19f58511c8221976db48f30bf19262f3f540e266 100644 (file)
@@ -797,13 +797,16 @@ UPDATE tt SET ti =
    <para>
     <productname>PostgreSQL</productname> provides the
     functions <function>to_tsquery</function>,
-    <function>plainto_tsquery</function>, and
-    <function>phraseto_tsquery</function>
+    <function>plainto_tsquery</function>,
+    <function>phraseto_tsquery</function> and
+    <function>websearch_to_tsquery</function>
     for converting a query to the <type>tsquery</type> data type.
     <function>to_tsquery</function> offers access to more features
     than either <function>plainto_tsquery</function> or
-    <function>phraseto_tsquery</function>, but it is less forgiving
-    about its input.
+    <function>phraseto_tsquery</function>, but it is less forgiving about its
+    input. <function>websearch_to_tsquery</function> is a simplified version
+    of <function>to_tsquery</function> with an alternative syntax, similar
+    to the one used by web search engines.
    </para>
 
    <indexterm>
@@ -962,6 +965,87 @@ SELECT phraseto_tsquery('english', 'The Fat &amp; Rats:C');
 </screen>
    </para>
 
+<synopsis>
+websearch_to_tsquery(<optional> <replaceable class="parameter">config</replaceable> <type>regconfig</type>, </optional> <replaceable class="parameter">querytext</replaceable> <type>text</type>) returns <type>tsquery</type>
+</synopsis>
+
+   <para>
+    <function>websearch_to_tsquery</function> creates a <type>tsquery</type>
+    value from <replaceable>querytext</replaceable> using an alternative
+    syntax in which simple unformatted text is a valid query.
+    Unlike <function>plainto_tsquery</function>
+    and <function>phraseto_tsquery</function>, it also recognizes certain
+    operators. Moreover, this function should never raise syntax errors,
+    which makes it possible to use raw user-supplied input for search.
+    The following syntax is supported:
+    <itemizedlist  spacing="compact" mark="bullet">
+     <listitem>
+       <para>
+        <literal>unquoted text</literal>: text not inside quote marks will be
+        converted to terms separated by <literal>&amp;</literal> operators, as
+        if processed by
+        <function>plainto_tsquery</function>.
+      </para>
+     </listitem>
+     <listitem>
+       <para>
+        <literal>"quoted text"</literal>: text inside quote marks will be
+        converted to terms separated by <literal>&lt;-&gt;</literal>
+        operators, as if processed by <function>phraseto_tsquery</function>.
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       <literal>OR</literal>: logical or will be converted to
+       the <literal>|</literal> operator.
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       <literal>-</literal>: the logical not operator, converted to
+       the <literal>!</literal> operator.
+      </para>
+     </listitem>
+    </itemizedlist>
+   </para>
+   <para>
+    Examples:
+    <screen>
+      select websearch_to_tsquery('english', 'The fat rats');
+       websearch_to_tsquery
+      ----------------------
+       'fat' &amp; 'rat'
+      (1 row)
+    </screen>
+    <screen>
+      select websearch_to_tsquery('english', '"supernovae stars" -crab');
+             websearch_to_tsquery
+      ----------------------------------
+       'supernova' &lt;-&gt; 'star' &amp; !'crab'
+      (1 row)
+    </screen>
+    <screen>
+      select websearch_to_tsquery('english', '"sad cat" or "fat rat"');
+             websearch_to_tsquery
+      -----------------------------------
+       'sad' &lt;-&gt; 'cat' | 'fat' &lt;-&gt; 'rat'
+      (1 row)
+    </screen>
+    <screen>
+      select websearch_to_tsquery('english', 'signal -"segmentation fault"');
+               websearch_to_tsquery
+      ---------------------------------------
+       'signal' &amp; !( 'segment' &lt;-&gt; 'fault' )
+      (1 row)
+    </screen>
+    <screen>
+      select websearch_to_tsquery('english', '""" )( dummy \\ query &lt;-&gt;');
+       websearch_to_tsquery
+      ----------------------
+       'dummi' &amp; 'queri'
+      (1 row)
+    </screen>
+    </para>
   </sect2>
 
   <sect2 id="textsearch-ranking">
index ea5947a3a82074a00b4fbbac7535a31745193012..6055fb6b4e57060d0b09df5efaa2b2db14ade4f1 100644 (file)
@@ -490,7 +490,7 @@ to_tsquery_byid(PG_FUNCTION_ARGS)
        query = parse_tsquery(text_to_cstring(in),
                                                  pushval_morph,
                                                  PointerGetDatum(&data),
-                                                 false);
+                                                 0);
 
        PG_RETURN_TSQUERY(query);
 }
@@ -520,7 +520,7 @@ plainto_tsquery_byid(PG_FUNCTION_ARGS)
        query = parse_tsquery(text_to_cstring(in),
                                                  pushval_morph,
                                                  PointerGetDatum(&data),
-                                                 true);
+                                                 P_TSQ_PLAIN);
 
        PG_RETURN_POINTER(query);
 }
@@ -551,7 +551,7 @@ phraseto_tsquery_byid(PG_FUNCTION_ARGS)
        query = parse_tsquery(text_to_cstring(in),
                                                  pushval_morph,
                                                  PointerGetDatum(&data),
-                                                 true);
+                                                 P_TSQ_PLAIN);
 
        PG_RETURN_TSQUERY(query);
 }
@@ -567,3 +567,35 @@ phraseto_tsquery(PG_FUNCTION_ARGS)
                                                                                ObjectIdGetDatum(cfgId),
                                                                                PointerGetDatum(in)));
 }
+
+Datum
+websearch_to_tsquery_byid(PG_FUNCTION_ARGS)
+{
+       text       *in = PG_GETARG_TEXT_PP(1);
+       MorphOpaque     data;
+       TSQuery         query = NULL;
+
+       data.cfg_id = PG_GETARG_OID(0);
+
+       data.qoperator = OP_AND;
+
+       query = parse_tsquery(text_to_cstring(in),
+                                                 pushval_morph,
+                                                 PointerGetDatum(&data),
+                                                 P_TSQ_WEB);
+
+       PG_RETURN_TSQUERY(query);
+}
+
+Datum
+websearch_to_tsquery(PG_FUNCTION_ARGS)
+{
+       text       *in = PG_GETARG_TEXT_PP(0);
+       Oid                     cfgId;
+
+       cfgId = getTSCurrentConfig(true);
+       PG_RETURN_DATUM(DirectFunctionCall2(websearch_to_tsquery_byid,
+                                                                               ObjectIdGetDatum(cfgId),
+                                                                               PointerGetDatum(in)));
+
+}
index 1ccbf79030620c3002f10831ba4a54054b8bf4a6..793c0e5dd1c01253f478acbc0ba21b15686840f0 100644 (file)
@@ -32,14 +32,53 @@ const int   tsearch_op_priority[OP_COUNT] =
        3                                                       /* OP_PHRASE */
 };
 
+/*
+ * parser's states
+ */
+typedef enum
+{
+       WAITOPERAND = 1,
+       WAITOPERATOR = 2,
+       WAITFIRSTOPERAND = 3
+} ts_parserstate;
+
+/*
+ * token types for parsing
+ */
+typedef enum
+{
+       PT_END = 0,
+       PT_ERR = 1,
+       PT_VAL = 2,
+       PT_OPR = 3,
+       PT_OPEN = 4,
+       PT_CLOSE = 5
+} ts_tokentype;
+
+/*
+ * get token from query string
+ *
+ * *operator is filled in with OP_* when return value is PT_OPR,
+ * but *weight could contain a distance value in case of phrase operator.
+ * *strval, *lenval and *weight are filled in when return value is PT_VAL
+ *
+ */
+typedef ts_tokentype (*ts_tokenizer)(TSQueryParserState state, int8 *operator,
+                                                                        int *lenval, char **strval,
+                                                                        int16 *weight, bool *prefix);
+
 struct TSQueryParserStateData
 {
-       /* State for gettoken_query */
+       /* Tokenizer used for parsing tsquery */
+       ts_tokenizer gettoken;
+
+       /* State of tokenizer function */
        char       *buffer;                     /* entire string we are scanning */
        char       *buf;                        /* current scan point */
-       int                     state;
        int                     count;                  /* nesting count, incremented by (,
                                                                 * decremented by ) */
+       bool            in_quotes;              /* phrase in quotes "" */
+       ts_parserstate state;
 
        /* polish (prefix) notation in list, filled in by push* functions */
        List       *polstr;
@@ -57,12 +96,6 @@ struct TSQueryParserStateData
        TSVectorParseState valstate;
 };
 
-/* parser's states */
-#define WAITOPERAND 1
-#define WAITOPERATOR   2
-#define WAITFIRSTOPERAND 3
-#define WAITSINGLEOPERAND 4
-
 /*
  * subroutine to parse the modifiers (weight and prefix flag currently)
  * part, like ':AB*' of a query.
@@ -118,18 +151,17 @@ get_modifiers(char *buf, int16 *weight, bool *prefix)
  *
  * The buffer should begin with '<' char
  */
-static char *
-parse_phrase_operator(char *buf, int16 *distance)
+static bool
+parse_phrase_operator(TSQueryParserState pstate, int16 *distance)
 {
        enum
        {
                PHRASE_OPEN = 0,
                PHRASE_DIST,
                PHRASE_CLOSE,
-               PHRASE_ERR,
                PHRASE_FINISH
        }                       state = PHRASE_OPEN;
-       char       *ptr = buf;
+       char       *ptr = pstate->buf;
        char       *endptr;
        long            l = 1;                  /* default distance */
 
@@ -138,9 +170,13 @@ parse_phrase_operator(char *buf, int16 *distance)
                switch (state)
                {
                        case PHRASE_OPEN:
-                               Assert(t_iseq(ptr, '<'));
-                               state = PHRASE_DIST;
-                               ptr++;
+                               if (t_iseq(ptr, '<'))
+                               {
+                                       state = PHRASE_DIST;
+                                       ptr++;
+                               }
+                               else
+                                       return false;
                                break;
 
                        case PHRASE_DIST:
@@ -148,18 +184,16 @@ parse_phrase_operator(char *buf, int16 *distance)
                                {
                                        state = PHRASE_CLOSE;
                                        ptr++;
-                                       break;
+                                       continue;
                                }
+
                                if (!t_isdigit(ptr))
-                               {
-                                       state = PHRASE_ERR;
-                                       break;
-                               }
+                                       return false;
 
                                errno = 0;
                                l = strtol(ptr, &endptr, 10);
                                if (ptr == endptr)
-                                       state = PHRASE_ERR;
+                                       return false;
                                else if (errno == ERANGE || l < 0 || l > MAXENTRYPOS)
                                        ereport(ERROR,
                                                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -179,54 +213,77 @@ parse_phrase_operator(char *buf, int16 *distance)
                                        ptr++;
                                }
                                else
-                                       state = PHRASE_ERR;
+                                       return false;
                                break;
 
                        case PHRASE_FINISH:
                                *distance = (int16) l;
-                               return ptr;
-
-                       case PHRASE_ERR:
-                       default:
-                               goto err;
+                               pstate->buf = ptr;
+                               return true;
                }
        }
 
-err:
-       *distance = -1;
-       return buf;
+       return false;
 }
 
 /*
- * token types for parsing
+ * Parse the OR operator used in websearch_to_tsquery(); returns true if we
+ * believe that the "OR" literal is an OR operator rather than a search term
  */
-typedef enum
+static bool
+parse_or_operator(TSQueryParserState pstate)
 {
-       PT_END = 0,
-       PT_ERR = 1,
-       PT_VAL = 2,
-       PT_OPR = 3,
-       PT_OPEN = 4,
-       PT_CLOSE = 5
-} ts_tokentype;
+       char *ptr = pstate->buf;
+
+       if (pstate->in_quotes)
+               return false;
+
+       /* it should begin with "OR" literal */
+       if (pg_strncasecmp(ptr, "or", 2) != 0)
+               return false;
+
+       ptr += 2;
+
+       /*
+        * "or" must not be part of a longer word, and it must be followed
+        * somewhere later in the string by an operand
+        */
+       if (*ptr == '\0') /* no operand */
+               return false;
+
+       /* it shouldn't be a part of any word */
+   if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha(ptr) || t_isdigit(ptr))
+               return false;
+
+       for(;;)
+       {
+               ptr += pg_mblen(ptr);
+
+               if (*ptr == '\0') /* got end of string without operand */
+                       return false;
+
+               /*
+                * We found a possible operand, though it may not be a valid one.
+                * Still, treat the OR literal as an operator with a possibly
+                * incorrect operand, and do not search for it as a lexeme
+                */
+               if (!t_isspace(ptr))
+                       break;
+       }
+
+       pstate->buf += 2;
+       return true;
+}
 
-/*
- * get token from query string
- *
- * *operator is filled in with OP_* when return values is PT_OPR,
- * but *weight could contain a distance value in case of phrase operator.
- * *strval, *lenval and *weight are filled in when return value is PT_VAL
- *
- */
 static ts_tokentype
-gettoken_query(TSQueryParserState state,
-                          int8 *operator,
-                          int *lenval, char **strval, int16 *weight, bool *prefix)
+gettoken_query_standard(TSQueryParserState state, int8 *operator,
+                                               int *lenval, char **strval,
+                                               int16 *weight, bool *prefix)
 {
        *weight = 0;
        *prefix = false;
 
-       while (1)
+       while (true)
        {
                switch (state->state)
                {
@@ -234,17 +291,16 @@ gettoken_query(TSQueryParserState state,
                        case WAITOPERAND:
                                if (t_iseq(state->buf, '!'))
                                {
-                                       (state->buf)++; /* can safely ++, t_iseq guarantee that
-                                                                        * pg_mblen()==1 */
-                                       *operator = OP_NOT;
+                                       state->buf++;
                                        state->state = WAITOPERAND;
+                                       *operator = OP_NOT;
                                        return PT_OPR;
                                }
                                else if (t_iseq(state->buf, '('))
                                {
-                                       state->count++;
-                                       (state->buf)++;
+                                       state->buf++;
                                        state->state = WAITOPERAND;
+                                       state->count++;
                                        return PT_OPEN;
                                }
                                else if (t_iseq(state->buf, ':'))
@@ -256,19 +312,19 @@ gettoken_query(TSQueryParserState state,
                                }
                                else if (!t_isspace(state->buf))
                                {
-                                       /*
-                                        * We rely on the tsvector parser to parse the value for
-                                        * us
-                                        */
+                                       /* We rely on the tsvector parser to parse the value for us */
                                        reset_tsvector_parser(state->valstate, state->buf);
-                                       if (gettoken_tsvector(state->valstate, strval, lenval, NULL, NULL, &state->buf))
+                                       if (gettoken_tsvector(state->valstate, strval, lenval,
+                                                                                 NULL, NULL, &state->buf))
                                        {
                                                state->buf = get_modifiers(state->buf, weight, prefix);
                                                state->state = WAITOPERATOR;
                                                return PT_VAL;
                                        }
                                        else if (state->state == WAITFIRSTOPERAND)
+                                       {
                                                return PT_END;
+                                       }
                                        else
                                                ereport(ERROR,
                                                                (errcode(ERRCODE_SYNTAX_ERROR),
@@ -276,58 +332,206 @@ gettoken_query(TSQueryParserState state,
                                                                                state->buffer)));
                                }
                                break;
+
                        case WAITOPERATOR:
                                if (t_iseq(state->buf, '&'))
                                {
+                                       state->buf++;
                                        state->state = WAITOPERAND;
                                        *operator = OP_AND;
-                                       (state->buf)++;
                                        return PT_OPR;
                                }
                                else if (t_iseq(state->buf, '|'))
                                {
+                                       state->buf++;
                                        state->state = WAITOPERAND;
                                        *operator = OP_OR;
-                                       (state->buf)++;
                                        return PT_OPR;
                                }
-                               else if (t_iseq(state->buf, '<'))
+                               else if (parse_phrase_operator(state, weight))
                                {
+                                       /* weight var is used as storage for distance */
                                        state->state = WAITOPERAND;
                                        *operator = OP_PHRASE;
-                                       /* weight var is used as storage for distance */
-                                       state->buf = parse_phrase_operator(state->buf, weight);
-                                       if (*weight < 0)
-                                               return PT_ERR;
                                        return PT_OPR;
                                }
                                else if (t_iseq(state->buf, ')'))
                                {
-                                       (state->buf)++;
+                                       state->buf++;
                                        state->count--;
                                        return (state->count < 0) ? PT_ERR : PT_CLOSE;
                                }
-                               else if (*(state->buf) == '\0')
+                               else if (*state->buf == '\0')
+                               {
                                        return (state->count) ? PT_ERR : PT_END;
+                               }
                                else if (!t_isspace(state->buf))
+                               {
                                        return PT_ERR;
+                               }
+                               break;
+               }
+
+               state->buf += pg_mblen(state->buf);
+       }
+}
+
+static ts_tokentype
+gettoken_query_websearch(TSQueryParserState state, int8 *operator,
+                                                int *lenval, char **strval,
+                                                int16 *weight, bool *prefix)
+{
+       *weight = 0;
+       *prefix = false;
+
+       while (true)
+       {
+               switch (state->state)
+               {
+                       case WAITFIRSTOPERAND:
+                       case WAITOPERAND:
+                               if (t_iseq(state->buf, '-'))
+                               {
+                                       state->buf++;
+                                       state->state = WAITOPERAND;
+
+                                       if (state->in_quotes)
+                                               continue;
+
+                                       *operator = OP_NOT;
+                                       return PT_OPR;
+                               }
+                               else if (t_iseq(state->buf, '"'))
+                               {
+                                       state->buf++;
+
+                                       if (!state->in_quotes)
+                                       {
+                                               state->state = WAITOPERAND;
+
+                                               if (strchr(state->buf, '"'))
+                                               {
+                                                       /* quoted text should be ordered <-> */
+                                                       state->in_quotes = true;
+                                                       return PT_OPEN;
+                                               }
+
+                                               /* web search tolerates missing quotes */
+                                               continue;
+                                       }
+                                       else
+                                       {
+                                               /* we have to provide an operand */
+                                               state->in_quotes = false;
+                                               state->state = WAITOPERATOR;
+                                               pushStop(state);
+                                               return PT_CLOSE;
+                                       }
+                               }
+                               else if (ISOPERATOR(state->buf))
+                               {
+                                       /* or else gettoken_tsvector() will raise an error */
+                                       state->buf++;
+                                       state->state = WAITOPERAND;
+                                       continue;
+                               }
+                               else if (!t_isspace(state->buf))
+                               {
+                                       /* We rely on the tsvector parser to parse the value for us */
+                                       reset_tsvector_parser(state->valstate, state->buf);
+                                       if (gettoken_tsvector(state->valstate, strval, lenval,
+                                                                                 NULL, NULL, &state->buf))
+                                       {
+                                               state->state = WAITOPERATOR;
+                                               return PT_VAL;
+                                       }
+                                       else if (state->state == WAITFIRSTOPERAND)
+                                       {
+                                               return PT_END;
+                                       }
+                                       else
+                                       {
+                                               /* finally, we have to provide an operand */
+                                               pushStop(state);
+                                               return PT_END;
+                                       }
+                               }
                                break;
-                       case WAITSINGLEOPERAND:
-                               if (*(state->buf) == '\0')
+
+                       case WAITOPERATOR:
+                               if (t_iseq(state->buf, '"'))
+                               {
+                                       if (!state->in_quotes)
+                                       {
+                                               /*
+                                                * put implicit AND after an operand
+                                                * and handle this quote in WAITOPERAND
+                                                */
+                                               state->state = WAITOPERAND;
+                                               *operator = OP_AND;
+                                               return PT_OPR;
+                                       }
+                                       else
+                                       {
+                                               state->buf++;
+
+                                               /* just close quotes */
+                                               state->in_quotes = false;
+                                               return PT_CLOSE;
+                                       }
+                               }
+                               else if (parse_or_operator(state))
+                               {
+                                       state->state = WAITOPERAND;
+                                       *operator = OP_OR;
+                                       return PT_OPR;
+                               }
+                               else if (*state->buf == '\0')
+                               {
                                        return PT_END;
-                               *strval = state->buf;
-                               *lenval = strlen(state->buf);
-                               state->buf += strlen(state->buf);
-                               state->count++;
-                               return PT_VAL;
-                       default:
-                               return PT_ERR;
+                               }
+                               else if (!t_isspace(state->buf))
+                               {
+                                       if (state->in_quotes)
+                                       {
+                                               /* put implicit <-> after an operand */
+                                               *operator = OP_PHRASE;
+                                               *weight = 1;
+                                       }
+                                       else
+                                       {
+                                               /* put implicit AND after an operand */
+                                               *operator = OP_AND;
+                                       }
+
+                                       state->state = WAITOPERAND;
+                                       return PT_OPR;
+                               }
                                break;
                }
+
                state->buf += pg_mblen(state->buf);
        }
 }
 
+static ts_tokentype
+gettoken_query_plain(TSQueryParserState state, int8 *operator,
+                                        int *lenval, char **strval,
+                                        int16 *weight, bool *prefix)
+{
+       *weight = 0;
+       *prefix = false;
+
+       if (*state->buf == '\0')
+               return PT_END;
+
+       *strval = state->buf;
+       *lenval = strlen(state->buf);
+       state->buf += *lenval;
+       state->count++;
+       return PT_VAL;
+}
+
 /*
  * Push an operator to state->polstr
  */
@@ -489,7 +693,9 @@ makepol(TSQueryParserState state,
        /* since this function recurses, it could be driven to stack overflow */
        check_stack_depth();
 
-       while ((type = gettoken_query(state, &operator, &lenval, &strval, &weight, &prefix)) != PT_END)
+       while ((type = state->gettoken(state, &operator,
+                                                                  &lenval, &strval,
+                                                                  &weight, &prefix)) != PT_END)
        {
                switch (type)
                {
@@ -605,7 +811,7 @@ TSQuery
 parse_tsquery(char *buf,
                          PushFunction pushval,
                          Datum opaque,
-                         bool isplain)
+                         int flags)
 {
        struct TSQueryParserStateData state;
        int                     i;
@@ -614,16 +820,32 @@ parse_tsquery(char *buf,
        QueryItem  *ptr;
        ListCell   *cell;
        bool            needcleanup;
+       int                     tsv_flags = P_TSV_OPR_IS_DELIM | P_TSV_IS_TSQUERY;
+
+       /* plain should not be used with web */
+       Assert((flags & (P_TSQ_PLAIN | P_TSQ_WEB)) != (P_TSQ_PLAIN | P_TSQ_WEB));
+
+       /* select suitable tokenizer */
+       if (flags & P_TSQ_PLAIN)
+               state.gettoken = gettoken_query_plain;
+       else if (flags & P_TSQ_WEB)
+       {
+               state.gettoken = gettoken_query_websearch;
+               tsv_flags |= P_TSV_IS_WEB;
+       }
+       else
+               state.gettoken = gettoken_query_standard;
 
        /* init state */
        state.buffer = buf;
        state.buf = buf;
-       state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND;
        state.count = 0;
+       state.in_quotes = false;
+       state.state = WAITFIRSTOPERAND;
        state.polstr = NIL;
 
        /* init value parser's state */
-       state.valstate = init_tsvector_parser(state.buffer, true, true);
+       state.valstate = init_tsvector_parser(state.buffer, tsv_flags);
 
        /* init list of operand */
        state.sumlen = 0;
@@ -716,7 +938,7 @@ tsqueryin(PG_FUNCTION_ARGS)
 {
        char       *in = PG_GETARG_CSTRING(0);
 
-       PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, PointerGetDatum(NULL), false));
+       PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, PointerGetDatum(NULL), 0));
 }
 
 /*
index 64e02ef4343b28780f7ad43bc756e44952e6fd82..7a27bd12a31676c29f63153a8df537e07974d760 100644 (file)
@@ -200,7 +200,7 @@ tsvectorin(PG_FUNCTION_ARGS)
        char       *cur;
        int                     buflen = 256;   /* allocated size of tmpbuf */
 
-       state = init_tsvector_parser(buf, false, false);
+       state = init_tsvector_parser(buf, 0);
 
        arrlen = 64;
        arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
index 7367ba6a40f7c66991d16de29ecb06cc8c02a50f..fed411a842e75fa7c0c065db5866b4b13c7154f2 100644 (file)
@@ -33,6 +33,7 @@ struct TSVectorParseStateData
        int                     eml;                    /* max bytes per character */
        bool            oprisdelim;             /* treat ! | * ( ) as delimiters? */
        bool            is_tsquery;             /* say "tsquery" not "tsvector" in errors? */
+       bool            is_web;                 /* we're in websearch_to_tsquery() */
 };
 
 
@@ -42,7 +43,7 @@ struct TSVectorParseStateData
  * ! | & ( )
  */
 TSVectorParseState
-init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
+init_tsvector_parser(char *input, int flags)
 {
        TSVectorParseState state;
 
@@ -52,8 +53,9 @@ init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
        state->len = 32;
        state->word = (char *) palloc(state->len);
        state->eml = pg_database_encoding_max_length();
-       state->oprisdelim = oprisdelim;
-       state->is_tsquery = is_tsquery;
+       state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
+       state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
+       state->is_web = (flags & P_TSV_IS_WEB) != 0;
 
        return state;
 }
@@ -89,16 +91,6 @@ do { \
        } \
 } while (0)
 
-/* phrase operator begins with '<' */
-#define ISOPERATOR(x) \
-       ( pg_mblen(x) == 1 && ( *(x) == '!' ||  \
-                                                       *(x) == '&' ||  \
-                                                       *(x) == '|' ||  \
-                                                       *(x) == '(' ||  \
-                                                       *(x) == ')' ||  \
-                                                       *(x) == '<'             \
-                                                 ) )
-
 /* Fills gettoken_tsvector's output parameters, and returns true */
 #define RETURN_TOKEN \
 do { \
@@ -183,14 +175,15 @@ gettoken_tsvector(TSVectorParseState state,
                {
                        if (*(state->prsbuf) == '\0')
                                return false;
-                       else if (t_iseq(state->prsbuf, '\''))
+                       else if (!state->is_web && t_iseq(state->prsbuf, '\''))
                                statecode = WAITENDCMPLX;
-                       else if (t_iseq(state->prsbuf, '\\'))
+                       else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
                        {
                                statecode = WAITNEXTCHAR;
                                oldstate = WAITENDWORD;
                        }
-                       else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
+                       else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
+                                        (state->is_web && t_iseq(state->prsbuf, '"')))
                                PRSSYNTAXERROR;
                        else if (!t_isspace(state->prsbuf))
                        {
@@ -217,13 +210,14 @@ gettoken_tsvector(TSVectorParseState state,
                }
                else if (statecode == WAITENDWORD)
                {
-                       if (t_iseq(state->prsbuf, '\\'))
+                       if (!state->is_web && t_iseq(state->prsbuf, '\\'))
                        {
                                statecode = WAITNEXTCHAR;
                                oldstate = WAITENDWORD;
                        }
                        else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
-                                        (state->oprisdelim && ISOPERATOR(state->prsbuf)))
+                                        (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
+                                        (state->is_web && t_iseq(state->prsbuf, '"')))
                        {
                                RESIZEPRSBUF;
                                if (curpos == state->word)
@@ -250,11 +244,11 @@ gettoken_tsvector(TSVectorParseState state,
                }
                else if (statecode == WAITENDCMPLX)
                {
-                       if (t_iseq(state->prsbuf, '\''))
+                       if (!state->is_web && t_iseq(state->prsbuf, '\''))
                        {
                                statecode = WAITCHARCMPLX;
                        }
-                       else if (t_iseq(state->prsbuf, '\\'))
+                       else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
                        {
                                statecode = WAITNEXTCHAR;
                                oldstate = WAITENDCMPLX;
@@ -270,7 +264,7 @@ gettoken_tsvector(TSVectorParseState state,
                }
                else if (statecode == WAITCHARCMPLX)
                {
-                       if (t_iseq(state->prsbuf, '\''))
+                       if (!state->is_web && t_iseq(state->prsbuf, '\''))
                        {
                                RESIZEPRSBUF;
                                COPYCHAR(curpos, state->prsbuf);
index 5d55890b9dd35c0aca9e786903e8f2ab4bb5aac8..5f63efc35520c7ca7df09df6557463d5b4a9cafa 100644 (file)
@@ -53,6 +53,6 @@
  */
 
 /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     201804031
+#define CATALOG_VERSION_NO     201804051
 
 #endif
index 9bf20c059bc5c129ae1d14d1c27662517bcb0659..edf212fcf0f9a738ce63ea267404eee094853d25 100644 (file)
@@ -4971,6 +4971,8 @@ DATA(insert OID = 3747 (  plainto_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f i s
 DESCR("transform to tsquery");
 DATA(insert OID = 5006 (  phraseto_tsquery     PGNSP PGUID 12 100 0 0 0 f f f t f i s 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ _null_ phraseto_tsquery_byid _null_ _null_ _null_ ));
 DESCR("transform to tsquery");
+DATA(insert OID = 8889 (  websearch_to_tsquery PGNSP PGUID 12 100 0 0 0 f f f  t f i s 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ _null_ websearch_to_tsquery_byid _null_ _null_ _null_ ));
+DESCR("transform to tsquery");
 DATA(insert OID = 3749 (  to_tsvector          PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3614 "25" _null_ _null_ _null_ _null_ _null_ to_tsvector _null_ _null_ _null_ ));
 DESCR("transform to tsvector");
 DATA(insert OID = 3750 (  to_tsquery           PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ to_tsquery _null_ _null_ _null_ ));
@@ -4979,6 +4981,8 @@ DATA(insert OID = 3751 (  plainto_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f s s
 DESCR("transform to tsquery");
 DATA(insert OID = 5001 (  phraseto_tsquery     PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ phraseto_tsquery _null_ _null_ _null_ ));
 DESCR("transform to tsquery");
+DATA(insert OID = 8890 (  websearch_to_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ websearch_to_tsquery _null_ _null_ _null_ ));
+DESCR("transform to tsquery");
 DATA(insert OID = 4209 (  to_tsvector          PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3614 "3802" _null_ _null_ _null_ _null_ _null_ jsonb_to_tsvector _null_ _null_ _null_ ));
 DESCR("transform jsonb to tsvector");
 DATA(insert OID = 4210 (  to_tsvector          PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3614 "114" _null_ _null_ _null_ _null_ _null_ json_to_tsvector _null_ _null_ _null_ ));
index f8ddce5ecbd68383fcf14cfa0f38fe5e3ce3c90e..73e969fe9cecfb2d5353e5f792a5fe3b529fd5d7 100644 (file)
 struct TSVectorParseStateData; /* opaque struct in tsvector_parser.c */
 typedef struct TSVectorParseStateData *TSVectorParseState;
 
-extern TSVectorParseState init_tsvector_parser(char *input,
-                                        bool oprisdelim,
-                                        bool is_tsquery);
+#define P_TSV_OPR_IS_DELIM     (1 << 0)        /* sets oprisdelim: ISOPERATOR() characters act as delimiters */
+#define P_TSV_IS_TSQUERY       (1 << 1)        /* sets is_tsquery: errors say "tsquery" not "tsvector" */
+#define P_TSV_IS_WEB           (1 << 2)        /* sets is_web: parsing on behalf of websearch_to_tsquery() */
+
+extern TSVectorParseState init_tsvector_parser(char *input, int flags);
 extern void reset_tsvector_parser(TSVectorParseState state, char *input);
 extern bool gettoken_tsvector(TSVectorParseState state,
                                  char **token, int *len,
@@ -35,6 +37,16 @@ extern bool gettoken_tsvector(TSVectorParseState state,
                                  char **endptr);
 extern void close_tsvector_parser(TSVectorParseState state);
 
+/* true if x points at a one-byte ! & | ( ) or < character ('<' begins the phrase operator); x is expanded several times */
+#define ISOPERATOR(x) \
+       ( pg_mblen(x) == 1 && ( *(x) == '!' ||  \
+                                                       *(x) == '&' ||  \
+                                                       *(x) == '|' ||  \
+                                                       *(x) == '(' ||  \
+                                                       *(x) == ')' ||  \
+                                                       *(x) == '<'             \
+                                                 ) )
+
 /* parse_tsquery */
 
 struct TSQueryParserStateData; /* private in backend/utils/adt/tsquery.c */
@@ -46,9 +58,13 @@ typedef void (*PushFunction) (Datum opaque, TSQueryParserState state,
                                                                                                         * QueryOperand struct */
                                                          bool prefix);
 
+#define P_TSQ_PLAIN            (1 << 0)        /* parse_tsquery() tokenizes with gettoken_query_plain */
+#define P_TSQ_WEB              (1 << 1)        /* tokenizes with gettoken_query_websearch; never combine with P_TSQ_PLAIN */
+
 extern TSQuery parse_tsquery(char *buf,
-                         PushFunction pushval,
-                         Datum opaque, bool isplain);
+                                                        PushFunction pushval,
+                                                        Datum opaque,
+                                                        int flags);
 
 /* Functions for use by PushFunction implementations */
 extern void pushValue(TSQueryParserState state,
index d63fb12f1de2a71eeff09f1a012a71ee072bd52a..c38237c8a4dfdb7b8067d1bef3b8e3f83ec75be7 100644 (file)
@@ -1672,3 +1672,426 @@ select * from phrase_index_test where fts @@ phraseto_tsquery('english', 'fat ca
 (1 row)
 
 set enable_seqscan = on;
+-- test websearch_to_tsquery function
+select websearch_to_tsquery('simple', 'I have a fat:*ABCD cat');
+            websearch_to_tsquery             
+---------------------------------------------
+ 'i' & 'have' & 'a' & 'fat' & 'abcd' & 'cat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'orange:**AABBCCDD');
+ websearch_to_tsquery  
+-----------------------
+ 'orange' & 'aabbccdd'
+(1 row)
+
+select websearch_to_tsquery('simple', 'fat:A!cat:B|rat:C<');
+          websearch_to_tsquery           
+-----------------------------------------
+ 'fat' & 'a' & 'cat' & 'b' & 'rat' & 'c'
+(1 row)
+
+select websearch_to_tsquery('simple', 'fat:A : cat:B');
+   websearch_to_tsquery    
+---------------------------
+ 'fat' & 'a' & 'cat' & 'b'
+(1 row)
+
+select websearch_to_tsquery('simple', 'fat*rat');
+ websearch_to_tsquery 
+----------------------
+ 'fat' & 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'fat-rat');
+   websearch_to_tsquery    
+---------------------------
+ 'fat-rat' & 'fat' & 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'fat_rat');
+ websearch_to_tsquery 
+----------------------
+ 'fat' & 'rat'
+(1 row)
+
+-- weights are completely ignored
+select websearch_to_tsquery('simple', 'abc : def');
+ websearch_to_tsquery 
+----------------------
+ 'abc' & 'def'
+(1 row)
+
+select websearch_to_tsquery('simple', 'abc:def');
+ websearch_to_tsquery 
+----------------------
+ 'abc' & 'def'
+(1 row)
+
+select websearch_to_tsquery('simple', 'a:::b');
+ websearch_to_tsquery 
+----------------------
+ 'a' & 'b'
+(1 row)
+
+select websearch_to_tsquery('simple', 'abc:d');
+ websearch_to_tsquery 
+----------------------
+ 'abc' & 'd'
+(1 row)
+
+select websearch_to_tsquery('simple', ':');
+NOTICE:  text-search query contains only stop words or doesn't contain lexemes, ignored
+ websearch_to_tsquery 
+----------------------
+(1 row)
+
+-- these operators are ignored
+select websearch_to_tsquery('simple', 'abc & def');
+ websearch_to_tsquery 
+----------------------
+ 'abc' & 'def'
+(1 row)
+
+select websearch_to_tsquery('simple', 'abc | def');
+ websearch_to_tsquery 
+----------------------
+ 'abc' & 'def'
+(1 row)
+
+select websearch_to_tsquery('simple', 'abc <-> def');
+ websearch_to_tsquery 
+----------------------
+ 'abc' & 'def'
+(1 row)
+
+select websearch_to_tsquery('simple', 'abc (pg or class)');
+  websearch_to_tsquery  
+------------------------
+ 'abc' & 'pg' | 'class'
+(1 row)
+
+-- NOT is ignored in quotes
+select websearch_to_tsquery('english', 'My brand new smartphone');
+     websearch_to_tsquery      
+-------------------------------
+ 'brand' & 'new' & 'smartphon'
+(1 row)
+
+select websearch_to_tsquery('english', 'My brand "new smartphone"');
+      websearch_to_tsquery       
+---------------------------------
+ 'brand' & 'new' <-> 'smartphon'
+(1 row)
+
+select websearch_to_tsquery('english', 'My brand "new -smartphone"');
+      websearch_to_tsquery       
+---------------------------------
+ 'brand' & 'new' <-> 'smartphon'
+(1 row)
+
+-- test OR operator
+select websearch_to_tsquery('simple', 'cat or rat');
+ websearch_to_tsquery 
+----------------------
+ 'cat' | 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'cat OR rat');
+ websearch_to_tsquery 
+----------------------
+ 'cat' | 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'cat "OR" rat');
+ websearch_to_tsquery 
+----------------------
+ 'cat' & 'or' & 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'cat OR');
+ websearch_to_tsquery 
+----------------------
+ 'cat' & 'or'
+(1 row)
+
+select websearch_to_tsquery('simple', 'OR rat');
+ websearch_to_tsquery 
+----------------------
+ 'or' & 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', '"fat cat OR rat"');
+        websearch_to_tsquery        
+------------------------------------
+ 'fat' <-> 'cat' <-> 'or' <-> 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'fat (cat OR rat');
+ websearch_to_tsquery  
+-----------------------
+ 'fat' & 'cat' | 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'or OR or');
+ websearch_to_tsquery 
+----------------------
+ 'or' | 'or'
+(1 row)
+
+-- OR is an operator here ...
+select websearch_to_tsquery('simple', '"fat cat"or"fat rat"');
+       websearch_to_tsquery        
+-----------------------------------
+ 'fat' <-> 'cat' | 'fat' <-> 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'fat or(rat');
+ websearch_to_tsquery 
+----------------------
+ 'fat' | 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'fat or)rat');
+ websearch_to_tsquery 
+----------------------
+ 'fat' | 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'fat or&rat');
+ websearch_to_tsquery 
+----------------------
+ 'fat' | 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'fat or|rat');
+ websearch_to_tsquery 
+----------------------
+ 'fat' | 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'fat or!rat');
+ websearch_to_tsquery 
+----------------------
+ 'fat' | 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'fat or<rat');
+ websearch_to_tsquery 
+----------------------
+ 'fat' | 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'fat or>rat');
+ websearch_to_tsquery 
+----------------------
+ 'fat' | 'rat'
+(1 row)
+
+select websearch_to_tsquery('simple', 'fat or ');
+ websearch_to_tsquery 
+----------------------
+ 'fat' & 'or'
+(1 row)
+
+-- ... but not here
+select websearch_to_tsquery('simple', 'abc orange');
+ websearch_to_tsquery 
+----------------------
+ 'abc' & 'orange'
+(1 row)
+
+select websearch_to_tsquery('simple', 'abc orтест');
+ websearch_to_tsquery 
+----------------------
+ 'abc' & 'orтест'
+(1 row)
+
+select websearch_to_tsquery('simple', 'abc OR1234');
+ websearch_to_tsquery 
+----------------------
+ 'abc' & 'or1234'
+(1 row)
+
+select websearch_to_tsquery('simple', 'abc or-abc');
+      websearch_to_tsquery       
+---------------------------------
+ 'abc' & 'or-abc' & 'or' & 'abc'
+(1 row)
+
+select websearch_to_tsquery('simple', 'abc OR_abc');
+ websearch_to_tsquery 
+----------------------
+ 'abc' & 'or' & 'abc'
+(1 row)
+
+-- test quotes
+select websearch_to_tsquery('english', '"pg_class pg');
+ websearch_to_tsquery  
+-----------------------
+ 'pg' & 'class' & 'pg'
+(1 row)
+
+select websearch_to_tsquery('english', 'pg_class pg"');
+ websearch_to_tsquery  
+-----------------------
+ 'pg' & 'class' & 'pg'
+(1 row)
+
+select websearch_to_tsquery('english', '"pg_class pg"');
+    websearch_to_tsquery     
+-----------------------------
+ ( 'pg' & 'class' ) <-> 'pg'
+(1 row)
+
+select websearch_to_tsquery('english', 'abc "pg_class pg"');
+        websearch_to_tsquery         
+-------------------------------------
+ 'abc' & ( 'pg' & 'class' ) <-> 'pg'
+(1 row)
+
+select websearch_to_tsquery('english', '"pg_class pg" def');
+        websearch_to_tsquery         
+-------------------------------------
+ ( 'pg' & 'class' ) <-> 'pg' & 'def'
+(1 row)
+
+select websearch_to_tsquery('english', 'abc "pg pg_class pg" def');
+                 websearch_to_tsquery                 
+------------------------------------------------------
+ 'abc' & 'pg' <-> ( 'pg' & 'class' ) <-> 'pg' & 'def'
+(1 row)
+
+select websearch_to_tsquery('english', ' or "pg pg_class pg" or ');
+         websearch_to_tsquery         
+--------------------------------------
+ 'pg' <-> ( 'pg' & 'class' ) <-> 'pg'
+(1 row)
+
+select websearch_to_tsquery('english', '""pg pg_class pg""');
+     websearch_to_tsquery     
+------------------------------
+ 'pg' & 'pg' & 'class' & 'pg'
+(1 row)
+
+select websearch_to_tsquery('english', 'abc """"" def');
+ websearch_to_tsquery 
+----------------------
+ 'abc' & 'def'
+(1 row)
+
+select websearch_to_tsquery('english', 'cat -"fat rat"');
+     websearch_to_tsquery     
+------------------------------
+ 'cat' & !( 'fat' <-> 'rat' )
+(1 row)
+
+select websearch_to_tsquery('english', 'cat -"fat rat" cheese');
+          websearch_to_tsquery          
+----------------------------------------
+ 'cat' & !( 'fat' <-> 'rat' ) & 'chees'
+(1 row)
+
+select websearch_to_tsquery('english', 'abc "def -"');
+ websearch_to_tsquery 
+----------------------
+ 'abc' & 'def'
+(1 row)
+
+select websearch_to_tsquery('english', 'abc "def :"');
+ websearch_to_tsquery 
+----------------------
+ 'abc' & 'def'
+(1 row)
+
+select websearch_to_tsquery('english', '"A fat cat" has just eaten a -rat.');
+        websearch_to_tsquery        
+------------------------------------
+ 'fat' <-> 'cat' & 'eaten' & !'rat'
+(1 row)
+
+select websearch_to_tsquery('english', '"A fat cat" has just eaten OR !rat.');
+       websearch_to_tsquery        
+-----------------------------------
+ 'fat' <-> 'cat' & 'eaten' | 'rat'
+(1 row)
+
+select websearch_to_tsquery('english', '"A fat cat" has just (+eaten OR -rat)');
+        websearch_to_tsquery        
+------------------------------------
+ 'fat' <-> 'cat' & 'eaten' | !'rat'
+(1 row)
+
+select websearch_to_tsquery('english', 'this is ----fine');
+ websearch_to_tsquery 
+----------------------
+ !!!!'fine'
+(1 row)
+
+select websearch_to_tsquery('english', '(()) )))) this ||| is && -fine, "dear friend" OR good');
+          websearch_to_tsquery          
+----------------------------------------
+ !'fine' & 'dear' <-> 'friend' | 'good'
+(1 row)
+
+select websearch_to_tsquery('english', 'an old <-> cat " is fine &&& too');
+  websearch_to_tsquery  
+------------------------
+ 'old' & 'cat' & 'fine'
+(1 row)
+
+select websearch_to_tsquery('english', '"A the" OR just on');
+NOTICE:  text-search query contains only stop words or doesn't contain lexemes, ignored
+ websearch_to_tsquery 
+----------------------
+(1 row)
+
+select websearch_to_tsquery('english', '"a fat cat" ate a rat');
+      websearch_to_tsquery       
+---------------------------------
+ 'fat' <-> 'cat' & 'ate' & 'rat'
+(1 row)
+
+select to_tsvector('english', 'A fat cat ate a rat') @@
+       websearch_to_tsquery('english', '"a fat cat" ate a rat');
+ ?column? 
+----------
+ t
+(1 row)
+
+select to_tsvector('english', 'A fat grey cat ate a rat') @@
+       websearch_to_tsquery('english', '"a fat cat" ate a rat');
+ ?column? 
+----------
+ f
+(1 row)
+
+-- cases handled by gettoken_tsvector()
+select websearch_to_tsquery('''');
+NOTICE:  text-search query contains only stop words or doesn't contain lexemes, ignored
+ websearch_to_tsquery 
+----------------------
+(1 row)
+
+select websearch_to_tsquery('''abc''''def''');
+ websearch_to_tsquery 
+----------------------
+ 'abc' & 'def'
+(1 row)
+
+select websearch_to_tsquery('\abc');
+ websearch_to_tsquery 
+----------------------
+ 'abc'
+(1 row)
+
+select websearch_to_tsquery('\');
+NOTICE:  text-search query contains only stop words or doesn't contain lexemes, ignored
+ websearch_to_tsquery 
+----------------------
+(1 row)
+
index 1c8520b3e917a9f7c562528325e2db828855a3c7..1768541f21bd84777d999dbc4a263c437fc67c54 100644 (file)
@@ -539,3 +539,97 @@ create index phrase_index_test_idx on phrase_index_test using gin(fts);
 set enable_seqscan = off;
 select * from phrase_index_test where fts @@ phraseto_tsquery('english', 'fat cat');
 set enable_seqscan = on;
+
+-- test websearch_to_tsquery function
+select websearch_to_tsquery('simple', 'I have a fat:*ABCD cat');
+select websearch_to_tsquery('simple', 'orange:**AABBCCDD');
+select websearch_to_tsquery('simple', 'fat:A!cat:B|rat:C<');
+select websearch_to_tsquery('simple', 'fat:A : cat:B');
+
+select websearch_to_tsquery('simple', 'fat*rat');
+select websearch_to_tsquery('simple', 'fat-rat');
+select websearch_to_tsquery('simple', 'fat_rat');
+
+-- weights are completely ignored
+select websearch_to_tsquery('simple', 'abc : def');
+select websearch_to_tsquery('simple', 'abc:def');
+select websearch_to_tsquery('simple', 'a:::b');
+select websearch_to_tsquery('simple', 'abc:d');
+select websearch_to_tsquery('simple', ':');
+
+-- these operators are ignored
+select websearch_to_tsquery('simple', 'abc & def');
+select websearch_to_tsquery('simple', 'abc | def');
+select websearch_to_tsquery('simple', 'abc <-> def');
+select websearch_to_tsquery('simple', 'abc (pg or class)');
+
+-- NOT is ignored in quotes
+select websearch_to_tsquery('english', 'My brand new smartphone');
+select websearch_to_tsquery('english', 'My brand "new smartphone"');
+select websearch_to_tsquery('english', 'My brand "new -smartphone"');
+
+-- test OR operator
+select websearch_to_tsquery('simple', 'cat or rat');
+select websearch_to_tsquery('simple', 'cat OR rat');
+select websearch_to_tsquery('simple', 'cat "OR" rat');
+select websearch_to_tsquery('simple', 'cat OR');
+select websearch_to_tsquery('simple', 'OR rat');
+select websearch_to_tsquery('simple', '"fat cat OR rat"');
+select websearch_to_tsquery('simple', 'fat (cat OR rat');
+select websearch_to_tsquery('simple', 'or OR or');
+
+-- OR is an operator here ...
+select websearch_to_tsquery('simple', '"fat cat"or"fat rat"');
+select websearch_to_tsquery('simple', 'fat or(rat');
+select websearch_to_tsquery('simple', 'fat or)rat');
+select websearch_to_tsquery('simple', 'fat or&rat');
+select websearch_to_tsquery('simple', 'fat or|rat');
+select websearch_to_tsquery('simple', 'fat or!rat');
+select websearch_to_tsquery('simple', 'fat or<rat');
+select websearch_to_tsquery('simple', 'fat or>rat');
+select websearch_to_tsquery('simple', 'fat or ');
+
+-- ... but not here
+select websearch_to_tsquery('simple', 'abc orange');
+select websearch_to_tsquery('simple', 'abc orтест');
+select websearch_to_tsquery('simple', 'abc OR1234');
+select websearch_to_tsquery('simple', 'abc or-abc');
+select websearch_to_tsquery('simple', 'abc OR_abc');
+
+-- test quotes
+select websearch_to_tsquery('english', '"pg_class pg');
+select websearch_to_tsquery('english', 'pg_class pg"');
+select websearch_to_tsquery('english', '"pg_class pg"');
+select websearch_to_tsquery('english', 'abc "pg_class pg"');
+select websearch_to_tsquery('english', '"pg_class pg" def');
+select websearch_to_tsquery('english', 'abc "pg pg_class pg" def');
+select websearch_to_tsquery('english', ' or "pg pg_class pg" or ');
+select websearch_to_tsquery('english', '""pg pg_class pg""');
+select websearch_to_tsquery('english', 'abc """"" def');
+select websearch_to_tsquery('english', 'cat -"fat rat"');
+select websearch_to_tsquery('english', 'cat -"fat rat" cheese');
+select websearch_to_tsquery('english', 'abc "def -"');
+select websearch_to_tsquery('english', 'abc "def :"');
+
+select websearch_to_tsquery('english', '"A fat cat" has just eaten a -rat.');
+select websearch_to_tsquery('english', '"A fat cat" has just eaten OR !rat.');
+select websearch_to_tsquery('english', '"A fat cat" has just (+eaten OR -rat)');
+
+select websearch_to_tsquery('english', 'this is ----fine');
+select websearch_to_tsquery('english', '(()) )))) this ||| is && -fine, "dear friend" OR good');
+select websearch_to_tsquery('english', 'an old <-> cat " is fine &&& too');
+
+select websearch_to_tsquery('english', '"A the" OR just on');
+select websearch_to_tsquery('english', '"a fat cat" ate a rat');
+
+select to_tsvector('english', 'A fat cat ate a rat') @@
+       websearch_to_tsquery('english', '"a fat cat" ate a rat');
+
+select to_tsvector('english', 'A fat grey cat ate a rat') @@
+       websearch_to_tsquery('english', '"a fat cat" ate a rat');
+
+-- cases handled by gettoken_tsvector()
+select websearch_to_tsquery('''');
+select websearch_to_tsquery('''abc''''def''');
+select websearch_to_tsquery('\abc');
+select websearch_to_tsquery('\');