Do COPY FROM encoding conversion/verification in larger chunks.

author Heikki Linnakangas <[email protected]>

Thu, 1 Apr 2021 09:23:40 +0000 (12:23 +0300)

committer Heikki Linnakangas <[email protected]>

Thu, 1 Apr 2021 09:23:40 +0000 (12:23 +0300)
author Heikki Linnakangas <[email protected]>
Thu, 1 Apr 2021 09:23:40 +0000 (12:23 +0300)
committer Heikki Linnakangas <[email protected]>
Thu, 1 Apr 2021 09:23:40 +0000 (12:23 +0300)
diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c

index 74dbb709fe7dde35a4e7b5298efe69aa400a1c68..be2e3d7354f74f9898c92ad67a264196970c1b09 100644 (file)
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -3,6 +3,12 @@
   * copyfrom.c
   *     COPY <table> FROM file/program/client
   *
+ * This file contains routines needed to efficiently load tuples into a
+ * table.  That includes looking up the correct partition, firing triggers,
+ * calling the table AM function to insert the data, and updating indexes.
+ * Reading data from the input file or client and parsing it into Datums
+ * is handled in copyfromparse.c.
+ *
   * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
@@ -23,6 +29,7 @@
  #include "access/tableam.h"
  #include "access/xact.h"
  #include "access/xlog.h"
+#include "catalog/namespace.h"
  #include "commands/copy.h"
  #include "commands/copyfrom_internal.h"
  #include "commands/progress.h"
@@ -87,7 +94,7 @@ typedef struct CopyMultiInsertInfo
     List       *multiInsertBuffers; /* List of tracked CopyMultiInsertBuffers */
     int         bufferedTuples; /* number of tuples buffered over all buffers */
     int         bufferedBytes;  /* number of bytes from all buffered tuples */
-   CopyFromState   cstate;         /* Copy state for this CopyMultiInsertInfo */
+   CopyFromState cstate;       /* Copy state for this CopyMultiInsertInfo */
     EState     *estate;         /* Executor state used for COPY */
     CommandId   mycid;          /* Command Id used for COPY */
     int         ti_options;     /* table insert options */
@@ -107,7 +114,7 @@ static void ClosePipeFromProgram(CopyFromState cstate);
  void
  CopyFromErrorCallback(void *arg)
  {
-   CopyFromState   cstate = (CopyFromState) arg;
+   CopyFromState cstate = (CopyFromState) arg;
     char        curlineno_str[32];
  
     snprintf(curlineno_str, sizeof(curlineno_str), UINT64_FORMAT,
@@ -149,15 +156,9 @@ CopyFromErrorCallback(void *arg)
             /*
              * Error is relevant to a particular line.
              *
-            * If line_buf still contains the correct line, and it's already
-            * transcoded, print it. If it's still in a foreign encoding, it's
-            * quite likely that the error is precisely a failure to do
-            * encoding conversion (ie, bad data). We dare not try to convert
-            * it, and at present there's no way to regurgitate it without
-            * conversion. So we have to punt and just report the line number.
+            * If line_buf still contains the correct line, print it.
              */
-           if (cstate->line_buf_valid &&
-               (cstate->line_buf_converted || !cstate->need_transcoding))
+           if (cstate->line_buf_valid)
             {
                 char       *lineval;
  
@@ -300,7 +301,7 @@ CopyMultiInsertBufferFlush(CopyMultiInsertInfo *miinfo,
     MemoryContext oldcontext;
     int         i;
     uint64      save_cur_lineno;
-   CopyFromState   cstate = miinfo->cstate;
+   CopyFromState cstate = miinfo->cstate;
     EState     *estate = miinfo->estate;
     CommandId   mycid = miinfo->mycid;
     int         ti_options = miinfo->ti_options;
@@ -1191,7 +1192,7 @@ BeginCopyFrom(ParseState *pstate,
               List *attnamelist,
               List *options)
  {
-   CopyFromState   cstate;
+   CopyFromState cstate;
     bool        pipe = (filename == NULL);
     TupleDesc   tupDesc;
     AttrNumber  num_phys_attrs,
@@ -1229,7 +1230,7 @@ BeginCopyFrom(ParseState *pstate,
     oldcontext = MemoryContextSwitchTo(cstate->copycontext);
  
     /* Extract options from the statement node tree */
-   ProcessCopyOptions(pstate, &cstate->opts, true /* is_from */, options);
+   ProcessCopyOptions(pstate, &cstate->opts, true /* is_from */ , options);
  
     /* Process the target relation */
     cstate->rel = rel;
@@ -1320,15 +1321,20 @@ BeginCopyFrom(ParseState *pstate,
         cstate->file_encoding = cstate->opts.file_encoding;
  
     /*
-    * Set up encoding conversion info.  Even if the file and server encodings
-    * are the same, we must apply pg_any_to_server() to validate data in
-    * multibyte encodings.
+    * Look up encoding conversion function.
      */
-   cstate->need_transcoding =
-       (cstate->file_encoding != GetDatabaseEncoding() ||
-        pg_database_encoding_max_length() > 1);
-   /* See Multibyte encoding comment above */
-   cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->file_encoding);
+   if (cstate->file_encoding == GetDatabaseEncoding() ||
+       cstate->file_encoding == PG_SQL_ASCII ||
+       GetDatabaseEncoding() == PG_SQL_ASCII)
+   {
+       cstate->need_transcoding = false;
+   }
+   else
+   {
+       cstate->need_transcoding = true;
+       cstate->conversion_proc = FindDefaultConversionProc(cstate->file_encoding,
+                                                           GetDatabaseEncoding());
+   }
  
     cstate->copy_src = COPY_FILE;   /* default */
  
@@ -1339,7 +1345,6 @@ BeginCopyFrom(ParseState *pstate,
     oldcontext = MemoryContextSwitchTo(cstate->copycontext);
  
     /* Initialize state variables */
-   cstate->reached_eof = false;
     cstate->eol_type = EOL_UNKNOWN;
     cstate->cur_relname = RelationGetRelationName(cstate->rel);
     cstate->cur_lineno = 0;
@@ -1347,19 +1352,36 @@ BeginCopyFrom(ParseState *pstate,
     cstate->cur_attval = NULL;
  
     /*
-    * Set up variables to avoid per-attribute overhead.  attribute_buf and
-    * raw_buf are used in both text and binary modes, but we use line_buf
-    * only in text mode.
+    * Allocate buffers for the input pipeline.
+    *
+    * attribute_buf and raw_buf are used in both text and binary modes, but
+    * input_buf and line_buf only in text mode.
      */
-   initStringInfo(&cstate->attribute_buf);
-   cstate->raw_buf = (char *) palloc(RAW_BUF_SIZE + 1);
+   cstate->raw_buf = palloc(RAW_BUF_SIZE + 1);
     cstate->raw_buf_index = cstate->raw_buf_len = 0;
+   cstate->raw_reached_eof = false;
+
     if (!cstate->opts.binary)
     {
+       /*
+        * If encoding conversion is needed, we need another buffer to hold
+        * the converted input data.  Otherwise, we can just point input_buf
+        * to the same buffer as raw_buf.
+        */
+       if (cstate->need_transcoding)
+       {
+           cstate->input_buf = (char *) palloc(INPUT_BUF_SIZE + 1);
+           cstate->input_buf_index = cstate->input_buf_len = 0;
+       }
+       else
+           cstate->input_buf = cstate->raw_buf;
+       cstate->input_reached_eof = false;
+
         initStringInfo(&cstate->line_buf);
-       cstate->line_buf_converted = false;
     }
  
+   initStringInfo(&cstate->attribute_buf);
+
     /* Assign range table, we'll need it in CopyFrom. */
     if (pstate)
         cstate->range_table = pstate->p_rtable;
@@ -1584,7 +1606,7 @@ ClosePipeFromProgram(CopyFromState cstate)
          * should not report that as an error.  Otherwise, SIGPIPE indicates a
          * problem.
          */
-       if (!cstate->reached_eof &&
+       if (!cstate->raw_reached_eof &&
             wait_result_is_signal(pclose_rc, SIGPIPE))
             return;
  
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c

index ce24a1528bd745f252e582615a02df9fce4eba11..0813424768f39d718763a0c2ebcb410ddf76773d 100644 (file)
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -3,6 +3,50 @@
   * copyfromparse.c
   *     Parse CSV/text/binary format for COPY FROM.
   *
+ * This file contains routines to parse the text, CSV and binary input
+ * formats.  The main entry point is NextCopyFrom(), which parses the
+ * next input line and returns it as Datums.
+ *
+ * In text/CSV mode, the parsing happens in multiple stages:
+ *
+ * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
+ *                1.          2.            3.           4.
+ *
+ * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
+ *    places it into 'raw_buf'.
+ *
+ * 2. CopyConvertBuf() calls the encoding conversion function to convert
+ *    the data in 'raw_buf' from client to server encoding, placing the
+ *    converted result in 'input_buf'.
+ *
+ * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
+ *    It is responsible for finding the next newline marker, taking quote and
+ *    escape characters into account according to the COPY options.  The line
+ *    is copied into 'line_buf', with quotes and escape characters still
+ *    intact.
+ *
+ * 4. CopyReadAttributesText/CSV() function takes the input line from
+ *    'line_buf', and splits it into fields, unescaping the data as required.
+ *    The fields are stored in 'attribute_buf', and 'raw_fields' array holds
+ *    pointers to each field.
+ *
+ * If encoding conversion is not required, a shortcut is taken in step 2 to
+ * avoid copying the data unnecessarily.  The 'input_buf' pointer is set to
+ * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
+ * directly into 'input_buf'.  CopyConvertBuf() then merely validates that
+ * the data is valid in the current encoding.
+ *
+ * In binary mode, the pipeline is much simpler.  Input is loaded into
+ * into 'raw_buf', and encoding conversion is done in the datatype-specific
+ * receive functions, if required.  'input_buf' and 'line_buf' are not used,
+ * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
+ * data when it's passed the receive function.
+ *
+ * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE).  'input_buf' is also
+ * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required.  'line_buf'
+ * and 'attribute_buf' are expanded on demand, to hold the longest line
+ * encountered so far.
+ *
   * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
@@ -35,7 +79,7 @@
  #define OCTVALUE(c) ((c) - '0')
  
  /*
- * These macros centralize code used to process line_buf and raw_buf buffers.
+ * These macros centralize code used to process line_buf and input_buf buffers.
   * They are macros because they often do continue/break control and to avoid
   * function call overhead in tight COPY loops.
   *
@@ -53,9 +97,9 @@
  #define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
  if (1) \
  { \
-   if (raw_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
+   if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
     { \
-       raw_buf_ptr = prev_raw_ptr; /* undo fetch */ \
+       input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
         need_data = true; \
         continue; \
     } \
@@ -65,10 +109,10 @@ if (1) \
  #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
  if (1) \
  { \
-   if (raw_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
+   if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
     { \
         if (extralen) \
-           raw_buf_ptr = copy_buf_len; /* consume the partial character */ \
+           input_buf_ptr = copy_buf_len; /* consume the partial character */ \
         /* backslash just before EOF, treat as data char */ \
         result = true; \
         break; \
@@ -77,17 +121,17 @@ if (1) \
  
  /*
   * Transfer any approved data to line_buf; must do this to be sure
- * there is some room in raw_buf.
+ * there is some room in input_buf.
   */
  #define REFILL_LINEBUF \
  if (1) \
  { \
-   if (raw_buf_ptr > cstate->raw_buf_index) \
+   if (input_buf_ptr > cstate->input_buf_index) \
     { \
         appendBinaryStringInfo(&cstate->line_buf, \
-                            cstate->raw_buf + cstate->raw_buf_index, \
-                              raw_buf_ptr - cstate->raw_buf_index); \
-       cstate->raw_buf_index = raw_buf_ptr; \
+                            cstate->input_buf + cstate->input_buf_index, \
+                              input_buf_ptr - cstate->input_buf_index); \
+       cstate->input_buf_index = input_buf_ptr; \
     } \
  } else ((void) 0)
  
@@ -95,7 +139,7 @@ if (1) \
  #define NO_END_OF_COPY_GOTO \
  if (1) \
  { \
-   raw_buf_ptr = prev_raw_ptr + 1; \
+   input_buf_ptr = prev_raw_ptr + 1; \
     goto not_end_of_copy; \
  } else ((void) 0)
  
@@ -118,7 +162,7 @@ static int  CopyGetData(CopyFromState cstate, void *databuf,
                         int minread, int maxread);
  static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
  static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
-static bool CopyLoadRawBuf(CopyFromState cstate);
+static void CopyLoadInputBuf(CopyFromState cstate);
  static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
  
  void
@@ -210,10 +254,10 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
                         (errcode_for_file_access(),
                          errmsg("could not read from COPY file: %m")));
             if (bytesread == 0)
-               cstate->reached_eof = true;
+               cstate->raw_reached_eof = true;
             break;
         case COPY_FRONTEND:
-           while (maxread > 0 && bytesread < minread && !cstate->reached_eof)
+           while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
             {
                 int         avail;
  
@@ -241,7 +285,7 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
                             break;
                         case 'c':   /* CopyDone */
                             /* COPY IN correctly terminated by frontend */
-                           cstate->reached_eof = true;
+                           cstate->raw_reached_eof = true;
                             return bytesread;
                         case 'f':   /* CopyFail */
                             ereport(ERROR,
@@ -327,34 +371,303 @@ CopyGetInt16(CopyFromState cstate, int16 *val)
  
  
  /*
- * CopyLoadRawBuf loads some more data into raw_buf
+ * Perform encoding conversion on data in 'raw_buf', writing the converted
+ * data into 'input_buf'.
   *
- * Returns true if able to obtain at least one more byte, else false.
+ * On entry, there must be some data to convert in 'raw_buf'.
+ */
+static void
+CopyConvertBuf(CopyFromState cstate)
+{
+   /*
+    * If the file and server encoding are the same, no encoding conversion is
+    * required.  However, we still need to verify that the input is valid for
+    * the encoding.
+    */
+   if (!cstate->need_transcoding)
+   {
+       /*
+        * When conversion is not required, input_buf and raw_buf are the
+        * same.  raw_buf_len is the total number of bytes in the buffer, and
+        * input_buf_len tracks how many of those bytes have already been
+        * verified.
+        */
+       int         preverifiedlen = cstate->input_buf_len;
+       int         unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
+       int         nverified;
+
+       if (unverifiedlen == 0)
+       {
+           /*
+            * If no more raw data is coming, report the EOF to the caller.
+            */
+           if (cstate->raw_reached_eof)
+               cstate->input_reached_eof = true;
+           return;
+       }
+
+       /*
+        * Verify the new data, including any residual unverified bytes from
+        * previous round.
+        */
+       nverified = pg_encoding_verifymbstr(cstate->file_encoding,
+                                           cstate->raw_buf + preverifiedlen,
+                                           unverifiedlen);
+       if (nverified == 0)
+       {
+           /*
+            * Could not verify anything.
+            *
+            * If there is no more raw input data coming, it means that there
+            * was an incomplete multi-byte sequence at the end.  Also, if
+            * there's "enough" input left, we should be able to verify at
+            * least one character, and a failure to do so means that we've
+            * hit an invalid byte sequence.
+            */
+           if (cstate->raw_reached_eof || unverifiedlen >= pg_database_encoding_max_length())
+               cstate->input_reached_error = true;
+           return;
+       }
+       cstate->input_buf_len += nverified;
+   }
+   else
+   {
+       /*
+        * Encoding conversion is needed.
+        */
+       int         nbytes;
+       unsigned char *src;
+       int         srclen;
+       unsigned char *dst;
+       int         dstlen;
+       int         convertedlen;
+
+       if (RAW_BUF_BYTES(cstate) == 0)
+       {
+           /*
+            * If no more raw data is coming, report the EOF to the caller.
+            */
+           if (cstate->raw_reached_eof)
+               cstate->input_reached_eof = true;
+           return;
+       }
+
+       /*
+        * First, copy down any unprocessed data.
+        */
+       nbytes = INPUT_BUF_BYTES(cstate);
+       if (nbytes > 0 && cstate->input_buf_index > 0)
+           memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
+                   nbytes);
+       cstate->input_buf_index = 0;
+       cstate->input_buf_len = nbytes;
+       cstate->input_buf[nbytes] = '\0';
+
+       src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
+       srclen = cstate->raw_buf_len - cstate->raw_buf_index;
+       dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
+       dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
+
+       /*
+        * Do the conversion.  This might stop short, if there is an invalid
+        * byte sequence in the input.  We'll convert as much as we can in
+        * that case.
+        *
+        * Note: Even if we hit an invalid byte sequence, we don't report the
+        * error until all the valid bytes have been consumed.  The input
+        * might contain an end-of-input marker (\.), and we don't want to
+        * report an error if the invalid byte sequence is after the
+        * end-of-input marker.  We might unnecessarily convert some data
+        * after the end-of-input marker as long as it's valid for the
+        * encoding, but that's harmless.
+        */
+       convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
+                                                    cstate->file_encoding,
+                                                    GetDatabaseEncoding(),
+                                                    src, srclen,
+                                                    dst, dstlen,
+                                                    true);
+       if (convertedlen == 0)
+       {
+           /*
+            * Could not convert anything.  If there is no more raw input data
+            * coming, it means that there was an incomplete multi-byte
+            * sequence at the end.  Also, if there is plenty of input left,
+            * we should be able to convert at least one character, so a
+            * failure to do so must mean that we've hit a byte sequence
+            * that's invalid.
+            */
+           if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
+               cstate->input_reached_error = true;
+           return;
+       }
+       cstate->raw_buf_index += convertedlen;
+       cstate->input_buf_len += strlen((char *) dst);
+   }
+}
+
+/*
+ * Report an encoding or conversion error.
+ */
+static void
+CopyConversionError(CopyFromState cstate)
+{
+   Assert(cstate->raw_buf_len > 0);
+   Assert(cstate->input_reached_error);
+
+   if (!cstate->need_transcoding)
+   {
+       /*
+        * Everything up to input_buf_len was successfully verified, and
+        * input_buf_len points to the invalid or incomplete character.
+        */
+       report_invalid_encoding(cstate->file_encoding,
+                               cstate->raw_buf + cstate->input_buf_len,
+                               cstate->raw_buf_len - cstate->input_buf_len);
+   }
+   else
+   {
+       /*
+        * raw_buf_index points to the invalid or untranslatable character. We
+        * let the conversion routine report the error, because it can provide
+        * a more specific error message than we could here.  An earlier call
+        * to the conversion routine in CopyConvertBuf() detected that there
+        * is an error, now we call the conversion routine again with
+        * noError=false, to have it throw the error.
+        */
+       unsigned char *src;
+       int         srclen;
+       unsigned char *dst;
+       int         dstlen;
+
+       src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
+       srclen = cstate->raw_buf_len - cstate->raw_buf_index;
+       dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
+       dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
+
+       (void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
+                                            cstate->file_encoding,
+                                            GetDatabaseEncoding(),
+                                            src, srclen,
+                                            dst, dstlen,
+                                            false);
+
+       /*
+        * The conversion routine should have reported an error, so this
+        * should not be reached.
+        */
+       elog(ERROR, "encoding conversion failed without error");
+   }
+}
+
+/*
+ * Load more data from data source to raw_buf.
   *
- * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
- * of the buffer and then we load more data after that.  This case occurs only
- * when a multibyte character crosses a bufferload boundary.
+ * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
+ * beginning of the buffer, and we load new data after that.
   */
-static bool
+static void
  CopyLoadRawBuf(CopyFromState cstate)
  {
-   int         nbytes = RAW_BUF_BYTES(cstate);
+   int         nbytes;
     int         inbytes;
  
-   /* Copy down the unprocessed data if any. */
-   if (nbytes > 0)
+   /*
+    * In text mode, if encoding conversion is not required, raw_buf and
+    * input_buf point to the same buffer.  Their len/index better agree, too.
+    */
+   if (cstate->raw_buf == cstate->input_buf)
+   {
+       Assert(!cstate->need_transcoding);
+       Assert(cstate->raw_buf_index == cstate->input_buf_index);
+       Assert(cstate->input_buf_len <= cstate->raw_buf_len);
+   }
+
+   /*
+    * Copy down the unprocessed data if any.
+    */
+   nbytes = RAW_BUF_BYTES(cstate);
+   if (nbytes > 0 && cstate->raw_buf_index > 0)
         memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
                 nbytes);
+   cstate->raw_buf_len -= cstate->raw_buf_index;
+   cstate->raw_buf_index = 0;
+
+   /*
+    * If raw_buf and input_buf are in fact the same buffer, adjust the
+    * input_buf variables, too.
+    */
+   if (cstate->raw_buf == cstate->input_buf)
+   {
+       cstate->input_buf_len -= cstate->input_buf_index;
+       cstate->input_buf_index = 0;
+   }
  
-   inbytes = CopyGetData(cstate, cstate->raw_buf + nbytes,
-                         1, RAW_BUF_SIZE - nbytes);
+   /* Load more data */
+   inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
+                         1, RAW_BUF_SIZE - cstate->raw_buf_len);
     nbytes += inbytes;
     cstate->raw_buf[nbytes] = '\0';
-   cstate->raw_buf_index = 0;
     cstate->raw_buf_len = nbytes;
+
     cstate->bytes_processed += inbytes;
     pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
-   return (inbytes > 0);
+
+   if (inbytes == 0)
+       cstate->raw_reached_eof = true;
+}
+
+/*
+ * CopyLoadInputBuf loads some more data into input_buf
+ *
+ * On return, at least one more input character is loaded into
+ * input_buf, or input_reached_eof is set.
+ *
+ * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
+ * of the buffer and then we load more data after that.
+ */
+static void
+CopyLoadInputBuf(CopyFromState cstate)
+{
+   int         nbytes = INPUT_BUF_BYTES(cstate);
+
+   /*
+    * The caller has updated input_buf_index to indicate how much of the
+    * input has been consumed and isn't needed anymore.  If input_buf is the
+    * same physical area as raw_buf, update raw_buf_index accordingly.
+    */
+   if (cstate->raw_buf == cstate->input_buf)
+   {
+       Assert(!cstate->need_transcoding);
+       Assert(cstate->input_buf_index >= cstate->raw_buf_index);
+       cstate->raw_buf_index = cstate->input_buf_index;
+   }
+
+   for (;;)
+   {
+       /* If we now have some unconverted data, try to convert it */
+       CopyConvertBuf(cstate);
+
+       /* If we now have some more input bytes ready, return them */
+       if (INPUT_BUF_BYTES(cstate) > nbytes)
+           return;
+
+       /*
+        * If we reached an invalid byte sequence, or we're at an incomplete
+        * multi-byte character but there is no more raw input data, report
+        * conversion error.
+        */
+       if (cstate->input_reached_error)
+           CopyConversionError(cstate);
+
+       /* no more input, and everything has been converted */
+       if (cstate->input_reached_eof)
+           break;
+
+       /* Try to load more raw data */
+       Assert(!cstate->raw_reached_eof);
+       CopyLoadRawBuf(cstate);
+   }
  }
  
  /*
@@ -389,7 +702,8 @@ CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
             /* Load more data if buffer is empty. */
             if (RAW_BUF_BYTES(cstate) == 0)
             {
-               if (!CopyLoadRawBuf(cstate))
+               CopyLoadRawBuf(cstate);
+               if (cstate->raw_reached_eof)
                     break;      /* EOF */
             }
  
@@ -645,8 +959,7 @@ NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
  }
  
  /*
- * Read the next input line and stash it in line_buf, with conversion to
- * server encoding.
+ * Read the next input line and stash it in line_buf.
   *
   * Result is true if read was terminated by EOF, false if terminated
   * by newline.  The terminating newline or EOF marker is not included
@@ -658,10 +971,7 @@ CopyReadLine(CopyFromState cstate)
     bool        result;
  
     resetStringInfo(&cstate->line_buf);
-   cstate->line_buf_valid = true;
-
-   /* Mark that encoding conversion hasn't occurred yet */
-   cstate->line_buf_converted = false;
+   cstate->line_buf_valid = false;
  
     /* Parse data and transfer into line_buf */
     result = CopyReadLineText(cstate);
@@ -675,10 +985,17 @@ CopyReadLine(CopyFromState cstate)
          */
         if (cstate->copy_src == COPY_FRONTEND)
         {
+           int         inbytes;
+
             do
             {
-               cstate->raw_buf_index = cstate->raw_buf_len;
-           } while (CopyLoadRawBuf(cstate));
+               inbytes = CopyGetData(cstate, cstate->input_buf,
+                                     1, INPUT_BUF_SIZE);
+           } while (inbytes > 0);
+           cstate->input_buf_index = 0;
+           cstate->input_buf_len = 0;
+           cstate->raw_buf_index = 0;
+           cstate->raw_buf_len = 0;
         }
     }
     else
@@ -715,25 +1032,8 @@ CopyReadLine(CopyFromState cstate)
         }
     }
  
-   /* Done reading the line.  Convert it to server encoding. */
-   if (cstate->need_transcoding)
-   {
-       char       *cvt;
-
-       cvt = pg_any_to_server(cstate->line_buf.data,
-                              cstate->line_buf.len,
-                              cstate->file_encoding);
-       if (cvt != cstate->line_buf.data)
-       {
-           /* transfer converted data back to line_buf */
-           resetStringInfo(&cstate->line_buf);
-           appendBinaryStringInfo(&cstate->line_buf, cvt, strlen(cvt));
-           pfree(cvt);
-       }
-   }
-
     /* Now it's safe to use the buffer in error messages */
-   cstate->line_buf_converted = true;
+   cstate->line_buf_valid = true;
  
     return result;
  }
@@ -744,13 +1044,12 @@ CopyReadLine(CopyFromState cstate)
  static bool
  CopyReadLineText(CopyFromState cstate)
  {
-   char       *copy_raw_buf;
-   int         raw_buf_ptr;
+   char       *copy_input_buf;
+   int         input_buf_ptr;
     int         copy_buf_len;
     bool        need_data = false;
     bool        hit_eof = false;
     bool        result = false;
-   char        mblen_str[2];
  
     /* CSV variables */
     bool        first_char_in_line = true;
@@ -768,8 +1067,6 @@ CopyReadLineText(CopyFromState cstate)
             escapec = '\0';
     }
  
-   mblen_str[1] = '\0';
-
     /*
      * The objective of this loop is to transfer the entire next input line
      * into line_buf.  Hence, we only care for detecting newlines (\r and/or
@@ -782,18 +1079,25 @@ CopyReadLineText(CopyFromState cstate)
      * These four characters, and the CSV escape and quote characters, are
      * assumed the same in frontend and backend encodings.
      *
-    * For speed, we try to move data from raw_buf to line_buf in chunks
-    * rather than one character at a time.  raw_buf_ptr points to the next
-    * character to examine; any characters from raw_buf_index to raw_buf_ptr
-    * have been determined to be part of the line, but not yet transferred to
-    * line_buf.
+    * The input has already been converted to the database encoding.  All
+    * supported server encodings have the property that all bytes in a
+    * multi-byte sequence have the high bit set, so a multibyte character
+    * cannot contain any newline or escape characters embedded in the
+    * multibyte sequence.  Therefore, we can process the input byte-by-byte,
+    * regardless of the encoding.
      *
-    * For a little extra speed within the loop, we copy raw_buf and
-    * raw_buf_len into local variables.
+    * For speed, we try to move data from input_buf to line_buf in chunks
+    * rather than one character at a time.  input_buf_ptr points to the next
+    * character to examine; any characters from input_buf_index to
+    * input_buf_ptr have been determined to be part of the line, but not yet
+    * transferred to line_buf.
+    *
+    * For a little extra speed within the loop, we copy input_buf and
+    * input_buf_len into local variables.
      */
-   copy_raw_buf = cstate->raw_buf;
-   raw_buf_ptr = cstate->raw_buf_index;
-   copy_buf_len = cstate->raw_buf_len;
+   copy_input_buf = cstate->input_buf;
+   input_buf_ptr = cstate->input_buf_index;
+   copy_buf_len = cstate->input_buf_len;
  
     for (;;)
     {
@@ -810,24 +1114,21 @@ CopyReadLineText(CopyFromState cstate)
          * cstate->copy_src != COPY_OLD_FE, but it hardly seems worth it,
          * considering the size of the buffer.
          */
-       if (raw_buf_ptr >= copy_buf_len || need_data)
+       if (input_buf_ptr >= copy_buf_len || need_data)
         {
             REFILL_LINEBUF;
  
-           /*
-            * Try to read some more data.  This will certainly reset
-            * raw_buf_index to zero, and raw_buf_ptr must go with it.
-            */
-           if (!CopyLoadRawBuf(cstate))
-               hit_eof = true;
-           raw_buf_ptr = 0;
-           copy_buf_len = cstate->raw_buf_len;
+           CopyLoadInputBuf(cstate);
+           /* update our local variables */
+           hit_eof = cstate->input_reached_eof;
+           input_buf_ptr = cstate->input_buf_index;
+           copy_buf_len = cstate->input_buf_len;
  
             /*
              * If we are completely out of data, break out of the loop,
              * reporting EOF.
              */
-           if (copy_buf_len <= 0)
+           if (INPUT_BUF_BYTES(cstate) <= 0)
             {
                 result = true;
                 break;
@@ -836,8 +1137,8 @@ CopyReadLineText(CopyFromState cstate)
         }
  
         /* OK to fetch a character */
-       prev_raw_ptr = raw_buf_ptr;
-       c = copy_raw_buf[raw_buf_ptr++];
+       prev_raw_ptr = input_buf_ptr;
+       c = copy_input_buf[input_buf_ptr++];
  
         if (cstate->opts.csv_mode)
         {
@@ -891,16 +1192,16 @@ CopyReadLineText(CopyFromState cstate)
                  * If need more data, go back to loop top to load it.
                  *
                  * Note that if we are at EOF, c will wind up as '\0' because
-                * of the guaranteed pad of raw_buf.
+                * of the guaranteed pad of input_buf.
                  */
                 IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
  
                 /* get next char */
-               c = copy_raw_buf[raw_buf_ptr];
+               c = copy_input_buf[input_buf_ptr];
  
                 if (c == '\n')
                 {
-                   raw_buf_ptr++;  /* eat newline */
+                   input_buf_ptr++;    /* eat newline */
                     cstate->eol_type = EOL_CRNL;    /* in case not set yet */
                 }
                 else
@@ -967,14 +1268,14 @@ CopyReadLineText(CopyFromState cstate)
             /* -----
              * get next character
              * Note: we do not change c so if it isn't \., we can fall
-            * through and continue processing for file encoding.
+            * through and continue processing.
              * -----
              */
-           c2 = copy_raw_buf[raw_buf_ptr];
+           c2 = copy_input_buf[input_buf_ptr];
  
             if (c2 == '.')
             {
-               raw_buf_ptr++;  /* consume the '.' */
+               input_buf_ptr++;    /* consume the '.' */
  
                 /*
                  * Note: if we loop back for more data here, it does not
@@ -986,7 +1287,7 @@ CopyReadLineText(CopyFromState cstate)
                     /* Get the next character */
                     IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
                     /* if hit_eof, c2 will become '\0' */
-                   c2 = copy_raw_buf[raw_buf_ptr++];
+                   c2 = copy_input_buf[input_buf_ptr++];
  
                     if (c2 == '\n')
                     {
@@ -1011,7 +1312,7 @@ CopyReadLineText(CopyFromState cstate)
                 /* Get the next character */
                 IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
                 /* if hit_eof, c2 will become '\0' */
-               c2 = copy_raw_buf[raw_buf_ptr++];
+               c2 = copy_input_buf[input_buf_ptr++];
  
                 if (c2 != '\r' && c2 != '\n')
                 {
@@ -1036,11 +1337,11 @@ CopyReadLineText(CopyFromState cstate)
                  * Transfer only the data before the \. into line_buf, then
                  * discard the data and the \. sequence.
                  */
-               if (prev_raw_ptr > cstate->raw_buf_index)
+               if (prev_raw_ptr > cstate->input_buf_index)
                     appendBinaryStringInfo(&cstate->line_buf,
-                                          cstate->raw_buf + cstate->raw_buf_index,
-                                          prev_raw_ptr - cstate->raw_buf_index);
-               cstate->raw_buf_index = raw_buf_ptr;
+                                          cstate->input_buf + cstate->input_buf_index,
+                                          prev_raw_ptr - cstate->input_buf_index);
+               cstate->input_buf_index = input_buf_ptr;
                 result = true;  /* report EOF */
                 break;
             }
@@ -1056,15 +1357,8 @@ CopyReadLineText(CopyFromState cstate)
                  * backslashes are not special, so we want to process the
                  * character after the backslash just like a normal character,
                  * so we don't increment in those cases.
-                *
-                * Set 'c' to skip whole character correctly in multi-byte
-                * encodings.  If we don't have the whole character in the
-                * buffer yet, we might loop back to process it, after all,
-                * but that's OK because multi-byte characters cannot have any
-                * special meaning.
                  */
-               raw_buf_ptr++;
-               c = c2;
+               input_buf_ptr++;
             }
         }
  
@@ -1075,30 +1369,6 @@ CopyReadLineText(CopyFromState cstate)
          * value, while in non-CSV mode, \. cannot be a data value.
          */
  not_end_of_copy:
-
-       /*
-        * Process all bytes of a multi-byte character as a group.
-        *
-        * We only support multi-byte sequences where the first byte has the
-        * high-bit set, so as an optimization we can avoid this block
-        * entirely if it is not set.
-        */
-       if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c))
-       {
-           int         mblen;
-
-           /*
-            * It is enough to look at the first byte in all our encodings, to
-            * get the length.  (GB18030 is a bit special, but still works for
-            * our purposes; see comment in pg_gb18030_mblen())
-            */
-           mblen_str[0] = c;
-           mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str);
-
-           IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);
-           IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1);
-           raw_buf_ptr += mblen - 1;
-       }
         first_char_in_line = false;
     }                           /* end of outer loop */
  
diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h

index 705f5b615be4ce400766aa9ece22c7a2fd2a3b98..858af7a717b14ddb0fcad7caa4586fcebb85a0fc 100644 (file)
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -52,17 +52,6 @@ typedef enum CopyInsertMethod
  /*
   * This struct contains all the state variables used throughout a COPY FROM
   * operation.
- *
- * Multi-byte encodings: all supported client-side encodings encode multi-byte
- * characters by having the first byte's high bit set. Subsequent bytes of the
- * character can have the high bit not set. When scanning data in such an
- * encoding to look for a match to a single-byte (ie ASCII) character, we must
- * use the full pg_encoding_mblen() machinery to skip over multibyte
- * characters, else we might find a false match to a trailing byte. In
- * supported server encodings, there is no possibility of a false match, and
- * it's faster to make useless comparisons to trailing bytes than it is to
- * invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is true
- * when we have to do it the hard way.
   */
  typedef struct CopyFromStateData
  {
@@ -70,13 +59,11 @@ typedef struct CopyFromStateData
     CopySource  copy_src;       /* type of copy source */
     FILE       *copy_file;      /* used if copy_src == COPY_FILE */
     StringInfo  fe_msgbuf;      /* used if copy_src == COPY_NEW_FE */
-   bool        reached_eof;    /* true if we read to end of copy data (not
-                                * all copy_src types maintain this) */
  
     EolType     eol_type;       /* EOL type of input */
     int         file_encoding;  /* file or remote side's character encoding */
     bool        need_transcoding;   /* file encoding diff from server? */
-   bool        encoding_embeds_ascii;  /* ASCII can be non-first byte? */
+   Oid         conversion_proc;    /* encoding conversion function */
  
     /* parameters from the COPY command */
     Relation    rel;            /* relation to copy from */
@@ -131,31 +118,52 @@ typedef struct CopyFromStateData
  
     /*
      * Similarly, line_buf holds the whole input line being processed. The
-    * input cycle is first to read the whole line into line_buf, convert it
-    * to server encoding there, and then extract the individual attribute
-    * fields into attribute_buf.  line_buf is preserved unmodified so that we
-    * can display it in error messages if appropriate.  (In binary mode,
-    * line_buf is not used.)
+    * input cycle is first to read the whole line into line_buf, and then
+    * extract the individual attribute fields into attribute_buf.  line_buf
+    * is preserved unmodified so that we can display it in error messages if
+    * appropriate.  (In binary mode, line_buf is not used.)
      */
     StringInfoData line_buf;
-   bool        line_buf_converted; /* converted to server encoding? */
     bool        line_buf_valid; /* contains the row being processed? */
  
     /*
-    * Finally, raw_buf holds raw data read from the data source (file or
-    * client connection).  In text mode, CopyReadLine parses this data
-    * sufficiently to locate line boundaries, then transfers the data to
-    * line_buf and converts it.  In binary mode, CopyReadBinaryData fetches
-    * appropriate amounts of data from this buffer.  In both modes, we
-    * guarantee that there is a \0 at raw_buf[raw_buf_len].
+    * input_buf holds input data, already converted to database encoding.
+    *
+    * In text mode, CopyReadLine parses this data sufficiently to locate
+    * line boundaries, then transfers the data to line_buf. We guarantee
+    * that there is a \0 at input_buf[input_buf_len] at all times.  (In
+    * binary mode, input_buf is not used.)
+    *
+    * If encoding conversion is not required, input_buf is not a separate
+    * buffer but points directly to raw_buf.  In that case, input_buf_len
+    * tracks the number of bytes that have been verified as valid in the
+    * database encoding, and raw_buf_len is the total number of bytes
+    * stored in the buffer.
+    */
+#define INPUT_BUF_SIZE 65536   /* we palloc INPUT_BUF_SIZE+1 bytes */
+   char       *input_buf;
+   int         input_buf_index;    /* next byte to process */
+   int         input_buf_len;      /* total # of bytes stored */
+   bool        input_reached_eof;  /* true if we reached EOF */
+   bool        input_reached_error; /* true if a conversion error happened */
+   /* Shorthand for number of unconsumed bytes available in input_buf */
+#define INPUT_BUF_BYTES(cstate) ((cstate)->input_buf_len - (cstate)->input_buf_index)
+
+   /*
+    * raw_buf holds raw input data read from the data source (file or client
+    * connection), not yet converted to the database encoding.  Like with
+    * 'input_buf', we guarantee that there is a \0 at raw_buf[raw_buf_len].
      */
  #define RAW_BUF_SIZE 65536     /* we palloc RAW_BUF_SIZE+1 bytes */
     char       *raw_buf;
     int         raw_buf_index;  /* next byte to process */
     int         raw_buf_len;    /* total # of bytes stored */
-   uint64      bytes_processed;/* number of bytes processed so far */
+   bool        raw_reached_eof;    /* true if we reached EOF */
+
     /* Shorthand for number of unconsumed bytes available in raw_buf */
  #define RAW_BUF_BYTES(cstate) ((cstate)->raw_buf_len - (cstate)->raw_buf_index)
+
+   uint64      bytes_processed;    /* number of bytes processed so far */
  } CopyFromStateData;
  
  extern void ReceiveCopyBegin(CopyFromState cstate);
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h

index a9aaff9e6dcfad72f28a8e0583083ece87dbc004..0f31e683189d7d3b985e839f6ec12e99d36cc9b2 100644 (file)
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -306,15 +306,33 @@ typedef enum pg_enc
  
  /*
   * When converting strings between different encodings, we assume that space
- * for converted result is 4-to-1 growth in the worst case. The rate for
+ * for converted result is 4-to-1 growth in the worst case.  The rate for
   * currently supported encoding pairs are within 3 (SJIS JIS X0201 half width
- * kanna -> UTF8 is the worst case).  So "4" should be enough for the moment.
+ * kana -> UTF8 is the worst case).  So "4" should be enough for the moment.
   *
   * Note that this is not the same as the maximum character width in any
   * particular encoding.
   */
  #define MAX_CONVERSION_GROWTH  4
  
+/*
+ * Maximum byte length of a string that's required in any encoding to convert
+ * at least one character to any other encoding.  In other words, if you feed
+ * MAX_CONVERSION_INPUT_LENGTH bytes to any encoding conversion function, it
+ * is guaranteed to be able to convert something without needing more input
+ * (assuming the input is valid).
+ *
+ * Currently, the maximum case is the conversion UTF8 -> SJIS JIS X0201 half
+ * width kana, where a pair of UTF-8 characters is converted into a single
+ * SHIFT_JIS_2004 character (the reverse of the worst case for
+ * MAX_CONVERSION_GROWTH).  It needs 6 bytes of input.  In theory, a
+ * user-defined conversion function might have more complicated cases, although
+ * for the reverse mapping you would probably also need to bump up
+ * MAX_CONVERSION_GROWTH.  But there is no need to be stingy here, so make it
+ * generous.
+ */
+#define MAX_CONVERSION_INPUT_LENGTH    16
+
  /*
   * Maximum byte length of the string equivalent to any one Unicode code point,
   * in any backend encoding.  The current value assumes that a 4-byte UTF-8
author	Heikki Linnakangas <[email protected]>
	Thu, 1 Apr 2021 09:23:40 +0000 (12:23 +0300)
committer	Heikki Linnakangas <[email protected]>
	Thu, 1 Apr 2021 09:23:40 +0000 (12:23 +0300)
src/backend/commands/copyfrom.c		patch \| blob \| blame \| history
src/backend/commands/copyfromparse.c		patch \| blob \| blame \| history
src/include/commands/copyfrom_internal.h		patch \| blob \| blame \| history
src/include/mb/pg_wchar.h		patch \| blob \| blame \| history