Change type "char"'s I/O format for non-ASCII characters.

author Tom Lane <[email protected]>
Tue, 2 Aug 2022 14:29:35 +0000 (10:29 -0400)
committer Tom Lane <[email protected]>
Tue, 2 Aug 2022 14:29:35 +0000 (10:29 -0400)
diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml

index 8e30b82273cf733b18f4addceb01401c826e0136..4cc9e592708ba4e5767dafc48010c2a0d39a7f06 100644 (file)
--- a/doc/src/sgml/datatype.sgml
+++ b/doc/src/sgml/datatype.sgml
@@ -1338,9 +1338,10 @@ SELECT b, char_length(b) FROM test2;
     <para>
      There are two other fixed-length character types in
      <productname>PostgreSQL</productname>, shown in <xref
-    linkend="datatype-character-special-table"/>. The <type>name</type>
-    type exists <emphasis>only</emphasis> for the storage of identifiers
-    in the internal system catalogs and is not intended for use by the general user. Its
+    linkend="datatype-character-special-table"/>.
+    These are not intended for general-purpose use, only for use
+    in the internal system catalogs.
+    The <type>name</type> type is used to store identifiers. Its
      length is currently defined as 64 bytes (63 usable characters plus
      terminator) but should be referenced using the constant
      <symbol>NAMEDATALEN</symbol> in <literal>C</literal> source code.
@@ -1348,7 +1349,8 @@ SELECT b, char_length(b) FROM test2;
      is therefore adjustable for special uses); the default maximum
      length might change in a future release. The type <type>"char"</type>
      (note the quotes) is different from <type>char(1)</type> in that it
-    only uses one byte of storage. It is internally used in the system
+    only uses one byte of storage, and therefore can store only a single
+    ASCII character. It is used in the system
      catalogs as a simplistic enumeration type.
     </para>
  
diff --git a/src/backend/utils/adt/char.c b/src/backend/utils/adt/char.c

index 0df41c2253888cbaed1c6c28568d50db875b15fd..e50293bf14c42c928d280f768de4a79d2b11a0b7 100644 (file)
--- a/src/backend/utils/adt/char.c
+++ b/src/backend/utils/adt/char.c
@@ -20,6 +20,11 @@
  #include "libpq/pqformat.h"
  #include "utils/builtins.h"
  
+#define ISOCTAL(c)   (((c) >= '0') && ((c) <= '7'))
+#define TOOCTAL(c)   ((c) + '0')
+#define FROMOCTAL(c) ((unsigned char) (c) - '0')
+
+
  /*****************************************************************************
   *      USER I/O ROUTINES                                                                                                               *
   *****************************************************************************/
@@ -27,31 +32,53 @@
  /*
   *             charin                  - converts "x" to 'x'
   *
- * Note that an empty input string will implicitly be converted to \0.
+ * This accepts the formats charout produces.  If we have multibyte input
+ * that is not in the form '\ooo', then we take its first byte as the value
+ * and silently discard the rest; this is a backwards-compatibility provision.
   */
  Datum
  charin(PG_FUNCTION_ARGS)
  {
         char       *ch = PG_GETARG_CSTRING(0);
  
+       if (strlen(ch) == 4 && ch[0] == '\\' &&
+               ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3]))
+               PG_RETURN_CHAR((FROMOCTAL(ch[1]) << 6) +
+                                          (FROMOCTAL(ch[2]) << 3) +
+                                          FROMOCTAL(ch[3]));
+       /* This will do the right thing for a zero-length input string */
         PG_RETURN_CHAR(ch[0]);
  }
  
  /*
   *             charout                 - converts 'x' to "x"
   *
- * Note that if the char value is \0, the resulting string will appear
- * to be empty (null-terminated after zero characters).  So this is the
- * inverse of the charin() function for such data.
+ * The possible output formats are:
+ * 1. 0x00 is represented as an empty string.
+ * 2. 0x01..0x7F are represented as a single ASCII byte.
+ * 3. 0x80..0xFF are represented as \ooo (backslash and 3 octal digits).
+ * Case 3 is meant to match the traditional "escape" format of bytea.
   */
  Datum
  charout(PG_FUNCTION_ARGS)
  {
         char            ch = PG_GETARG_CHAR(0);
-       char       *result = (char *) palloc(2);
+       char       *result = (char *) palloc(5);
  
-       result[0] = ch;
-       result[1] = '\0';
+       if (IS_HIGHBIT_SET(ch))
+       {
+               result[0] = '\\';
+               result[1] = TOOCTAL(((unsigned char) ch) >> 6);
+               result[2] = TOOCTAL((((unsigned char) ch) >> 3) & 07);
+               result[3] = TOOCTAL(((unsigned char) ch) & 07);
+               result[4] = '\0';
+       }
+       else
+       {
+               /* This produces acceptable results for 0x00 as well */
+               result[0] = ch;
+               result[1] = '\0';
+       }
         PG_RETURN_CSTRING(result);
  }
  
@@ -176,15 +203,20 @@ Datum
  text_char(PG_FUNCTION_ARGS)
  {
         text       *arg1 = PG_GETARG_TEXT_PP(0);
+       char       *ch = VARDATA_ANY(arg1);
         char            result;
  
         /*
-        * An empty input string is converted to \0 (for consistency with charin).
-        * If the input is longer than one character, the excess data is silently
-        * discarded.
+        * Conversion rules are the same as in charin(), but here we need to
+        * handle the empty-string case honestly.
          */
-       if (VARSIZE_ANY_EXHDR(arg1) > 0)
-               result = *(VARDATA_ANY(arg1));
+       if (VARSIZE_ANY_EXHDR(arg1) == 4 && ch[0] == '\\' &&
+               ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3]))
+               result = (FROMOCTAL(ch[1]) << 6) +
+                       (FROMOCTAL(ch[2]) << 3) +
+                       FROMOCTAL(ch[3]);
+       else if (VARSIZE_ANY_EXHDR(arg1) > 0)
+               result = ch[0];
         else
                 result = '\0';
  
@@ -195,13 +227,21 @@ Datum
  char_text(PG_FUNCTION_ARGS)
  {
         char            arg1 = PG_GETARG_CHAR(0);
-       text       *result = palloc(VARHDRSZ + 1);
+       text       *result = palloc(VARHDRSZ + 4);
  
         /*
-        * Convert \0 to an empty string, for consistency with charout (and
-        * because the text stuff doesn't like embedded nulls all that well).
+        * Conversion rules are the same as in charout(), but here we need to be
+        * honest about converting 0x00 to an empty string.
          */
-       if (arg1 != '\0')
+       if (IS_HIGHBIT_SET(arg1))
+       {
+               SET_VARSIZE(result, VARHDRSZ + 4);
+               (VARDATA(result))[0] = '\\';
+               (VARDATA(result))[1] = TOOCTAL(((unsigned char) arg1) >> 6);
+               (VARDATA(result))[2] = TOOCTAL((((unsigned char) arg1) >> 3) & 07);
+               (VARDATA(result))[3] = TOOCTAL(((unsigned char) arg1) & 07);
+       }
+       else if (arg1 != '\0')
         {
                 SET_VARSIZE(result, VARHDRSZ + 1);
                 *(VARDATA(result)) = arg1;
diff --git a/src/test/regress/expected/char.out b/src/test/regress/expected/char.out

index 2d78f90f3b9e5bcbd0600aefbf033bff00e5ac86..ea9b0b8eeb3ffe4c9464bafe853e6d1e4fb6c478 100644 (file)
--- a/src/test/regress/expected/char.out
+++ b/src/test/regress/expected/char.out
@@ -1,8 +1,8 @@
  --
  -- CHAR
  --
--- fixed-length by value
--- internally passed by value if <= 4 bytes in storage
+-- Per SQL standard, CHAR means character(1), that is a varlena type
+-- with a constraint restricting it to one character (not byte)
  SELECT char 'c' = char 'c' AS true;
   true 
  ------
@@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
   abcd
  (4 rows)
  
+--
+-- Also test "char", which is an ad-hoc one-byte type.  It can only
+-- really store ASCII characters, but we allow high-bit-set characters
+-- to be accessed via bytea-like escapes.
+--
+SELECT 'a'::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\101'::"char";
+ char 
+------
+ A
+(1 row)
+
+SELECT '\377'::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT 'a'::"char"::text;
+ text 
+------
+ a
+(1 row)
+
+SELECT '\377'::"char"::text;
+ text 
+------
+ \377
+(1 row)
+
+SELECT '\000'::"char"::text;
+ text 
+------
+ 
+(1 row)
+
+SELECT 'a'::text::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\377'::text::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT ''::text::"char";
+ char 
+------
+ 
+(1 row)
+
diff --git a/src/test/regress/expected/char_1.out b/src/test/regress/expected/char_1.out

index fa6644d6927776e400e6766ed5994f34b454ce8a..ffd31551de58df53ebfad3a6f70ed92e63f13c04 100644 (file)
--- a/src/test/regress/expected/char_1.out
+++ b/src/test/regress/expected/char_1.out
@@ -1,8 +1,8 @@
  --
  -- CHAR
  --
--- fixed-length by value
--- internally passed by value if <= 4 bytes in storage
+-- Per SQL standard, CHAR means character(1), that is a varlena type
+-- with a constraint restricting it to one character (not byte)
  SELECT char 'c' = char 'c' AS true;
   true 
  ------
@@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
   abcd
  (4 rows)
  
+--
+-- Also test "char", which is an ad-hoc one-byte type.  It can only
+-- really store ASCII characters, but we allow high-bit-set characters
+-- to be accessed via bytea-like escapes.
+--
+SELECT 'a'::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\101'::"char";
+ char 
+------
+ A
+(1 row)
+
+SELECT '\377'::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT 'a'::"char"::text;
+ text 
+------
+ a
+(1 row)
+
+SELECT '\377'::"char"::text;
+ text 
+------
+ \377
+(1 row)
+
+SELECT '\000'::"char"::text;
+ text 
+------
+ 
+(1 row)
+
+SELECT 'a'::text::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\377'::text::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT ''::text::"char";
+ char 
+------
+ 
+(1 row)
+
diff --git a/src/test/regress/expected/char_2.out b/src/test/regress/expected/char_2.out

index 09434a44cdcc5b9423134e7809c2fb8cfb1d79dd..56818f824b5f8d27a6dfa613b9ae9b71a3a64ec3 100644 (file)
--- a/src/test/regress/expected/char_2.out
+++ b/src/test/regress/expected/char_2.out
@@ -1,8 +1,8 @@
  --
  -- CHAR
  --
--- fixed-length by value
--- internally passed by value if <= 4 bytes in storage
+-- Per SQL standard, CHAR means character(1), that is a varlena type
+-- with a constraint restricting it to one character (not byte)
  SELECT char 'c' = char 'c' AS true;
   true 
  ------
@@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
   abcd
  (4 rows)
  
+--
+-- Also test "char", which is an ad-hoc one-byte type.  It can only
+-- really store ASCII characters, but we allow high-bit-set characters
+-- to be accessed via bytea-like escapes.
+--
+SELECT 'a'::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\101'::"char";
+ char 
+------
+ A
+(1 row)
+
+SELECT '\377'::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT 'a'::"char"::text;
+ text 
+------
+ a
+(1 row)
+
+SELECT '\377'::"char"::text;
+ text 
+------
+ \377
+(1 row)
+
+SELECT '\000'::"char"::text;
+ text 
+------
+ 
+(1 row)
+
+SELECT 'a'::text::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\377'::text::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT ''::text::"char";
+ char 
+------
+ 
+(1 row)
+
diff --git a/src/test/regress/sql/char.sql b/src/test/regress/sql/char.sql

index 9c83c45e340fd030bd10f63a0aeca260453e6fcd..120fed53e5c39d572ff0fe66bba2a90d5ce32d92 100644 (file)
--- a/src/test/regress/sql/char.sql
+++ b/src/test/regress/sql/char.sql
@@ -2,8 +2,8 @@
  -- CHAR
  --
  
--- fixed-length by value
--- internally passed by value if <= 4 bytes in storage
+-- Per SQL standard, CHAR means character(1), that is a varlena type
+-- with a constraint restricting it to one character (not byte)
  
  SELECT char 'c' = char 'c' AS true;
  
@@ -71,3 +71,19 @@ DROP TABLE CHAR_TBL;
  INSERT INTO CHAR_TBL (f1) VALUES ('abcde');
  
  SELECT * FROM CHAR_TBL;
+
+--
+-- Also test "char", which is an ad-hoc one-byte type.  It can only
+-- really store ASCII characters, but we allow high-bit-set characters
+-- to be accessed via bytea-like escapes.
+--
+
+SELECT 'a'::"char";
+SELECT '\101'::"char";
+SELECT '\377'::"char";
+SELECT 'a'::"char"::text;
+SELECT '\377'::"char"::text;
+SELECT '\000'::"char"::text;
+SELECT 'a'::text::"char";
+SELECT '\377'::text::"char";
+SELECT ''::text::"char";
author	Tom Lane <[email protected]>
	Tue, 2 Aug 2022 14:29:35 +0000 (10:29 -0400)
committer	Tom Lane <[email protected]>
	Tue, 2 Aug 2022 14:29:35 +0000 (10:29 -0400)
doc/src/sgml/datatype.sgml		patch | blob | blame | history
src/backend/utils/adt/char.c		patch | blob | blame | history
src/test/regress/expected/char.out		patch | blob | blame | history
src/test/regress/expected/char_1.out		patch | blob | blame | history
src/test/regress/expected/char_2.out		patch | blob | blame | history
src/test/regress/sql/char.sql		patch | blob | blame | history