From 3853a6956c3e3bc7a6fa9bcdb205a2997f46bac2 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Wed, 29 Oct 2025 14:17:13 -0700 Subject: [PATCH] Use C11 char16_t and char32_t for Unicode code points. Reviewed-by: Tatsuo Ishii Reviewed-by: Thomas Munro Reviewed-by: Peter Eisentraut Discussion: https://postgr.es/m/bedcc93d06203dfd89815b10f815ca2de8626e85.camel%40j-davis.com --- configure | 2 +- configure.ac | 1 + meson.build | 1 + src/backend/parser/parser.c | 8 +-- src/backend/parser/scan.l | 8 +-- src/backend/utils/adt/jsonpath_scan.l | 6 +- src/backend/utils/adt/pg_locale_builtin.c | 44 ++++++++++----- src/backend/utils/adt/varlena.c | 40 ++++++------- src/backend/utils/mb/mbutils.c | 4 +- src/common/saslprep.c | 48 ++++++++-------- src/common/unicode/case_test.c | 23 ++++---- src/common/unicode/category_test.c | 3 +- .../unicode/generate-norm_test_table.pl | 4 +- .../unicode/generate-unicode_case_table.pl | 7 +-- .../generate-unicode_category_table.pl | 8 +-- src/common/unicode/norm_test.c | 6 +- src/common/unicode_case.c | 56 +++++++++---------- src/common/unicode_category.c | 50 ++++++++--------- src/common/unicode_norm.c | 56 +++++++++---------- src/fe_utils/mbprint.c | 10 ++-- src/include/c.h | 23 ++++++++ src/include/common/unicode_case.h | 10 ++-- src/include/common/unicode_case_table.h | 13 ++--- src/include/common/unicode_category.h | 46 ++++++++------- src/include/common/unicode_category_table.h | 8 +-- src/include/common/unicode_norm.h | 6 +- src/include/mb/pg_wchar.h | 32 +++++------ src/include/pg_config.h.in | 3 + src/tools/pgindent/typedefs.list | 2 + 29 files changed, 284 insertions(+), 244 deletions(-) diff --git a/configure b/configure index 7ce52173dd8..f7c24c8f576 100755 --- a/configure +++ b/configure @@ -13627,7 +13627,7 @@ fi ## Header files ## -for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h termios.h ucred.h xlocale.h 
+for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h termios.h uchar.h ucred.h xlocale.h do : as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" diff --git a/configure.ac b/configure.ac index 0842fd06259..6c802deaacb 100644 --- a/configure.ac +++ b/configure.ac @@ -1513,6 +1513,7 @@ AC_CHECK_HEADERS(m4_normalize([ sys/signalfd.h sys/ucred.h termios.h + uchar.h ucred.h xlocale.h ])) diff --git a/meson.build b/meson.build index 1a123ce151a..0f61ff6a700 100644 --- a/meson.build +++ b/meson.build @@ -2613,6 +2613,7 @@ header_checks = [ 'sys/signalfd.h', 'sys/ucred.h', 'termios.h', + 'uchar.h', 'ucred.h', 'xlocale.h', ] diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c index 33a040506b4..a3679f8e86c 100644 --- a/src/backend/parser/parser.c +++ b/src/backend/parser/parser.c @@ -339,7 +339,7 @@ hexval(unsigned char c) /* is Unicode code point acceptable? 
*/ static void -check_unicode_value(pg_wchar c) +check_unicode_value(char32_t c) { if (!is_valid_unicode_codepoint(c)) ereport(ERROR, @@ -376,7 +376,7 @@ str_udeescape(const char *str, char escape, char *new, *out; size_t new_len; - pg_wchar pair_first = 0; + char16_t pair_first = 0; ScannerCallbackState scbstate; /* @@ -420,7 +420,7 @@ str_udeescape(const char *str, char escape, isxdigit((unsigned char) in[3]) && isxdigit((unsigned char) in[4])) { - pg_wchar unicode; + char32_t unicode; unicode = (hexval(in[1]) << 12) + (hexval(in[2]) << 8) + @@ -457,7 +457,7 @@ str_udeescape(const char *str, char escape, isxdigit((unsigned char) in[6]) && isxdigit((unsigned char) in[7])) { - pg_wchar unicode; + char32_t unicode; unicode = (hexval(in[2]) << 20) + (hexval(in[3]) << 16) + diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index 08990831fe8..a67815339b7 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -121,7 +121,7 @@ static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner); static char *litbufdup(core_yyscan_t yyscanner); static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner); static int process_integer_literal(const char *token, YYSTYPE *lval, int base); -static void addunicode(pg_wchar c, yyscan_t yyscanner); +static void addunicode(char32_t c, yyscan_t yyscanner); #define yyerror(msg) scanner_yyerror(msg, yyscanner) @@ -640,7 +640,7 @@ other . addlit(yytext, yyleng, yyscanner); } {xeunicode} { - pg_wchar c = strtoul(yytext + 2, NULL, 16); + char32_t c = strtoul(yytext + 2, NULL, 16); /* * For consistency with other productions, issue any @@ -668,7 +668,7 @@ other . POP_YYLLOC(); } {xeunicode} { - pg_wchar c = strtoul(yytext + 2, NULL, 16); + char32_t c = strtoul(yytext + 2, NULL, 16); /* Remember start of overall string token ... 
*/ PUSH_YYLLOC(); @@ -1376,7 +1376,7 @@ process_integer_literal(const char *token, YYSTYPE *lval, int base) } static void -addunicode(pg_wchar c, core_yyscan_t yyscanner) +addunicode(char32_t c, core_yyscan_t yyscanner) { ScannerCallbackState scbstate; char buf[MAX_UNICODE_EQUIVALENT_STRING + 1]; diff --git a/src/backend/utils/adt/jsonpath_scan.l b/src/backend/utils/adt/jsonpath_scan.l index c7aab83eeb4..8c3a0a9c642 100644 --- a/src/backend/utils/adt/jsonpath_scan.l +++ b/src/backend/utils/adt/jsonpath_scan.l @@ -574,7 +574,7 @@ hexval(char c, int *result, struct Node *escontext, yyscan_t yyscanner) /* Add given unicode character to scanstring */ static bool -addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner) +addUnicodeChar(char32_t ch, struct Node *escontext, yyscan_t yyscanner) { if (ch == 0) { @@ -607,7 +607,7 @@ addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner) /* Add unicode character, processing any surrogate pairs */ static bool -addUnicode(int ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner) +addUnicode(char32_t ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner) { if (is_utf16_surrogate_first(ch)) { @@ -655,7 +655,7 @@ parseUnicode(char *s, int l, struct Node *escontext, yyscan_t yyscanner) for (i = 2; i < l; i += 2) /* skip '\u' */ { - int ch = 0; + char32_t ch = 0; int j, si; diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 3dc611b50e1..1021e0d129b 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -15,7 +15,6 @@ #include "catalog/pg_collation.h" #include "common/unicode_case.h" #include "common/unicode_category.h" -#include "mb/pg_wchar.h" #include "miscadmin.h" #include "utils/builtins.h" #include "utils/pg_locale.h" @@ -35,6 +34,23 @@ struct WordBoundaryState bool prev_alnum; }; +/* + * In UTF-8, pg_wchar is guaranteed to be the code point value. 
+ */ +static inline char32_t +to_char32(pg_wchar wc) +{ + Assert(GetDatabaseEncoding() == PG_UTF8); + return (char32_t) wc; +} + +static inline pg_wchar +to_pg_wchar(char32_t c32) +{ + Assert(GetDatabaseEncoding() == PG_UTF8); + return (pg_wchar) c32; +} + /* * Simple word boundary iterator that draws boundaries each time the result of * pg_u_isalnum() changes. @@ -47,7 +63,7 @@ initcap_wbnext(void *state) while (wbstate->offset < wbstate->len && wbstate->str[wbstate->offset] != '\0') { - pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str + + char32_t u = utf8_to_unicode((unsigned char *) wbstate->str + wbstate->offset); bool curr_alnum = pg_u_isalnum(u, wbstate->posix); @@ -112,61 +128,61 @@ strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, static bool wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_isdigit(wc, !locale->builtin.casemap_full); + return pg_u_isdigit(to_char32(wc), !locale->builtin.casemap_full); } static bool wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_isalpha(wc); + return pg_u_isalpha(to_char32(wc)); } static bool wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_isalnum(wc, !locale->builtin.casemap_full); + return pg_u_isalnum(to_char32(wc), !locale->builtin.casemap_full); } static bool wc_isupper_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_isupper(wc); + return pg_u_isupper(to_char32(wc)); } static bool wc_islower_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_islower(wc); + return pg_u_islower(to_char32(wc)); } static bool wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_isgraph(wc); + return pg_u_isgraph(to_char32(wc)); } static bool wc_isprint_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_isprint(wc); + return pg_u_isprint(to_char32(wc)); } static bool wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_ispunct(wc, !locale->builtin.casemap_full); + return 
pg_u_ispunct(to_char32(wc), !locale->builtin.casemap_full); } static bool wc_isspace_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_isspace(wc); + return pg_u_isspace(to_char32(wc)); } static bool wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_isxdigit(wc, !locale->builtin.casemap_full); + return pg_u_isxdigit(to_char32(wc), !locale->builtin.casemap_full); } static bool @@ -179,13 +195,13 @@ char_is_cased_builtin(char ch, pg_locale_t locale) static pg_wchar wc_toupper_builtin(pg_wchar wc, pg_locale_t locale) { - return unicode_uppercase_simple(wc); + return to_pg_wchar(unicode_uppercase_simple(to_char32(wc))); } static pg_wchar wc_tolower_builtin(pg_wchar wc, pg_locale_t locale) { - return unicode_lowercase_simple(wc); + return to_pg_wchar(unicode_lowercase_simple(to_char32(wc))); } static const struct ctype_methods ctype_methods_builtin = { diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 2c398cd9e5c..8d735786e51 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -5419,12 +5419,12 @@ unicode_assigned(PG_FUNCTION_ARGS) ereport(ERROR, (errmsg("Unicode categorization can only be performed if server encoding is UTF8"))); - /* convert to pg_wchar */ + /* convert to char32_t */ size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); p = (unsigned char *) VARDATA_ANY(input); for (int i = 0; i < size; i++) { - pg_wchar uchar = utf8_to_unicode(p); + char32_t uchar = utf8_to_unicode(p); int category = unicode_category(uchar); if (category == PG_U_UNASSIGNED) @@ -5443,24 +5443,24 @@ unicode_normalize_func(PG_FUNCTION_ARGS) char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1)); UnicodeNormalizationForm form; int size; - pg_wchar *input_chars; - pg_wchar *output_chars; + char32_t *input_chars; + char32_t *output_chars; unsigned char *p; text *result; int i; form = unicode_norm_form_from_string(formstr); - /* convert to pg_wchar */ + /* convert to char32_t 
*/ size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); - input_chars = palloc((size + 1) * sizeof(pg_wchar)); + input_chars = palloc((size + 1) * sizeof(char32_t)); p = (unsigned char *) VARDATA_ANY(input); for (i = 0; i < size; i++) { input_chars[i] = utf8_to_unicode(p); p += pg_utf_mblen(p); } - input_chars[i] = (pg_wchar) '\0'; + input_chars[i] = (char32_t) '\0'; Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input)); /* action */ @@ -5468,7 +5468,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS) /* convert back to UTF-8 string */ size = 0; - for (pg_wchar *wp = output_chars; *wp; wp++) + for (char32_t *wp = output_chars; *wp; wp++) { unsigned char buf[4]; @@ -5480,7 +5480,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS) SET_VARSIZE(result, size + VARHDRSZ); p = (unsigned char *) VARDATA_ANY(result); - for (pg_wchar *wp = output_chars; *wp; wp++) + for (char32_t *wp = output_chars; *wp; wp++) { unicode_to_utf8(*wp, p); p += pg_utf_mblen(p); @@ -5509,8 +5509,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS) char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1)); UnicodeNormalizationForm form; int size; - pg_wchar *input_chars; - pg_wchar *output_chars; + char32_t *input_chars; + char32_t *output_chars; unsigned char *p; int i; UnicodeNormalizationQC quickcheck; @@ -5519,16 +5519,16 @@ unicode_is_normalized(PG_FUNCTION_ARGS) form = unicode_norm_form_from_string(formstr); - /* convert to pg_wchar */ + /* convert to char32_t */ size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); - input_chars = palloc((size + 1) * sizeof(pg_wchar)); + input_chars = palloc((size + 1) * sizeof(char32_t)); p = (unsigned char *) VARDATA_ANY(input); for (i = 0; i < size; i++) { input_chars[i] = utf8_to_unicode(p); p += pg_utf_mblen(p); } - input_chars[i] = (pg_wchar) '\0'; + input_chars[i] = (char32_t) '\0'; Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input)); /* quick check (see UAX #15) */ @@ -5542,11 +5542,11 @@ 
unicode_is_normalized(PG_FUNCTION_ARGS) output_chars = unicode_normalize(form, input_chars); output_size = 0; - for (pg_wchar *wp = output_chars; *wp; wp++) + for (char32_t *wp = output_chars; *wp; wp++) output_size++; result = (size == output_size) && - (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0); + (memcmp(input_chars, output_chars, size * sizeof(char32_t)) == 0); PG_RETURN_BOOL(result); } @@ -5602,7 +5602,7 @@ unistr(PG_FUNCTION_ARGS) int len; StringInfoData str; text *result; - pg_wchar pair_first = 0; + char16_t pair_first = 0; char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1]; instr = VARDATA_ANY(input_text); @@ -5626,7 +5626,7 @@ unistr(PG_FUNCTION_ARGS) else if ((len >= 5 && isxdigits_n(instr + 1, 4)) || (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4))) { - pg_wchar unicode; + char32_t unicode; int offset = instr[1] == 'u' ? 2 : 1; unicode = hexval_n(instr + offset, 4); @@ -5662,7 +5662,7 @@ unistr(PG_FUNCTION_ARGS) } else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6)) { - pg_wchar unicode; + char32_t unicode; unicode = hexval_n(instr + 2, 6); @@ -5697,7 +5697,7 @@ unistr(PG_FUNCTION_ARGS) } else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8)) { - pg_wchar unicode; + char32_t unicode; unicode = hexval_n(instr + 2, 8); diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 886ecbad871..fb629ed5c8f 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -862,7 +862,7 @@ perform_default_encoding_conversion(const char *src, int len, * may call this outside any transaction, or in an aborted transaction. */ void -pg_unicode_to_server(pg_wchar c, unsigned char *s) +pg_unicode_to_server(char32_t c, unsigned char *s) { unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1]; int c_as_utf8_len; @@ -924,7 +924,7 @@ pg_unicode_to_server(pg_wchar c, unsigned char *s) * but simply return false on conversion failure. 
*/ bool -pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s) +pg_unicode_to_server_noerror(char32_t c, unsigned char *s) { unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1]; int c_as_utf8_len; diff --git a/src/common/saslprep.c b/src/common/saslprep.c index 97beb47940b..101e8d65a4d 100644 --- a/src/common/saslprep.c +++ b/src/common/saslprep.c @@ -47,7 +47,7 @@ /* Prototypes for local functions */ static int codepoint_range_cmp(const void *a, const void *b); -static bool is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize); +static bool is_code_in_table(char32_t code, const char32_t *map, int mapsize); static int pg_utf8_string_len(const char *source); /* @@ -64,7 +64,7 @@ static int pg_utf8_string_len(const char *source); * * These are all mapped to the ASCII space character (U+00A0). */ -static const pg_wchar non_ascii_space_ranges[] = +static const char32_t non_ascii_space_ranges[] = { 0x00A0, 0x00A0, 0x1680, 0x1680, @@ -79,7 +79,7 @@ static const pg_wchar non_ascii_space_ranges[] = * * If any of these appear in the input, they are removed. */ -static const pg_wchar commonly_mapped_to_nothing_ranges[] = +static const char32_t commonly_mapped_to_nothing_ranges[] = { 0x00AD, 0x00AD, 0x034F, 0x034F, @@ -114,7 +114,7 @@ static const pg_wchar commonly_mapped_to_nothing_ranges[] = * tables, so one code might originate from multiple source tables. * Adjacent ranges have also been merged together, to save space. 
*/ -static const pg_wchar prohibited_output_ranges[] = +static const char32_t prohibited_output_ranges[] = { 0x0000, 0x001F, /* C.2.1 */ 0x007F, 0x00A0, /* C.1.2, C.2.1, C.2.2 */ @@ -155,7 +155,7 @@ static const pg_wchar prohibited_output_ranges[] = }; /* A.1 Unassigned code points in Unicode 3.2 */ -static const pg_wchar unassigned_codepoint_ranges[] = +static const char32_t unassigned_codepoint_ranges[] = { 0x0221, 0x0221, 0x0234, 0x024F, @@ -556,7 +556,7 @@ static const pg_wchar unassigned_codepoint_ranges[] = }; /* D.1 Characters with bidirectional property "R" or "AL" */ -static const pg_wchar RandALCat_codepoint_ranges[] = +static const char32_t RandALCat_codepoint_ranges[] = { 0x05BE, 0x05BE, 0x05C0, 0x05C0, @@ -595,7 +595,7 @@ static const pg_wchar RandALCat_codepoint_ranges[] = }; /* D.2 Characters with bidirectional property "L" */ -static const pg_wchar LCat_codepoint_ranges[] = +static const char32_t LCat_codepoint_ranges[] = { 0x0041, 0x005A, 0x0061, 0x007A, @@ -968,8 +968,8 @@ static const pg_wchar LCat_codepoint_ranges[] = static int codepoint_range_cmp(const void *a, const void *b) { - const pg_wchar *key = (const pg_wchar *) a; - const pg_wchar *range = (const pg_wchar *) b; + const char32_t *key = (const char32_t *) a; + const char32_t *range = (const char32_t *) b; if (*key < range[0]) return -1; /* less than lower bound */ @@ -980,14 +980,14 @@ codepoint_range_cmp(const void *a, const void *b) } static bool -is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize) +is_code_in_table(char32_t code, const char32_t *map, int mapsize) { Assert(mapsize % 2 == 0); if (code < map[0] || code > map[mapsize - 1]) return false; - if (bsearch(&code, map, mapsize / 2, sizeof(pg_wchar) * 2, + if (bsearch(&code, map, mapsize / 2, sizeof(char32_t) * 2, codepoint_range_cmp)) return true; else @@ -1046,8 +1046,8 @@ pg_utf8_string_len(const char *source) pg_saslprep_rc pg_saslprep(const char *input, char **output) { - pg_wchar *input_chars = NULL; - 
pg_wchar *output_chars = NULL; + char32_t *input_chars = NULL; + char32_t *output_chars = NULL; int input_size; char *result; int result_size; @@ -1055,7 +1055,7 @@ pg_saslprep(const char *input, char **output) int i; bool contains_RandALCat; unsigned char *p; - pg_wchar *wp; + char32_t *wp; /* Ensure we return *output as NULL on failure */ *output = NULL; @@ -1080,10 +1080,10 @@ pg_saslprep(const char *input, char **output) input_size = pg_utf8_string_len(input); if (input_size < 0) return SASLPREP_INVALID_UTF8; - if (input_size >= MaxAllocSize / sizeof(pg_wchar)) + if (input_size >= MaxAllocSize / sizeof(char32_t)) goto oom; - input_chars = ALLOC((input_size + 1) * sizeof(pg_wchar)); + input_chars = ALLOC((input_size + 1) * sizeof(char32_t)); if (!input_chars) goto oom; @@ -1093,7 +1093,7 @@ pg_saslprep(const char *input, char **output) input_chars[i] = utf8_to_unicode(p); p += pg_utf_mblen(p); } - input_chars[i] = (pg_wchar) '\0'; + input_chars[i] = (char32_t) '\0'; /* * The steps below correspond to the steps listed in [RFC3454], Section @@ -1107,7 +1107,7 @@ pg_saslprep(const char *input, char **output) count = 0; for (i = 0; i < input_size; i++) { - pg_wchar code = input_chars[i]; + char32_t code = input_chars[i]; if (IS_CODE_IN_TABLE(code, non_ascii_space_ranges)) input_chars[count++] = 0x0020; @@ -1118,7 +1118,7 @@ pg_saslprep(const char *input, char **output) else input_chars[count++] = code; } - input_chars[count] = (pg_wchar) '\0'; + input_chars[count] = (char32_t) '\0'; input_size = count; if (input_size == 0) @@ -1138,7 +1138,7 @@ pg_saslprep(const char *input, char **output) */ for (i = 0; i < input_size; i++) { - pg_wchar code = input_chars[i]; + char32_t code = input_chars[i]; if (IS_CODE_IN_TABLE(code, prohibited_output_ranges)) goto prohibited; @@ -1170,7 +1170,7 @@ pg_saslprep(const char *input, char **output) contains_RandALCat = false; for (i = 0; i < input_size; i++) { - pg_wchar code = input_chars[i]; + char32_t code = input_chars[i]; if 
(IS_CODE_IN_TABLE(code, RandALCat_codepoint_ranges)) { @@ -1181,12 +1181,12 @@ pg_saslprep(const char *input, char **output) if (contains_RandALCat) { - pg_wchar first = input_chars[0]; - pg_wchar last = input_chars[input_size - 1]; + char32_t first = input_chars[0]; + char32_t last = input_chars[input_size - 1]; for (i = 0; i < input_size; i++) { - pg_wchar code = input_chars[i]; + char32_t code = input_chars[i]; if (IS_CODE_IN_TABLE(code, LCat_codepoint_ranges)) goto prohibited; diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c index fdfb62e8552..00d4f85e5a5 100644 --- a/src/common/unicode/case_test.c +++ b/src/common/unicode/case_test.c @@ -24,6 +24,7 @@ #include "common/unicode_case.h" #include "common/unicode_category.h" #include "common/unicode_version.h" +#include "mb/pg_wchar.h" /* enough to hold largest source or result string, including NUL */ #define BUFSZ 256 @@ -54,7 +55,7 @@ initcap_wbnext(void *state) while (wbstate->offset < wbstate->len && wbstate->str[wbstate->offset] != '\0') { - pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str + + char32_t u = utf8_to_unicode((unsigned char *) wbstate->str + wbstate->offset); bool curr_alnum = pg_u_isalnum(u, wbstate->posix); @@ -77,16 +78,16 @@ initcap_wbnext(void *state) #ifdef USE_ICU static void -icu_test_simple(pg_wchar code) +icu_test_simple(char32_t code) { - pg_wchar lower = unicode_lowercase_simple(code); - pg_wchar title = unicode_titlecase_simple(code); - pg_wchar upper = unicode_uppercase_simple(code); - pg_wchar fold = unicode_casefold_simple(code); - pg_wchar iculower = u_tolower(code); - pg_wchar icutitle = u_totitle(code); - pg_wchar icuupper = u_toupper(code); - pg_wchar icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT); + char32_t lower = unicode_lowercase_simple(code); + char32_t title = unicode_titlecase_simple(code); + char32_t upper = unicode_uppercase_simple(code); + char32_t fold = unicode_casefold_simple(code); + char32_t iculower = u_tolower(code); + 
char32_t icutitle = u_totitle(code); + char32_t icuupper = u_toupper(code); + char32_t icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT); if (lower != iculower || title != icutitle || upper != icuupper || fold != icufold) @@ -172,7 +173,7 @@ test_icu(void) int successful = 0; int skipped_mismatch = 0; - for (pg_wchar code = 0; code <= 0x10ffff; code++) + for (char32_t code = 0; code <= 0x10ffff; code++) { pg_unicode_category category = unicode_category(code); diff --git a/src/common/unicode/category_test.c b/src/common/unicode/category_test.c index 5d37ba39196..1e8c1f7905f 100644 --- a/src/common/unicode/category_test.c +++ b/src/common/unicode/category_test.c @@ -22,6 +22,7 @@ #include "common/unicode_category.h" #include "common/unicode_version.h" +#include "mb/pg_wchar.h" static int pg_unicode_version = 0; #ifdef USE_ICU @@ -59,7 +60,7 @@ icu_test() int pg_skipped_codepoints = 0; int icu_skipped_codepoints = 0; - for (pg_wchar code = 0; code <= 0x10ffff; code++) + for (char32_t code = 0; code <= 0x10ffff; code++) { uint8_t pg_category = unicode_category(code); uint8_t icu_category = u_charType(code); diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl index 1b401be9409..1a8b908ff33 100644 --- a/src/common/unicode/generate-norm_test_table.pl +++ b/src/common/unicode/generate-norm_test_table.pl @@ -47,8 +47,8 @@ print $OUTPUT <input); diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c index 073faf6a0d5..e5e494db43c 100644 --- a/src/common/unicode_case.c +++ b/src/common/unicode_case.c @@ -30,7 +30,7 @@ enum CaseMapResult /* * Map for each case kind. 
*/ -static const pg_wchar *const casekind_map[NCaseKind] = +static const char32_t *const casekind_map[NCaseKind] = { [CaseLower] = case_map_lower, [CaseTitle] = case_map_title, @@ -38,42 +38,42 @@ static const pg_wchar *const casekind_map[NCaseKind] = [CaseFold] = case_map_fold, }; -static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map); +static char32_t find_case_map(char32_t ucs, const char32_t *map); static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, CaseKind str_casekind, bool full, WordBoundaryNext wbnext, void *wbstate); -static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full, +static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, - pg_wchar *simple, const pg_wchar **special); + char32_t *simple, const char32_t **special); -pg_wchar -unicode_lowercase_simple(pg_wchar code) +char32_t +unicode_lowercase_simple(char32_t code) { - pg_wchar cp = find_case_map(code, case_map_lower); + char32_t cp = find_case_map(code, case_map_lower); return cp != 0 ? cp : code; } -pg_wchar -unicode_titlecase_simple(pg_wchar code) +char32_t +unicode_titlecase_simple(char32_t code) { - pg_wchar cp = find_case_map(code, case_map_title); + char32_t cp = find_case_map(code, case_map_title); return cp != 0 ? cp : code; } -pg_wchar -unicode_uppercase_simple(pg_wchar code) +char32_t +unicode_uppercase_simple(char32_t code) { - pg_wchar cp = find_case_map(code, case_map_upper); + char32_t cp = find_case_map(code, case_map_upper); return cp != 0 ? cp : code; } -pg_wchar -unicode_casefold_simple(pg_wchar code) +char32_t +unicode_casefold_simple(char32_t code) { - pg_wchar cp = find_case_map(code, case_map_fold); + char32_t cp = find_case_map(code, case_map_fold); return cp != 0 ? 
cp : code; } @@ -231,10 +231,10 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0') { - pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff); + char32_t u1 = utf8_to_unicode((unsigned char *) src + srcoff); int u1len = unicode_utf8len(u1); - pg_wchar simple = 0; - const pg_wchar *special = NULL; + char32_t simple = 0; + const char32_t *special = NULL; enum CaseMapResult casemap_result; if (str_casekind == CaseTitle) @@ -265,8 +265,8 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, case CASEMAP_SIMPLE: { /* replace with single character */ - pg_wchar u2 = simple; - pg_wchar u2len = unicode_utf8len(u2); + char32_t u2 = simple; + char32_t u2len = unicode_utf8len(u2); Assert(special == NULL); if (result_len + u2len <= dstsize) @@ -280,7 +280,7 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, Assert(simple == 0); for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++) { - pg_wchar u2 = special[i]; + char32_t u2 = special[i]; size_t u2len = unicode_utf8len(u2); if (result_len + u2len <= dstsize) @@ -320,7 +320,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset) { if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0) { - pg_wchar curr = utf8_to_unicode(str + i); + char32_t curr = utf8_to_unicode(str + i); if (pg_u_prop_case_ignorable(curr)) continue; @@ -344,7 +344,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset) { if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0) { - pg_wchar curr = utf8_to_unicode(str + i); + char32_t curr = utf8_to_unicode(str + i); if (pg_u_prop_case_ignorable(curr)) continue; @@ -394,9 +394,9 @@ check_special_conditions(int conditions, const char *str, size_t len, * character without modification. 
*/ static enum CaseMapResult -casemap(pg_wchar u1, CaseKind casekind, bool full, +casemap(char32_t u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, - pg_wchar *simple, const pg_wchar **special) + char32_t *simple, const char32_t **special) { uint16 idx; @@ -434,8 +434,8 @@ casemap(pg_wchar u1, CaseKind casekind, bool full, * Find entry in simple case map. * If the entry does not exist, 0 will be returned. */ -static pg_wchar -find_case_map(pg_wchar ucs, const pg_wchar *map) +static char32_t +find_case_map(char32_t ucs, const char32_t *map) { /* Fast path for codepoints < 0x80 */ if (ucs < 0x80) diff --git a/src/common/unicode_category.c b/src/common/unicode_category.c index 4136c4d4f92..aab667a7bb4 100644 --- a/src/common/unicode_category.c +++ b/src/common/unicode_category.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * unicode_category.c * Determine general category and character properties of Unicode - * characters. Encoding must be UTF8, where we assume that the pg_wchar + * characters. Encoding must be UTF8, where we assume that the char32_t * representation is a code point. * * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group @@ -76,13 +76,13 @@ #define PG_U_CHARACTER_TAB 0x09 static bool range_search(const pg_unicode_range *tbl, size_t size, - pg_wchar code); + char32_t code); /* * Unicode general category for the given codepoint. 
*/ pg_unicode_category -unicode_category(pg_wchar code) +unicode_category(char32_t code) { int min = 0; int mid; @@ -108,7 +108,7 @@ unicode_category(pg_wchar code) } bool -pg_u_prop_alphabetic(pg_wchar code) +pg_u_prop_alphabetic(char32_t code) { if (code < 0x80) return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC; @@ -119,7 +119,7 @@ pg_u_prop_alphabetic(pg_wchar code) } bool -pg_u_prop_lowercase(pg_wchar code) +pg_u_prop_lowercase(char32_t code) { if (code < 0x80) return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE; @@ -130,7 +130,7 @@ pg_u_prop_lowercase(pg_wchar code) } bool -pg_u_prop_uppercase(pg_wchar code) +pg_u_prop_uppercase(char32_t code) { if (code < 0x80) return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE; @@ -141,7 +141,7 @@ pg_u_prop_uppercase(pg_wchar code) } bool -pg_u_prop_cased(pg_wchar code) +pg_u_prop_cased(char32_t code) { uint32 category_mask; @@ -156,7 +156,7 @@ pg_u_prop_cased(pg_wchar code) } bool -pg_u_prop_case_ignorable(pg_wchar code) +pg_u_prop_case_ignorable(char32_t code) { if (code < 0x80) return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE; @@ -167,7 +167,7 @@ pg_u_prop_case_ignorable(pg_wchar code) } bool -pg_u_prop_white_space(pg_wchar code) +pg_u_prop_white_space(char32_t code) { if (code < 0x80) return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE; @@ -178,7 +178,7 @@ pg_u_prop_white_space(pg_wchar code) } bool -pg_u_prop_hex_digit(pg_wchar code) +pg_u_prop_hex_digit(char32_t code) { if (code < 0x80) return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT; @@ -189,7 +189,7 @@ pg_u_prop_hex_digit(pg_wchar code) } bool -pg_u_prop_join_control(pg_wchar code) +pg_u_prop_join_control(char32_t code) { if (code < 0x80) return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL; @@ -208,7 +208,7 @@ pg_u_prop_join_control(pg_wchar code) */ bool -pg_u_isdigit(pg_wchar code, bool posix) +pg_u_isdigit(char32_t code, bool posix) { if (posix) return ('0' <= 
code && code <= '9'); @@ -217,19 +217,19 @@ pg_u_isdigit(pg_wchar code, bool posix) } bool -pg_u_isalpha(pg_wchar code) +pg_u_isalpha(char32_t code) { return pg_u_prop_alphabetic(code); } bool -pg_u_isalnum(pg_wchar code, bool posix) +pg_u_isalnum(char32_t code, bool posix) { return pg_u_isalpha(code) || pg_u_isdigit(code, posix); } bool -pg_u_isword(pg_wchar code) +pg_u_isword(char32_t code) { uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code)); @@ -240,32 +240,32 @@ pg_u_isword(pg_wchar code) } bool -pg_u_isupper(pg_wchar code) +pg_u_isupper(char32_t code) { return pg_u_prop_uppercase(code); } bool -pg_u_islower(pg_wchar code) +pg_u_islower(char32_t code) { return pg_u_prop_lowercase(code); } bool -pg_u_isblank(pg_wchar code) +pg_u_isblank(char32_t code) { return code == PG_U_CHARACTER_TAB || unicode_category(code) == PG_U_SPACE_SEPARATOR; } bool -pg_u_iscntrl(pg_wchar code) +pg_u_iscntrl(char32_t code) { return unicode_category(code) == PG_U_CONTROL; } bool -pg_u_isgraph(pg_wchar code) +pg_u_isgraph(char32_t code) { uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code)); @@ -276,7 +276,7 @@ pg_u_isgraph(pg_wchar code) } bool -pg_u_isprint(pg_wchar code) +pg_u_isprint(char32_t code) { pg_unicode_category category = unicode_category(code); @@ -287,7 +287,7 @@ pg_u_isprint(pg_wchar code) } bool -pg_u_ispunct(pg_wchar code, bool posix) +pg_u_ispunct(char32_t code, bool posix) { uint32 category_mask; @@ -308,13 +308,13 @@ pg_u_ispunct(pg_wchar code, bool posix) } bool -pg_u_isspace(pg_wchar code) +pg_u_isspace(char32_t code) { return pg_u_prop_white_space(code); } bool -pg_u_isxdigit(pg_wchar code, bool posix) +pg_u_isxdigit(char32_t code, bool posix) { if (posix) return (('0' <= code && code <= '9') || @@ -478,7 +478,7 @@ unicode_category_abbrev(pg_unicode_category category) * given table. 
*/ static bool -range_search(const pg_unicode_range *tbl, size_t size, pg_wchar code) +range_search(const pg_unicode_range *tbl, size_t size, char32_t code) { int min = 0; int mid; diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c index 6654b4cbc49..489d99cd5ab 100644 --- a/src/common/unicode_norm.c +++ b/src/common/unicode_norm.c @@ -69,7 +69,7 @@ conv_compare(const void *p1, const void *p2) * lookup, while the frontend version uses a binary search. */ static const pg_unicode_decomposition * -get_code_entry(pg_wchar code) +get_code_entry(char32_t code) { #ifndef FRONTEND int h; @@ -109,7 +109,7 @@ get_code_entry(pg_wchar code) * Get the combining class of the given codepoint. */ static uint8 -get_canonical_class(pg_wchar code) +get_canonical_class(char32_t code) { const pg_unicode_decomposition *entry = get_code_entry(code); @@ -130,15 +130,15 @@ get_canonical_class(pg_wchar code) * Note: the returned pointer can point to statically allocated buffer, and * is only valid until next call to this function! */ -static const pg_wchar * +static const char32_t * get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size) { - static pg_wchar x; + static char32_t x; if (DECOMPOSITION_IS_INLINE(entry)) { Assert(DECOMPOSITION_SIZE(entry) == 1); - x = (pg_wchar) entry->dec_index; + x = (char32_t) entry->dec_index; *dec_size = 1; return &x; } @@ -156,7 +156,7 @@ get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size) * are, in turn, decomposable. */ static int -get_decomposed_size(pg_wchar code, bool compat) +get_decomposed_size(char32_t code, bool compat) { const pg_unicode_decomposition *entry; int size = 0; @@ -318,7 +318,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result) * in the array result. 
*/ static void -decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current) +decompose_code(char32_t code, bool compat, char32_t **result, int *current) { const pg_unicode_decomposition *entry; int i; @@ -337,7 +337,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current) v, tindex, sindex; - pg_wchar *res = *result; + char32_t *res = *result; sindex = code - SBASE; l = LBASE + sindex / (VCOUNT * TCOUNT); @@ -369,7 +369,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current) if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 || (!compat && DECOMPOSITION_IS_COMPAT(entry))) { - pg_wchar *res = *result; + char32_t *res = *result; res[*current] = code; (*current)++; @@ -382,7 +382,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current) decomp = get_code_decomposition(entry, &dec_size); for (i = 0; i < dec_size; i++) { - pg_wchar lcode = (pg_wchar) decomp[i]; + char32_t lcode = (char32_t) decomp[i]; /* Leave if no more decompositions */ decompose_code(lcode, compat, result, current); @@ -398,17 +398,17 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current) * malloc. Or NULL if we run out of memory. In backend, the returned * string is palloc'd instead, and OOM is reported with ereport(). 
*/ -pg_wchar * -unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) +char32_t * +unicode_normalize(UnicodeNormalizationForm form, const char32_t *input) { bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD); bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC); - pg_wchar *decomp_chars; - pg_wchar *recomp_chars; + char32_t *decomp_chars; + char32_t *recomp_chars; int decomp_size, current_size; int count; - const pg_wchar *p; + const char32_t *p; /* variables for recomposition */ int last_class; @@ -425,7 +425,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) for (p = input; *p; p++) decomp_size += get_decomposed_size(*p, compat); - decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar)); + decomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t)); if (decomp_chars == NULL) return NULL; @@ -448,9 +448,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) */ for (count = 1; count < decomp_size; count++) { - pg_wchar prev = decomp_chars[count - 1]; - pg_wchar next = decomp_chars[count]; - pg_wchar tmp; + char32_t prev = decomp_chars[count - 1]; + char32_t next = decomp_chars[count]; + char32_t tmp; const uint8 prevClass = get_canonical_class(prev); const uint8 nextClass = get_canonical_class(next); @@ -487,7 +487,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) * longer than the decomposed one, so make the allocation of the output * string based on that assumption. 
*/ - recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar)); + recomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t)); if (!recomp_chars) { FREE(decomp_chars); @@ -501,9 +501,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) for (count = 1; count < decomp_size; count++) { - pg_wchar ch = decomp_chars[count]; + char32_t ch = decomp_chars[count]; int ch_class = get_canonical_class(ch); - pg_wchar composite; + char32_t composite; if (last_class < ch_class && recompose_code(starter_ch, ch, &composite)) @@ -524,7 +524,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) recomp_chars[target_pos++] = ch; } } - recomp_chars[target_pos] = (pg_wchar) '\0'; + recomp_chars[target_pos] = (char32_t) '\0'; FREE(decomp_chars); @@ -540,7 +540,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) #ifndef FRONTEND static const pg_unicode_normprops * -qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo) +qc_hash_lookup(char32_t ch, const pg_unicode_norminfo *norminfo) { int h; uint32 hashkey; @@ -571,7 +571,7 @@ qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo) * Look up the normalization quick check character property */ static UnicodeNormalizationQC -qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch) +qc_is_allowed(UnicodeNormalizationForm form, char32_t ch) { const pg_unicode_normprops *found = NULL; @@ -595,7 +595,7 @@ qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch) } UnicodeNormalizationQC -unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input) +unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input) { uint8 lastCanonicalClass = 0; UnicodeNormalizationQC result = UNICODE_NORM_QC_YES; @@ -610,9 +610,9 @@ unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar * if (form == UNICODE_NFD || form == UNICODE_NFKD) return UNICODE_NORM_QC_MAYBE; - for 
(const pg_wchar *p = input; *p; p++) + for (const char32_t *p = input; *p; p++) { - pg_wchar ch = *p; + char32_t ch = *p; uint8 canonicalClass; UnicodeNormalizationQC check; diff --git a/src/fe_utils/mbprint.c b/src/fe_utils/mbprint.c index eb3eeee9925..abffdbe18a2 100644 --- a/src/fe_utils/mbprint.c +++ b/src/fe_utils/mbprint.c @@ -49,20 +49,20 @@ pg_get_utf8_id(void) * * No error checks here, c must point to a long-enough string. */ -static pg_wchar +static char32_t utf8_to_unicode(const unsigned char *c) { if ((*c & 0x80) == 0) - return (pg_wchar) c[0]; + return (char32_t) c[0]; else if ((*c & 0xe0) == 0xc0) - return (pg_wchar) (((c[0] & 0x1f) << 6) | + return (char32_t) (((c[0] & 0x1f) << 6) | (c[1] & 0x3f)); else if ((*c & 0xf0) == 0xe0) - return (pg_wchar) (((c[0] & 0x0f) << 12) | + return (char32_t) (((c[0] & 0x0f) << 12) | ((c[1] & 0x3f) << 6) | (c[2] & 0x3f)); else if ((*c & 0xf8) == 0xf0) - return (pg_wchar) (((c[0] & 0x07) << 18) | + return (char32_t) (((c[0] & 0x07) << 18) | ((c[1] & 0x3f) << 12) | ((c[2] & 0x3f) << 6) | (c[3] & 0x3f)); diff --git a/src/include/c.h b/src/include/c.h index f4ec33e9b07..757dfff4782 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -1376,6 +1376,29 @@ typedef intptr_t sigjmp_buf[5]; /* /port compatibility functions */ #include "port.h" +/* + * char16_t and char32_t + * Unicode code points. + * + * uchar.h should always be available in C11, but it's not available on + * Mac. However, these types are keywords in C++11, so when using C++, we + * can't redefine the types. + * + * XXX: when uchar.h is available everywhere, we can remove this check and + * just include uchar.h unconditionally. + * + * XXX: this section is out of place because uchar.h needs to be included + * after port.h, due to an interaction with win32_port.h in some cases. 
+ */ +#ifdef HAVE_UCHAR_H +#include <uchar.h> +#else +#ifndef __cplusplus +typedef uint16_t char16_t; +typedef uint32_t char32_t; +#endif +#endif + /* IWYU pragma: end_exports */ #endif /* C_H */ diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h index 41e2c1f4b33..6bcffd349c2 100644 --- a/src/include/common/unicode_case.h +++ b/src/include/common/unicode_case.h @@ -14,14 +14,12 @@ #ifndef UNICODE_CASE_H #define UNICODE_CASE_H -#include "mb/pg_wchar.h" - typedef size_t (*WordBoundaryNext) (void *wbstate); -pg_wchar unicode_lowercase_simple(pg_wchar code); -pg_wchar unicode_titlecase_simple(pg_wchar code); -pg_wchar unicode_uppercase_simple(pg_wchar code); -pg_wchar unicode_casefold_simple(pg_wchar code); +char32_t unicode_lowercase_simple(char32_t code); +char32_t unicode_titlecase_simple(char32_t code); +char32_t unicode_uppercase_simple(char32_t code); +char32_t unicode_casefold_simple(char32_t code); size_t unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full); size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, diff --git a/src/include/common/unicode_case_table.h b/src/include/common/unicode_case_table.h index d5311786582..0a14fb2d97b 100644 --- a/src/include/common/unicode_case_table.h +++ b/src/include/common/unicode_case_table.h @@ -18,7 +18,6 @@ */ #include "common/unicode_case.h" -#include "mb/pg_wchar.h" /* * The maximum number of codepoints that can result from case mapping @@ -45,7 +44,7 @@ typedef enum typedef struct { int16 conditions; - pg_wchar map[NCaseKind][MAX_CASE_EXPANSION]; + char32_t map[NCaseKind][MAX_CASE_EXPANSION]; } pg_special_case; /* @@ -166,7 +165,7 @@ static const pg_special_case special_case[106] = * The entry case_map_lower[case_index(codepoint)] is the mapping for the * given codepoint. 
*/ -static const pg_wchar case_map_lower[1704] = +static const char32_t case_map_lower[1704] = { 0x000000, /* reserved */ 0x000000, /* U+000000 */ @@ -1879,7 +1878,7 @@ static const pg_wchar case_map_lower[1704] = * The entry case_map_title[case_index(codepoint)] is the mapping for the * given codepoint. */ -static const pg_wchar case_map_title[1704] = +static const char32_t case_map_title[1704] = { 0x000000, /* reserved */ 0x000000, /* U+000000 */ @@ -3592,7 +3591,7 @@ static const pg_wchar case_map_title[1704] = * The entry case_map_upper[case_index(codepoint)] is the mapping for the * given codepoint. */ -static const pg_wchar case_map_upper[1704] = +static const char32_t case_map_upper[1704] = { 0x000000, /* reserved */ 0x000000, /* U+000000 */ @@ -5305,7 +5304,7 @@ static const pg_wchar case_map_upper[1704] = * The entry case_map_fold[case_index(codepoint)] is the mapping for the * given codepoint. */ -static const pg_wchar case_map_fold[1704] = +static const char32_t case_map_fold[1704] = { 0x000000, /* reserved */ 0x000000, /* U+000000 */ @@ -13522,7 +13521,7 @@ static const uint16 case_map[4778] = * the offset into the mapping tables. 
*/ static inline uint16 -case_index(pg_wchar cp) +case_index(char32_t cp) { /* Fast path for codepoints < 0x0588 */ if (cp < 0x0588) diff --git a/src/include/common/unicode_category.h b/src/include/common/unicode_category.h index 8fd8b67a416..684143d3c8a 100644 --- a/src/include/common/unicode_category.h +++ b/src/include/common/unicode_category.h @@ -14,8 +14,6 @@ #ifndef UNICODE_CATEGORY_H #define UNICODE_CATEGORY_H -#include "mb/pg_wchar.h" - /* * Unicode General Category Values * @@ -61,31 +59,31 @@ typedef enum pg_unicode_category PG_U_FINAL_PUNCTUATION = 29 /* Pf */ } pg_unicode_category; -extern pg_unicode_category unicode_category(pg_wchar code); +extern pg_unicode_category unicode_category(char32_t code); extern const char *unicode_category_string(pg_unicode_category category); extern const char *unicode_category_abbrev(pg_unicode_category category); -extern bool pg_u_prop_alphabetic(pg_wchar code); -extern bool pg_u_prop_lowercase(pg_wchar code); -extern bool pg_u_prop_uppercase(pg_wchar code); -extern bool pg_u_prop_cased(pg_wchar code); -extern bool pg_u_prop_case_ignorable(pg_wchar code); -extern bool pg_u_prop_white_space(pg_wchar code); -extern bool pg_u_prop_hex_digit(pg_wchar code); -extern bool pg_u_prop_join_control(pg_wchar code); +extern bool pg_u_prop_alphabetic(char32_t code); +extern bool pg_u_prop_lowercase(char32_t code); +extern bool pg_u_prop_uppercase(char32_t code); +extern bool pg_u_prop_cased(char32_t code); +extern bool pg_u_prop_case_ignorable(char32_t code); +extern bool pg_u_prop_white_space(char32_t code); +extern bool pg_u_prop_hex_digit(char32_t code); +extern bool pg_u_prop_join_control(char32_t code); -extern bool pg_u_isdigit(pg_wchar code, bool posix); -extern bool pg_u_isalpha(pg_wchar code); -extern bool pg_u_isalnum(pg_wchar code, bool posix); -extern bool pg_u_isword(pg_wchar code); -extern bool pg_u_isupper(pg_wchar code); -extern bool pg_u_islower(pg_wchar code); -extern bool pg_u_isblank(pg_wchar code); -extern bool 
pg_u_iscntrl(pg_wchar code); -extern bool pg_u_isgraph(pg_wchar code); -extern bool pg_u_isprint(pg_wchar code); -extern bool pg_u_ispunct(pg_wchar code, bool posix); -extern bool pg_u_isspace(pg_wchar code); -extern bool pg_u_isxdigit(pg_wchar code, bool posix); +extern bool pg_u_isdigit(char32_t code, bool posix); +extern bool pg_u_isalpha(char32_t code); +extern bool pg_u_isalnum(char32_t code, bool posix); +extern bool pg_u_isword(char32_t code); +extern bool pg_u_isupper(char32_t code); +extern bool pg_u_islower(char32_t code); +extern bool pg_u_isblank(char32_t code); +extern bool pg_u_iscntrl(char32_t code); +extern bool pg_u_isgraph(char32_t code); +extern bool pg_u_isprint(char32_t code); +extern bool pg_u_ispunct(char32_t code, bool posix); +extern bool pg_u_isspace(char32_t code); +extern bool pg_u_isxdigit(char32_t code, bool posix); #endif /* UNICODE_CATEGORY_H */ diff --git a/src/include/common/unicode_category_table.h b/src/include/common/unicode_category_table.h index 95a1c65da7e..466a41b72b0 100644 --- a/src/include/common/unicode_category_table.h +++ b/src/include/common/unicode_category_table.h @@ -20,15 +20,15 @@ */ typedef struct { - uint32 first; /* Unicode codepoint */ - uint32 last; /* Unicode codepoint */ + char32_t first; /* Unicode codepoint */ + char32_t last; /* Unicode codepoint */ uint8 category; /* General Category */ } pg_category_range; typedef struct { - uint32 first; /* Unicode codepoint */ - uint32 last; /* Unicode codepoint */ + char32_t first; /* Unicode codepoint */ + char32_t last; /* Unicode codepoint */ } pg_unicode_range; typedef struct diff --git a/src/include/common/unicode_norm.h b/src/include/common/unicode_norm.h index 5bc3b79e78e..516c192cc4c 100644 --- a/src/include/common/unicode_norm.h +++ b/src/include/common/unicode_norm.h @@ -14,8 +14,6 @@ #ifndef UNICODE_NORM_H #define UNICODE_NORM_H -#include "mb/pg_wchar.h" - typedef enum { UNICODE_NFC = 0, @@ -32,8 +30,8 @@ typedef enum UNICODE_NORM_QC_MAYBE = -1, } 
UnicodeNormalizationQC; -extern pg_wchar *unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input); +extern char32_t *unicode_normalize(UnicodeNormalizationForm form, const char32_t *input); -extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input); +extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input); #endif /* UNICODE_NORM_H */ diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 4b4a9974b75..4d84bdc81e4 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -532,25 +532,25 @@ typedef uint32 (*utf_local_conversion_func) (uint32 code); * Some handy functions for Unicode-specific tests. */ static inline bool -is_valid_unicode_codepoint(pg_wchar c) +is_valid_unicode_codepoint(char32_t c) { return (c > 0 && c <= 0x10FFFF); } static inline bool -is_utf16_surrogate_first(pg_wchar c) +is_utf16_surrogate_first(char32_t c) { return (c >= 0xD800 && c <= 0xDBFF); } static inline bool -is_utf16_surrogate_second(pg_wchar c) +is_utf16_surrogate_second(char32_t c) { return (c >= 0xDC00 && c <= 0xDFFF); } -static inline pg_wchar -surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) +static inline char32_t +surrogate_pair_to_codepoint(char16_t first, char16_t second) { return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF); } @@ -561,20 +561,20 @@ surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) * * No error checks here, c must point to a long-enough string. 
*/ -static inline pg_wchar +static inline char32_t utf8_to_unicode(const unsigned char *c) { if ((*c & 0x80) == 0) - return (pg_wchar) c[0]; + return (char32_t) c[0]; else if ((*c & 0xe0) == 0xc0) - return (pg_wchar) (((c[0] & 0x1f) << 6) | + return (char32_t) (((c[0] & 0x1f) << 6) | (c[1] & 0x3f)); else if ((*c & 0xf0) == 0xe0) - return (pg_wchar) (((c[0] & 0x0f) << 12) | + return (char32_t) (((c[0] & 0x0f) << 12) | ((c[1] & 0x3f) << 6) | (c[2] & 0x3f)); else if ((*c & 0xf8) == 0xf0) - return (pg_wchar) (((c[0] & 0x07) << 18) | + return (char32_t) (((c[0] & 0x07) << 18) | ((c[1] & 0x3f) << 12) | ((c[2] & 0x3f) << 6) | (c[3] & 0x3f)); @@ -588,7 +588,7 @@ utf8_to_unicode(const unsigned char *c) * unicode_utf8len(c) bytes available. */ static inline unsigned char * -unicode_to_utf8(pg_wchar c, unsigned char *utf8string) +unicode_to_utf8(char32_t c, unsigned char *utf8string) { if (c <= 0x7F) { @@ -620,7 +620,7 @@ unicode_to_utf8(pg_wchar c, unsigned char *utf8string) * Number of bytes needed to represent the given char in UTF8. 
*/ static inline int -unicode_utf8len(pg_wchar c) +unicode_utf8len(char32_t c) { if (c <= 0x7F) return 1; @@ -676,8 +676,8 @@ extern int pg_valid_server_encoding(const char *name); extern bool is_encoding_supported_by_icu(int encoding); extern const char *get_encoding_name_for_icu(int encoding); -extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string); -extern pg_wchar utf8_to_unicode(const unsigned char *c); +extern unsigned char *unicode_to_utf8(char32_t c, unsigned char *utf8string); +extern char32_t utf8_to_unicode(const unsigned char *c); extern bool pg_utf8_islegal(const unsigned char *source, int length); extern int pg_utf_mblen(const unsigned char *s); extern int pg_mule_mblen(const unsigned char *s); @@ -739,8 +739,8 @@ extern char *pg_server_to_client(const char *s, int len); extern char *pg_any_to_server(const char *s, int len, int encoding); extern char *pg_server_to_any(const char *s, int len, int encoding); -extern void pg_unicode_to_server(pg_wchar c, unsigned char *s); -extern bool pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s); +extern void pg_unicode_to_server(char32_t c, unsigned char *s); +extern bool pg_unicode_to_server_noerror(char32_t c, unsigned char *s); extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc); extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc); diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 08d7bfbee10..f52f14cc566 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -463,6 +463,9 @@ /* Define to 1 if you have the <termios.h> header file. */ #undef HAVE_TERMIOS_H +/* Define to 1 if you have the <uchar.h> header file. */ +#undef HAVE_UCHAR_H + /* Define to 1 if curl_global_init() is guaranteed to be thread-safe. 
*/ #undef HAVE_THREADSAFE_CURL_GLOBAL_INIT diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index ac2da4c98cf..df88c78fe3a 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3505,6 +3505,8 @@ cb_cleanup_dir cb_options cb_tablespace cb_tablespace_mapping +char16_t +char32_t check_agg_arguments_context check_function_callback check_network_data -- 2.39.5