Compute CRC32C using AVX-512 instructions where available

author John Naylor <[email protected]>

Sun, 6 Apr 2025 07:04:30 +0000 (14:04 +0700)

committer John Naylor <[email protected]>

Sun, 6 Apr 2025 07:04:30 +0000 (14:04 +0700)
author John Naylor <[email protected]>
Sun, 6 Apr 2025 07:04:30 +0000 (14:04 +0700)
committer John Naylor <[email protected]>
Sun, 6 Apr 2025 07:04:30 +0000 (14:04 +0700)
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4

index e9e54470e667b58ee719de81dd3db82842a73178..5f3e1d1faf93050030a780bb6a9741b64a1abdb8 100644 (file)
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -581,6 +581,43 @@ fi
  undefine([Ac_cachevar])dnl
  ])# PGAC_SSE42_CRC32_INTRINSICS
  
+# PGAC_AVX512_PCLMUL_INTRINSICS
+# ---------------------------
+# Check if the compiler supports AVX-512 carryless multiplication
+# and three-way exclusive-or instructions used for computing CRC.
+# AVX-512F is assumed to be supported if the above are.
+#
+# If the intrinsics are supported, sets pgac_avx512_pclmul_intrinsics.
+AC_DEFUN([PGAC_AVX512_PCLMUL_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_pclmul_intrinsics])])dnl
+AC_CACHE_CHECK([for _mm512_clmulepi64_epi128], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
+    __m512i x;
+    __m512i y;
+
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("vpclmulqdq,avx512vl")))
+    #endif
+    static int avx512_pclmul_test(void)
+    {
+      __m128i z;
+
+      y = _mm512_clmulepi64_epi128(x, y, 0);
+      z = _mm_ternarylogic_epi64(
+                _mm512_castsi512_si128(y),
+                _mm512_extracti32x4_epi32(y, 1),
+                _mm512_extracti32x4_epi32(y, 2),
+                0x96);
+      return _mm_crc32_u64(0, _mm_extract_epi64(z, 0));
+    }],
+  [return avx512_pclmul_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_avx512_pclmul_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX512_PCLMUL_INTRINSICS
  
  # PGAC_ARMV8_CRC32C_INTRINSICS
  # ----------------------------
diff --git a/configure b/configure

index 11615d1122dea7e05cd330d8d767e480a727ad49..8f4a5ab28ec3c9a93938ff0fcd74383c6ffe466b 100755 (executable)
--- a/configure
+++ b/configure
@@ -17864,17 +17864,21 @@ fi
  
  # Select CRC-32C implementation.
  #
-# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
-# use the special CRC instructions for calculating CRC-32C. If we're not
-# targeting such a processor, but we can nevertheless produce code that uses
-# the SSE intrinsics, compile both implementations and select which one to use
-# at runtime, depending on whether SSE 4.2 is supported by the processor we're
-# running on.
+# There are three methods of calculating CRC, in order of increasing
+# performance:
  #
-# Similarly, if we are targeting an ARM processor that has the CRC
-# instructions that are part of the ARMv8 CRC Extension, use them. And if
-# we're not targeting such a processor, but can nevertheless produce code that
-# uses the CRC instructions, compile both, and select at runtime.
+# 1. The fallback using a lookup table, called slicing-by-8
+# 2. CRC-32C instructions (found in e.g. Intel SSE 4.2 and ARMv8 CRC Extension)
+# 3. Algorithms using carryless multiplication instructions
+#    (e.g. Intel PCLMUL and Arm PMULL)
+#
+# If we can produce code (via function attributes or additional compiler
+# flags) that uses #2 (and possibly #3), we compile all implementations
+# and select which one to use at runtime, depending on what is supported
+# by the processor we're running on.
+#
+# If we are targeting a processor that has #2, we can use that without
+# runtime selection.
  #
  # Note that we do not use __attribute__((target("..."))) for the ARM CRC
  # instructions because until clang 16, using the ARM intrinsics still requires
@@ -17925,7 +17929,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then
  
  $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h
  
-  PG_CRC32C_OBJS="pg_crc32c_sse42.o"
+  PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
    { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
  $as_echo "SSE 4.2" >&6; }
  else
@@ -17974,6 +17978,71 @@ $as_echo "slicing-by-8" >&6; }
  fi
  
  
+# Check for carryless multiplication intrinsics to do vectorized CRC calculations.
+#
+if test x"$host_cpu" = x"x86_64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128" >&5
+$as_echo_n "checking for _mm512_clmulepi64_epi128... " >&6; }
+if ${pgac_cv_avx512_pclmul_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+    __m512i x;
+    __m512i y;
+
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("vpclmulqdq,avx512vl")))
+    #endif
+    static int avx512_pclmul_test(void)
+    {
+      __m128i z;
+
+      y = _mm512_clmulepi64_epi128(x, y, 0);
+      z = _mm_ternarylogic_epi64(
+                _mm512_castsi512_si128(y),
+                _mm512_extracti32x4_epi32(y, 1),
+                _mm512_extracti32x4_epi32(y, 2),
+                0x96);
+      return _mm_crc32_u64(0, _mm_extract_epi64(z, 0));
+    }
+int
+main ()
+{
+return avx512_pclmul_test();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_pclmul_intrinsics=yes
+else
+  pgac_cv_avx512_pclmul_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_pclmul_intrinsics" >&5
+$as_echo "$pgac_cv_avx512_pclmul_intrinsics" >&6; }
+if test x"$pgac_cv_avx512_pclmul_intrinsics" = x"yes"; then
+  pgac_avx512_pclmul_intrinsics=yes
+fi
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vectorized CRC-32C" >&5
+$as_echo_n "checking for vectorized CRC-32C... " >&6; }
+if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then
+
+$as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX-512 with runtime check" >&5
+$as_echo "AVX-512 with runtime check" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
+$as_echo "none" >&6; }
+fi
  
  # Select semaphore implementation type.
  if test "$PORTNAME" != "win32"; then
diff --git a/configure.ac b/configure.ac

index debdf1650441b22ccf3c95b29f4f91222bdb83bd..fc5f7475d07b9bdc89a3cdbe944e2d99c82e7a17 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -2116,17 +2116,21 @@ AC_SUBST(CFLAGS_CRC)
  
  # Select CRC-32C implementation.
  #
-# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
-# use the special CRC instructions for calculating CRC-32C. If we're not
-# targeting such a processor, but we can nevertheless produce code that uses
-# the SSE intrinsics, compile both implementations and select which one to use
-# at runtime, depending on whether SSE 4.2 is supported by the processor we're
-# running on.
-#
-# Similarly, if we are targeting an ARM processor that has the CRC
-# instructions that are part of the ARMv8 CRC Extension, use them. And if
-# we're not targeting such a processor, but can nevertheless produce code that
-# uses the CRC instructions, compile both, and select at runtime.
+# There are three methods of calculating CRC, in order of increasing
+# performance:
+#
+# 1. The fallback using a lookup table, called slicing-by-8
+# 2. CRC-32C instructions (found in e.g. Intel SSE 4.2 and ARMv8 CRC Extension)
+# 3. Algorithms using carryless multiplication instructions
+#    (e.g. Intel PCLMUL and Arm PMULL)
+#
+# If we can produce code (via function attributes or additional compiler
+# flags) that uses #2 (and possibly #3), we compile all implementations
+# and select which one to use at runtime, depending on what is supported
+# by the processor we're running on.
+#
+# If we are targeting a processor that has #2, we can use that without
+# runtime selection.
  #
  # Note that we do not use __attribute__((target("..."))) for the ARM CRC
  # instructions because until clang 16, using the ARM intrinsics still requires
@@ -2174,7 +2178,7 @@ fi
  AC_MSG_CHECKING([which CRC-32C implementation to use])
  if test x"$USE_SSE42_CRC32C" = x"1"; then
    AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.])
-  PG_CRC32C_OBJS="pg_crc32c_sse42.o"
+  PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
    AC_MSG_RESULT(SSE 4.2)
  else
    if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
@@ -2207,6 +2211,19 @@ else
  fi
  AC_SUBST(PG_CRC32C_OBJS)
  
+# Check for carryless multiplication intrinsics to do vectorized CRC calculations.
+#
+if test x"$host_cpu" = x"x86_64"; then
+  PGAC_AVX512_PCLMUL_INTRINSICS()
+fi
+
+AC_MSG_CHECKING([for vectorized CRC-32C])
+if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then
+  AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 CRC algorithms with a runtime check.])
+  AC_MSG_RESULT(AVX-512 with runtime check)
+else
+  AC_MSG_RESULT(none)
+fi
  
  # Select semaphore implementation type.
  if test "$PORTNAME" != "win32"; then
diff --git a/meson.build b/meson.build

index 454ed81f5ead9ed49c309631d27e1081e30031cf..27717ad89767041eecc5c0fa529b33aa55dac37d 100644 (file)
--- a/meson.build
+++ b/meson.build
@@ -2349,17 +2349,21 @@ endif
  ###############################################################
  # Select CRC-32C implementation.
  #
-# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
-# use the special CRC instructions for calculating CRC-32C. If we're not
-# targeting such a processor, but we can nevertheless produce code that uses
-# the SSE intrinsics, compile both implementations and select which one to use
-# at runtime, depending on whether SSE 4.2 is supported by the processor we're
-# running on.
+# There are three methods of calculating CRC, in order of increasing
+# performance:
  #
-# Similarly, if we are targeting an ARM processor that has the CRC
-# instructions that are part of the ARMv8 CRC Extension, use them. And if
-# we're not targeting such a processor, but can nevertheless produce code that
-# uses the CRC instructions, compile both, and select at runtime.
+# 1. The fallback using a lookup table, called slicing-by-8
+# 2. CRC-32C instructions (found in e.g. Intel SSE 4.2 and ARMv8 CRC Extension)
+# 3. Algorithms using carryless multiplication instructions
+#    (e.g. Intel PCLMUL and Arm PMULL)
+#
+# If we can produce code (via function attributes or additional compiler
+# flags) that uses #2 (and possibly #3), we compile all implementations
+# and select which one to use at runtime, depending on what is supported
+# by the processor we're running on.
+#
+# If we are targeting a processor that has #2, we can use that without
+# runtime selection.
  #
  # Note that we do not use __attribute__((target("..."))) for the ARM CRC
  # instructions because until clang 16, using the ARM intrinsics still requires
@@ -2393,7 +2397,7 @@ int main(void)
  }
  '''
  
-    if not cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32',
+    if not cc.links(prog, name: 'SSE 4.2 CRC32C',
            args: test_c_args)
        # Do not use Intel SSE 4.2
      elif (cc.get_define('__SSE4_2__') != '')
@@ -2408,6 +2412,38 @@ int main(void)
        have_optimized_crc = true
      endif
  
+    # Check if the compiler supports AVX-512 carryless multiplication
+    # and three-way exclusive-or instructions used for computing CRC.
+    # AVX-512F is assumed to be supported if the above are.
+    prog = '''
+#include <immintrin.h>
+__m512i x;
+__m512i y;
+
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("vpclmulqdq,avx512vl")))
+#endif
+int main(void)
+{
+     __m128i z;
+
+    y = _mm512_clmulepi64_epi128(x, y, 0);
+    z = _mm_ternarylogic_epi64(
+            _mm512_castsi512_si128(y),
+            _mm512_extracti32x4_epi32(y, 1),
+            _mm512_extracti32x4_epi32(y, 2),
+            0x96);
+    /* return computed value, to prevent the above being optimized away */
+    return _mm_crc32_u64(0, _mm_extract_epi64(z, 0));
+}
+'''
+
+    if cc.links(prog,
+        name: 'AVX-512 CRC32C',
+        args: test_c_args)
+      cdata.set('USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 1)
+    endif
+
    endif
  
  elif host_cpu == 'arm' or host_cpu == 'aarch64'
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in

index c2f1241b2342208d19aa66312fbabad5bdee57a0..9891b9b05c30e6ce8b60a5666c04f9464f3ee5e8 100644 (file)
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -665,6 +665,9 @@
  /* Define to 1 to build with assertion checks. (--enable-cassert) */
  #undef USE_ASSERT_CHECKING
  
+/* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */
+#undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+
  /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
  #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
  
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h

index 9376d223feffe3d6ec63e8e1c02c8b8e2875a829..82313bb7fcfee82ad9500a65547de8d3a747e781 100644 (file)
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -42,7 +42,10 @@ typedef uint32 pg_crc32c;
  #define EQ_CRC32C(c1, c2) ((c1) == (c2))
  
  #if defined(USE_SSE42_CRC32C)
-/* Use Intel SSE4.2 instructions. */
+/*
+ * Use either Intel SSE 4.2 or AVX-512 instructions. We don't need a runtime check
+ * for SSE 4.2, so we can inline those in some cases.
+ */
  
  #include <nmmintrin.h>
  
@@ -50,7 +53,11 @@ typedef uint32 pg_crc32c;
     ((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
  #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
  
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
  extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len);
+#endif
  
  /*
   * We can only get here if the host compiler targets SSE 4.2, but on some
@@ -82,9 +89,27 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
         return crc;
     }
     else
-       return pg_comp_crc32c_sse42(crc, data, len);
+       /* Otherwise, use a runtime check for AVX-512 instructions. */
+       return pg_comp_crc32c(crc, data, len);
  }
  
+#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+
+/*
+ * Use Intel SSE 4.2 or AVX-512 instructions, but perform a runtime check first
+ * to check that they are available.
+ */
+#define COMP_CRC32C(crc, data, len) \
+   ((crc) = pg_comp_crc32c((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+extern PGDLLIMPORT pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
+extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len);
+#endif
+
  #elif defined(USE_ARMV8_CRC32C)
  /* Use ARMv8 CRC Extension instructions. */
  
@@ -103,10 +128,10 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le
  
  extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
  
-#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
+#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
  
  /*
- * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first
+ * Use ARMv8 instructions, but perform a runtime check first
   * to check that they are available.
   */
  #define COMP_CRC32C(crc, data, len) \
@@ -115,13 +140,7 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
  
  extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
  extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
-
-#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
-extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
-#endif
-#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK
  extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
-#endif
  
  #else
  /*
diff --git a/src/port/meson.build b/src/port/meson.build

index 51041e756099007213942d3123a58c114b303ef1..48d2dfb7cf3d5f79fb2450cbcf49c3aa046eae96 100644 (file)
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -86,6 +86,7 @@ replace_funcs_pos = [
    # x86/x64
    ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
    ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
+  ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'],
    ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
    ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
  
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c

index 22c2137df31c1a309059aebde9234cf87df1cd20..db60bb3c32c24d354f1508488a519f3a2f66e9e6 100644 (file)
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -1,7 +1,7 @@
  /*-------------------------------------------------------------------------
   *
   * pg_crc32c_sse42.c
- *   Compute CRC-32C checksum using Intel SSE 4.2 instructions.
+ *   Compute CRC-32C checksum using Intel SSE 4.2 or AVX-512 instructions.
   *
   * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
@@ -15,6 +15,9 @@
  #include "c.h"
  
  #include <nmmintrin.h>
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+#include <immintrin.h>
+#endif
  
  #include "port/pg_crc32c.h"
  
@@ -68,3 +71,92 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
  
     return crc;
  }
+
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+
+/*
+ * Note: There is no copyright notice in the following generated code.
+ *
+ * We have modified the output to
+ *   - match our function declaration
+ *   - match whitespace to our project style
+ *   - add a threshold for the alignment stanza
+ */
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */
+/* MIT licensed */
+
+#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0))
+#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17))
+
+pg_attribute_target("vpclmulqdq,avx512vl")
+pg_crc32c
+pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t length)
+{
+   /* adjust names to match generated code */
+   pg_crc32c   crc0 = crc;
+   size_t      len = length;
+   const char *buf = data;
+
+   /* Align on cacheline boundary. The threshold is somewhat arbitrary. */
+   if (unlikely(len > 256))
+   {
+       for (; len && ((uintptr_t) buf & 7); --len)
+           crc0 = _mm_crc32_u8(crc0, *buf++);
+       while (((uintptr_t) buf & 56) && len >= 8)
+       {
+           crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
+           buf += 8;
+           len -= 8;
+       }
+   }
+
+   if (len >= 64)
+   {
+       const char *end = buf + len;
+       const char *limit = buf + len - 64;
+       __m128i     z0;
+
+       /* First vector chunk. */
+       __m512i     x0 = _mm512_loadu_si512((const void *) buf),
+                   y0;
+       __m512i     k;
+
+       k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0));
+       x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0);
+       buf += 64;
+
+       /* Main loop. */
+       while (buf <= limit)
+       {
+           y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+           x0 = _mm512_ternarylogic_epi64(x0, y0,
+                                          _mm512_loadu_si512((const void *) buf),
+                                          0x96);
+           buf += 64;
+       }
+
+       /* Reduce 512 bits to 128 bits. */
+       k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0,
+                             0x3da6d0cb, 0, 0xba4fc28e, 0,
+                             0xf20c0dfe, 0, 0x493c7d27, 0,
+                             0, 0, 0, 0);
+       y0 = clmul_lo(x0, k), k = clmul_hi(x0, k);
+       y0 = _mm512_xor_si512(y0, k);
+       z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0),
+                                   _mm512_extracti32x4_epi32(y0, 1),
+                                   _mm512_extracti32x4_epi32(y0, 2),
+                                   0x96);
+       z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3));
+
+       /* Reduce 128 bits to 32 bits, and multiply by x^32. */
+       crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0));
+       crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1));
+       len = end - buf;
+   }
+
+   return pg_comp_crc32c_sse42(crc0, buf, len);
+}
+
+#endif
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c

index 65dbc4d42497bb0edc6dd721da30f35d8b0ebcb1..74d2421ba2be99cd73883388469e3fad8e5e557b 100644 (file)
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -20,30 +20,37 @@
  
  #include "c.h"
  
-#ifdef HAVE__GET_CPUID
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
  #include <cpuid.h>
  #endif
  
-#ifdef HAVE__CPUID
+#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
  #include <intrin.h>
  #endif
  
+#ifdef HAVE_XSAVE_INTRINSICS
+#include <immintrin.h>
+#endif
+
  #include "port/pg_crc32c.h"
  
+/*
+ * Does XGETBV say the ZMM registers are enabled?
+ *
+ * NB: Caller is responsible for verifying that osxsave is available
+ * before calling this.
+ */
+#ifdef HAVE_XSAVE_INTRINSICS
+pg_attribute_target("xsave")
+#endif
  static bool
-pg_crc32c_sse42_available(void)
+zmm_regs_available(void)
  {
-   unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
-   __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
-   __cpuid(exx, 1);
+#ifdef HAVE_XSAVE_INTRINSICS
+   return (_xgetbv(0) & 0xe6) == 0xe6;
  #else
-#error cpuid instruction not available
+   return false;
  #endif
-
-   return (exx[2] & (1 << 20)) != 0;   /* SSE 4.2 */
  }
  
  /*
@@ -53,10 +60,48 @@ pg_crc32c_sse42_available(void)
  static pg_crc32c
  pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
  {
-   if (pg_crc32c_sse42_available())
+   unsigned int exx[4] = {0, 0, 0, 0};
+
+   /*
+    * Set fallback. We must guard since slicing-by-8 is not visible
+    * everywhere.
+    */
+#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
+   pg_comp_crc32c = pg_comp_crc32c_sb8;
+#endif
+
+#if defined(HAVE__GET_CPUID)
+   __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+   __cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+
+   if ((exx[2] & (1 << 20)) != 0)  /* SSE 4.2 */
+   {
         pg_comp_crc32c = pg_comp_crc32c_sse42;
-   else
-       pg_comp_crc32c = pg_comp_crc32c_sb8;
+
+       if (exx[2] & (1 << 27) &&   /* OSXSAVE */
+           zmm_regs_available())
+       {
+           /* second cpuid call on leaf 7 to check extended AVX-512 support */
+
+           memset(exx, 0, 4 * sizeof(exx[0]));
+
+#if defined(HAVE__GET_CPUID_COUNT)
+           __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+           __cpuidex(exx, 7, 0);
+#endif
+
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+           if (exx[2] & (1 << 10) &&   /* VPCLMULQDQ */
+               exx[1] & (1 << 31)) /* AVX512-VL */
+               pg_comp_crc32c = pg_comp_crc32c_avx512;
+#endif
+       }
+   }
  
     return pg_comp_crc32c(crc, data, len);
  }
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out

index dc485735aa45093ed8b3e658c42a68fc77d192c3..174f0a68331bdf9fd71171db9f9106ac222c06b3 100644 (file)
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -2330,6 +2330,30 @@ SELECT crc32c('The quick brown fox jumps over the lazy dog.');
   419469235
  (1 row)
  
+SELECT crc32c(repeat('A', 127)::bytea);
+  crc32c   
+-----------
+ 291820082
+(1 row)
+
+SELECT crc32c(repeat('A', 128)::bytea);
+  crc32c   
+-----------
+ 816091258
+(1 row)
+
+SELECT crc32c(repeat('A', 129)::bytea);
+   crc32c   
+------------
+ 4213642571
+(1 row)
+
+SELECT crc32c(repeat('A', 800)::bytea);
+   crc32c   
+------------
+ 3134039419
+(1 row)
+
  --
  -- encode/decode
  --
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql

index aeba798dac1fafaa96a6c362cce8c60c7a475dbc..f7b325baadf4d926b5af62edf343a0928f42879c 100644 (file)
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -738,6 +738,11 @@ SELECT crc32('The quick brown fox jumps over the lazy dog.');
  SELECT crc32c('');
  SELECT crc32c('The quick brown fox jumps over the lazy dog.');
  
+SELECT crc32c(repeat('A', 127)::bytea);
+SELECT crc32c(repeat('A', 128)::bytea);
+SELECT crc32c(repeat('A', 129)::bytea);
+SELECT crc32c(repeat('A', 800)::bytea);
+
  --
  -- encode/decode
  --
author	John Naylor <[email protected]>
	Sun, 6 Apr 2025 07:04:30 +0000 (14:04 +0700)
committer	John Naylor <[email protected]>
	Sun, 6 Apr 2025 07:04:30 +0000 (14:04 +0700)
config/c-compiler.m4		patch \| blob \| blame \| history
configure		patch \| blob \| blame \| history
configure.ac		patch \| blob \| blame \| history
meson.build		patch \| blob \| blame \| history
src/include/pg_config.h.in		patch \| blob \| blame \| history
src/include/port/pg_crc32c.h		patch \| blob \| blame \| history
src/port/meson.build		patch \| blob \| blame \| history
src/port/pg_crc32c_sse42.c		patch \| blob \| blame \| history
src/port/pg_crc32c_sse42_choose.c		patch \| blob \| blame \| history
src/test/regress/expected/strings.out		patch \| blob \| blame \| history
src/test/regress/sql/strings.sql		patch \| blob \| blame \| history