[X86][AVX10.2] Support YMM rounding new instructions #101825

Conversation
@llvm/pr-subscribers-clang @llvm/pr-subscribers-clang-codegen

Author: Phoebe Wang (phoebewang)

Changes

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965

Patch is 767.98 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101825.diff

17 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index f028711a807c0..b117c6d6d9340 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1966,6 +1966,126 @@ TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2
TARGET_BUILTIN(__builtin_ia32_vaddpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vaddph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vaddps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcmppd256_round_mask, "UcV4dV4dIiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcmpph256_round_mask, "UsV16xV16xIiUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcmpps256_round_mask, "UcV8fV8fIiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph256_round_mask, "V8xV8iV8xUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtdq2ps256_round_mask, "V8fV8iV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtpd2dq256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph256_round_mask, "V8xV4dV8xUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtpd2ps256_round_mask, "V4fV4dV4fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtpd2qq256_round_mask, "V4LLiV4dV4LLiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtpd2udq256_round_mask, "V4UiV4dV4UiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtpd2uqq256_round_mask, "V4ULLiV4dV4ULLiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtph2dq256_round_mask, "V8iV8xV8iUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtph2pd256_round_mask, "V4dV8xV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtph2psx256_round_mask, "V8fV8xV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtph2qq256_round_mask, "V4LLiV8xV4LLiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtph2udq256_round_mask, "V8UiV8xV8UiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq256_round_mask, "V4ULLiV8xV4ULLiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtph2uw256_round_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtph2w256_round_mask, "V16sV16xV16sUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtps2dq256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtps2pd256_round_mask, "V4dV4fV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtps2phx256_round_mask, "V8xV8fV8xUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtps2qq256_round_mask, "V4LLiV4fV4LLiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtps2udq256_round_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtps2uqq256_round_mask, "V4ULLiV4fV4ULLiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtqq2pd256_round_mask, "V4dV4LLiV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph256_round_mask, "V8xV4LLiV8xUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtqq2ps256_round_mask, "V4fV4LLiV4fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2dq256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2qq256_round_mask, "V4LLiV4dV4LLiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2udq256_round_mask, "V4UiV4dV4UiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqq256_round_mask, "V4ULLiV4dV4ULLiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttph2dq256_round_mask, "V8iV8xV8iUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttph2qq256_round_mask, "V4LLiV8xV4LLiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttph2udq256_round_mask, "V8UiV8xV8UiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq256_round_mask, "V4ULLiV8xV4ULLiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttph2uw256_round_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttph2w256_round_mask, "V16sV16xV16sUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2dq256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2qq256_round_mask, "V4LLiV4fV4LLiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2udq256_round_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2uqq256_round_mask, "V4ULLiV4fV4ULLiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph256_round_mask, "V8xV8UiV8xUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtudq2ps256_round_mask, "V8fV8UiV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtuqq2pd256_round_mask, "V4dV4ULLiV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph256_round_mask, "V8xV4ULLiV8xUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ps256_round_mask, "V4fV4ULLiV4fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph256_round_mask, "V16xV16UsV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtw2ph256_round_mask, "V16xV16sV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vdivpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vdivph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vdivps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfcmulcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfixupimmpd256_round_mask, "V4dV4dV4dV4LLiIiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfixupimmpd256_round_maskz, "V4dV4dV4dV4LLiIiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfixupimmps256_round_mask, "V8fV8fV8fV8iIiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfixupimmps256_round_maskz, "V8fV8fV8fV8iIiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_round_mask, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_round_maskz, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddph256_round_mask, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddph256_round_maskz, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddps256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddps256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_round_mask, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_round_maskz, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256_round_mask, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256_round_maskz, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmsubpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmsubph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmsubps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmsubaddph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmsubaddps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmulcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vgetexppd256_round_mask, "V4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vgetexpph256_round_mask, "V16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vgetexpps256_round_mask, "V8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vgetmantpd256_round_mask, "V4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vgetmantph256_round_mask, "V16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vgetmantps256_round_mask, "V8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vmaxpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vmaxph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vmaxps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vmulpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vmulph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vmulps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrangepd256_round_mask, "V4dV4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrangeps256_round_mask, "V8fV8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vreducepd256_round_mask, "V4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vreduceph256_round_mask, "V16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vreduceps256_round_mask, "V8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrndscalepd256_round_mask, "V4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrndscaleph256_round_mask, "V16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrndscaleps256_round_mask, "V8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vscalefpd256_round_mask, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vscalefph256_round_mask, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vscalefps256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vsqrtpd256_round, "V4dV4dIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vsqrtph256_round, "V16xV16xIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vsqrtps256_round, "V8fV8fIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vsubpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vsubph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vsubps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
// AVX-VNNI-INT16
TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 13caab6c42111..51d1162c6e403 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -13938,6 +13938,54 @@ static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E,
case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
IID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512;
break;
+ case clang::X86::BI__builtin_ia32_vfmsubph256_round_mask3:
+ Subtract = true;
+ LLVM_FALLTHROUGH;
+ case clang::X86::BI__builtin_ia32_vfmaddph256_round_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddph256_round_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddph256_round_mask3:
+ IID = llvm::Intrinsic::x86_avx10_vfmaddph256;
+ break;
+ case clang::X86::BI__builtin_ia32_vfmsubaddph256_round_mask3:
+ Subtract = true;
+ LLVM_FALLTHROUGH;
+ case clang::X86::BI__builtin_ia32_vfmaddsubph256_round_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddsubph256_round_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddsubph256_round_mask3:
+ IID = llvm::Intrinsic::x86_avx10_vfmaddsubph256;
+ break;
+ case clang::X86::BI__builtin_ia32_vfmsubps256_round_mask3:
+ Subtract = true;
+ LLVM_FALLTHROUGH;
+ case clang::X86::BI__builtin_ia32_vfmaddps256_round_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddps256_round_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddps256_round_mask3:
+ IID = llvm::Intrinsic::x86_avx10_vfmaddps256;
+ break;
+ case clang::X86::BI__builtin_ia32_vfmsubpd256_round_mask3:
+ Subtract = true;
+ LLVM_FALLTHROUGH;
+ case clang::X86::BI__builtin_ia32_vfmaddpd256_round_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddpd256_round_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddpd256_round_mask3:
+ IID = llvm::Intrinsic::x86_avx10_vfmaddpd256;
+ break;
+ case clang::X86::BI__builtin_ia32_vfmsubaddps256_round_mask3:
+ Subtract = true;
+ LLVM_FALLTHROUGH;
+ case clang::X86::BI__builtin_ia32_vfmaddsubps256_round_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddsubps256_round_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddsubps256_round_mask3:
+ IID = llvm::Intrinsic::x86_avx10_vfmaddsubps256;
+ break;
+ case clang::X86::BI__builtin_ia32_vfmsubaddpd256_round_mask3:
+ Subtract = true;
+ LLVM_FALLTHROUGH;
+ case clang::X86::BI__builtin_ia32_vfmaddsubpd256_round_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddsubpd256_round_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddsubpd256_round_mask3:
+ IID = llvm::Intrinsic::x86_avx10_vfmaddsubpd256;
+ break;
}
Value *A = Ops[0];
@@ -13977,6 +14025,12 @@ static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E,
case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddph256_round_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddps256_round_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddpd256_round_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddsubph256_round_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddsubps256_round_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddsubpd256_round_mask:
MaskFalseVal = Ops[0];
break;
case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
@@ -13985,6 +14039,12 @@ static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E,
case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddph256_round_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddps256_round_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddpd256_round_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddsubph256_round_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddsubps256_round_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddsubpd256_round_maskz:
MaskFalseVal = Constant::getNullValue(Ops[0]->getType());
break;
case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
@@ -13999,6 +14059,18 @@ static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E,
case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
+ case clang::X86::BI__builtin_ia32_vfmsubph256_round_mask3:
+ case clang::X86::BI__builtin_ia32_vfmaddph256_round_mask3:
+ case clang::X86::BI__builtin_ia32_vfmsubps256_round_mask3:
+ case clang::X86::BI__builtin_ia32_vfmaddps256_round_mask3:
+ case clang::X86::BI__builtin_ia32_vfmsubpd256_round_mask3:
+ case clang::X86::BI__builtin_ia32_vfmaddpd256_round_mask3:
+ case clang::X86::BI__builtin_ia32_vfmsubaddph256_round_mask3:
+ case clang::X86::BI__builtin_ia32_vfmaddsubph256_round_mask3:
+ case clang::X86::BI__builtin_ia32_vfmsubaddps256_round_mask3:
+ case clang::X86::BI__builtin_ia32_vfmaddsubps256_round_mask3:
+ case clang::X86::BI__builtin_ia32_vfmsubaddpd256_round_mask3:
+ case clang::X86::BI__builtin_ia32_vfmaddsubpd256_round_mask3:
MaskFalseVal = Ops[2];
break;
}
@@ -14686,6 +14758,12 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_vcvtw2ph512_mask:
case X86::BI__builtin_ia32_vcvtdq2ph512_mask:
case X86::BI__builtin_ia32_vcvtqq2ph512_mask:
+ case X86::BI__builtin_ia32_vcvtdq2ph256_round_mask:
+ case X86::BI__builtin_ia32_vcvtdq2ps256_round_mask:
+ case X86::BI__builtin_ia32_vcvtqq2pd256_round_mask:
+ case X86::BI__builtin_ia32_vcvtqq2ph256_round_mask:
+ case X86::BI__builtin_ia32_vcvtqq2ps256_round_mask:
+ case X86::BI__builtin_ia32_vcvtw2ph256_round_mask:
return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ true);
case X86::BI__builtin_ia32_cvtudq2ps512_mask:
case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
@@ -14693,6 +14771,12 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_vcvtuw2ph512_mask:
case X86::BI__builtin_ia32_vcvtudq2ph512_mask:
case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
+ case X86::BI__builtin_ia32_vcvtudq2ph256_round_mask:
+ case X86::BI__builtin_ia32_vcvtudq2ps256_round_mask:
+ case X86::BI__builtin_ia32_vcvtuqq2pd256_round_mask:
+ case X86::BI__builtin_ia32_vcvtuqq2ph256_round_mask:
+ case X86::BI__builtin_ia32_vcvtuqq2ps256_round_mask:
+ case X86::BI__builtin_ia32_vcvtuw2ph256_round_mask:
return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false);
case X86::BI__builtin_ia32_vfmaddss3:
@@ -14736,6 +14820,18 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_vfmaddpd512_mask3:
case X86::BI__builtin_ia32_vfmsubpd512_mask3:
case X86::BI__builtin_ia32_vfmsubph512_mask3:
+ case X86::BI__builtin_ia32_vfmaddph256_round_mask:
+ case X86::BI__builtin_ia32_vfmaddph256_round_maskz:
+ case X86::BI__builtin_ia32_vfmaddph256_round_mask3:
+ case X86::BI__builtin_ia32_vfmaddps256_round_mask:
+ case X86::BI__builtin_ia32_vfmaddps256_round_maskz:
+ case X86::BI__builtin_ia32_vfmaddps256_round_mask3:
+ case X86::BI__builtin_ia32_vfmsubps256_round_mask3:
+ case X86::BI__builtin_ia32_vfmaddpd256_round_mask:
+ case X86::BI__builtin_ia32_vfmaddpd256_round_maskz:
+ case X86::BI__builtin_ia32_vfmaddpd256_round_mask3:
+ case X86::BI__builtin_ia32_vfmsubpd256_round_mask3:
+ case X86::BI__builtin_ia32_vfmsubph256_round_mask3:
return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ false);
case X86::BI__builtin_ia32_vfmaddsubph512_...
[truncated]
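The patch is cut off above before reaching the header changes, but as a rough usage sketch of the new embedded-rounding YMM intrinsics: a minimal sketch, assuming a clang build that carries this patch with the avx10.2-256 feature enabled; _mm256_fmadd_round_pd is one of the macros the full patch adds to clang/lib/Headers/avx10_2niintrin.h (also visible in the clang-format diff further down).

#include <immintrin.h>

// Fused multiply-add on four doubles with the rounding mode encoded in the
// instruction itself (round toward +infinity, exceptions suppressed) instead
// of taking it from MXCSR. The rounding argument must be a compile-time
// constant.
__m256d fma_round_up(__m256d a, __m256d b, __m256d c) {
  return _mm256_fmadd_round_pd(a, b, c,
                               _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
}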
You can test this locally with the following command:

git-clang-format --diff 79f7630e28589364ccf989a4a838f5dd74ce260a 520379066196ffc922c7d571399052a2eb7f869c --extensions cpp,c,h -- clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Headers/avx10_2niintrin.h clang/lib/Sema/SemaX86.cpp clang/test/CodeGen/X86/avx10_2ni-builtins.c llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp llvm/lib/Target/X86/X86InstrFMA3Info.cpp llvm/lib/Target/X86/X86IntrinsicsInfo.h llvm/utils/TableGen/X86DisassemblerTables.cpp

View the diff from clang-format here.

diff --git a/clang/lib/Headers/avx10_2niintrin.h b/clang/lib/Headers/avx10_2niintrin.h
index 42b24d2b5b..830028650f 100644
--- a/clang/lib/Headers/avx10_2niintrin.h
+++ b/clang/lib/Headers/avx10_2niintrin.h
@@ -81,7 +81,7 @@
#define _mm256_cmp_round_pd_mask(A, B, P, R) \
((__mmask8)__builtin_ia32_vcmppd256_round_mask( \
- (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(P), (__mmask8)-1, \
+ (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(P), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cmp_round_pd_mask(U, A, B, P, R) \
@@ -91,7 +91,7 @@
#define _mm256_cmp_round_ph_mask(A, B, P, R) \
((__mmask16)__builtin_ia32_vcmpph256_round_mask( \
- (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(P), (__mmask16)-1, \
+ (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(P), (__mmask16) - 1, \
(int)(R)))
#define _mm256_mask_cmp_round_ph_mask(U, A, B, P, R) \
@@ -101,7 +101,7 @@
#define _mm256_cmp_round_ps_mask(A, B, P, R) \
((__mmask8)__builtin_ia32_vcmpps256_round_mask( \
- (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(P), (__mmask8)-1, \
+ (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(P), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cmp_round_ps_mask(U, A, B, P, R) \
@@ -124,7 +124,7 @@
#define _mm256_cvt_roundepi32_ps(A, R) \
((__m256)__builtin_ia32_vcvtdq2ps256_round_mask((__v8si)(__m256i)(A), \
(__v8sf)_mm256_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_cvt_roundepi32_ps(W, U, A, R) \
((__m256)__builtin_ia32_vcvtdq2ps256_round_mask( \
@@ -137,7 +137,7 @@
#define _mm256_cvt_roundpd_epi32(A, R) \
((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \
- (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1, \
+ (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvt_roundpd_epi32(W, U, A, R) \
@@ -162,8 +162,9 @@
(__v4df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
#define _mm256_cvt_roundpd_ps(A, R) \
- ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \
- (__v4df)(__m256d)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))
+ ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask((__v4df)(__m256d)(A), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_cvt_roundpd_ps(W, U, A, R) \
((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \
@@ -176,7 +177,7 @@
#define _mm256_cvt_roundpd_epi64(A, R) \
((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \
- (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
+ (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvt_roundpd_epi64(W, U, A, R) \
@@ -190,7 +191,7 @@
#define _mm256_cvt_roundpd_epu32(A, R) \
((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \
- (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1, \
+ (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvt_roundpd_epu32(W, U, A, R) \
@@ -204,7 +205,7 @@
#define _mm256_cvt_roundpd_epu64(A, R) \
((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \
- (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
+ (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvt_roundpd_epu64(W, U, A, R) \
@@ -322,7 +323,7 @@
#define _mm256_cvt_roundps_epi32(A, R) \
((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \
- (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \
+ (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvt_roundps_epi32(W, U, A, R) \
@@ -336,7 +337,7 @@
#define _mm256_cvt_roundps_pd(A, R) \
((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \
- (__v4sf)(__m128)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1, \
+ (__v4sf)(__m128)(A), (__v4df)_mm256_undefined_pd(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvt_roundps_pd(W, U, A, R) \
@@ -351,7 +352,7 @@
#define _mm256_cvt_roundps_ph(A, I) \
((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
(__v8hi)_mm_undefined_si128(), \
- (__mmask8)-1))
+ (__mmask8) - 1))
/* FIXME: We may use these way in future.
#define _mm256_cvt_roundps_ph(A, I) \
@@ -380,7 +381,7 @@
#define _mm256_cvt_roundps_epi64(A, R) \
((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \
- (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
+ (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvt_roundps_epi64(W, U, A, R) \
@@ -394,7 +395,7 @@
#define _mm256_cvt_roundps_epu32(A, R) \
((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \
- (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
+ (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvt_roundps_epu32(W, U, A, R) \
@@ -408,7 +409,7 @@
#define _mm256_cvt_roundps_epu64(A, R) \
((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \
- (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
+ (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvt_roundps_epu64(W, U, A, R) \
@@ -422,7 +423,7 @@
#define _mm256_cvt_roundepi64_pd(A, R) \
((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \
- (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1, \
+ (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvt_roundepi64_pd(W, U, A, R) \
@@ -447,8 +448,9 @@
(__v4di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
#define _mm256_cvt_roundepi64_ps(A, R) \
- ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \
- (__v4di)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))
+ ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask((__v4di)(__m256i)(A), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_cvt_roundepi64_ps(W, U, A, R) \
((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \
@@ -461,7 +463,7 @@
#define _mm256_cvtt_roundpd_epi32(A, R) \
((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \
- (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1, \
+ (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvtt_roundpd_epi32(W, U, A, R) \
@@ -475,7 +477,7 @@
#define _mm256_cvtt_roundpd_epi64(A, R) \
((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \
- (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
+ (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvtt_roundpd_epi64(W, U, A, R) \
@@ -489,7 +491,7 @@
#define _mm256_cvtt_roundpd_epu32(A, R) \
((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \
- (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1, \
+ (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvtt_roundpd_epu32(W, U, A, R) \
@@ -503,7 +505,7 @@
#define _mm256_cvtt_roundpd_epu64(A, R) \
((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \
- (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
+ (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvtt_roundpd_epu64(W, U, A, R) \
@@ -597,7 +599,7 @@
#define _mm256_cvtt_roundps_epi32(A, R) \
((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \
- (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \
+ (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvtt_roundps_epi32(W, U, A, R) \
@@ -611,7 +613,7 @@
#define _mm256_cvtt_roundps_epi64(A, R) \
((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \
- (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
+ (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvtt_roundps_epi64(W, U, A, R) \
@@ -625,7 +627,7 @@
#define _mm256_cvtt_roundps_epu32(A, R) \
((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \
- (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
+ (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvtt_roundps_epu32(W, U, A, R) \
@@ -639,7 +641,7 @@
#define _mm256_cvtt_roundps_epu64(A, R) \
((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \
- (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
+ (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvtt_roundps_epu64(W, U, A, R) \
@@ -665,7 +667,7 @@
#define _mm256_cvt_roundepu32_ps(A, R) \
((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \
- (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, \
+ (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvt_roundepu32_ps(W, U, A, R) \
@@ -679,7 +681,7 @@
#define _mm256_cvt_roundepu64_pd(A, R) \
((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \
- (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1, \
+ (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_cvt_roundepu64_pd(W, U, A, R) \
@@ -704,8 +706,9 @@
(__v4du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
#define _mm256_cvt_roundepu64_ps(A, R) \
- ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \
- (__v4du)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))
+ ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask((__v4du)(__m256i)(A), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_cvt_roundepu64_ps(W, U, A, R) \
((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \
@@ -787,7 +790,7 @@
#define _mm256_fcmadd_round_pch(A, B, C, R) \
((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \
(__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_fcmadd_round_pch(A, U, B, C, R) \
((__m256h)__builtin_ia32_vfcmaddcph256_round_mask( \
@@ -807,7 +810,7 @@
#define _mm256_cmul_round_pch(A, B, R) \
((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
(__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
- (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R)))
+ (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8) - 1, (int)(R)))
#define _mm256_mask_cmul_round_pch(W, U, A, B, R) \
((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
@@ -822,7 +825,7 @@
#define _mm256_fixupimm_round_pd(A, B, C, imm, R) \
((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \
- (int)(imm), (__mmask8)-1, (int)(R)))
+ (int)(imm), (__mmask8) - 1, (int)(R)))
#define _mm256_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \
@@ -837,7 +840,7 @@
#define _mm256_fixupimm_round_ps(A, B, C, imm, R) \
((__m256)__builtin_ia32_vfixupimmps256_round_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \
- (int)(imm), (__mmask8)-1, (int)(R)))
+ (int)(imm), (__mmask8) - 1, (int)(R)))
#define _mm256_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
((__m256)__builtin_ia32_vfixupimmps256_round_mask( \
@@ -852,7 +855,7 @@
#define _mm256_fmadd_round_pd(A, B, C, R) \
((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_fmadd_round_pd(A, U, B, C, R) \
((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
@@ -872,7 +875,7 @@
#define _mm256_fmsub_round_pd(A, B, C, R) \
((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_fmsub_round_pd(A, U, B, C, R) \
((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
@@ -887,7 +890,7 @@
#define _mm256_fnmadd_round_pd(A, B, C, R) \
((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
-(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask3_fnmadd_round_pd(A, B, C, U, R) \
((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \
@@ -902,7 +905,7 @@
#define _mm256_fnmsub_round_pd(A, B, C, R) \
((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
-(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_maskz_fnmsub_round_pd(U, A, B, C, R) \
((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
@@ -912,7 +915,7 @@
#define _mm256_fmadd_round_ph(A, B, C, R) \
((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
- (__mmask16)-1, (int)(R)))
+ (__mmask16) - 1, (int)(R)))
#define _mm256_mask_fmadd_round_ph(A, U, B, C, R) \
((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
@@ -932,7 +935,7 @@
#define _mm256_fmsub_round_ph(A, B, C, R) \
((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
- (__mmask16)-1, (int)(R)))
+ (__mmask16) - 1, (int)(R)))
#define _mm256_mask_fmsub_round_ph(A, U, B, C, R) \
((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
@@ -947,7 +950,7 @@
#define _mm256_fnmadd_round_ph(A, B, C, R) \
((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
(__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
- (__mmask16)-1, (int)(R)))
+ (__mmask16) - 1, (int)(R)))
#define _mm256_mask3_fnmadd_round_ph(A, B, C, U, R) \
((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \
@@ -962,7 +965,7 @@
#define _mm256_fnmsub_round_ph(A, B, C, R) \
((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
(__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
- (__mmask16)-1, (int)(R)))
+ (__mmask16) - 1, (int)(R)))
#define _mm256_maskz_fnmsub_round_ph(U, A, B, C, R) \
((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
@@ -972,7 +975,7 @@
#define _mm256_fmadd_round_ps(A, B, C, R) \
((__m256)__builtin_ia32_vfmaddps256_round_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_fmadd_round_ps(A, U, B, C, R) \
((__m256)__builtin_ia32_vfmaddps256_round_mask( \
@@ -992,7 +995,7 @@
#define _mm256_fmsub_round_ps(A, B, C, R) \
((__m256)__builtin_ia32_vfmaddps256_round_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_fmsub_round_ps(A, U, B, C, R) \
((__m256)__builtin_ia32_vfmaddps256_round_mask( \
@@ -1007,7 +1010,7 @@
#define _mm256_fnmadd_round_ps(A, B, C, R) \
((__m256)__builtin_ia32_vfmaddps256_round_mask( \
(__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask3_fnmadd_round_ps(A, B, C, U, R) \
((__m256)__builtin_ia32_vfmaddps256_round_mask3( \
@@ -1022,7 +1025,7 @@
#define _mm256_fnmsub_round_ps(A, B, C, R) \
((__m256)__builtin_ia32_vfmaddps256_round_mask( \
(__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_maskz_fnmsub_round_ps(U, A, B, C, R) \
((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
@@ -1032,7 +1035,7 @@
#define _mm256_fmadd_round_pch(A, B, C, R) \
((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \
(__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_fmadd_round_pch(A, U, B, C, R) \
((__m256h)__builtin_ia32_vfmaddcph256_round_mask( \
@@ -1052,7 +1055,7 @@
#define _mm256_fmaddsub_round_pd(A, B, C, R) \
((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_fmaddsub_round_pd(A, U, B, C, R) \
((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
@@ -1072,7 +1075,7 @@
#define _mm256_fmsubadd_round_pd(A, B, C, R) \
((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_fmsubadd_round_pd(A, U, B, C, R) \
((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
@@ -1087,7 +1090,7 @@
#define _mm256_fmaddsub_round_ph(A, B, C, R) \
((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
- (__mmask16)-1, (int)(R)))
+ (__mmask16) - 1, (int)(R)))
#define _mm256_mask_fmaddsub_round_ph(A, U, B, C, R) \
((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
@@ -1107,7 +1110,7 @@
#define _mm256_fmsubadd_round_ph(A, B, C, R) \
((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
- (__mmask16)-1, (int)(R)))
+ (__mmask16) - 1, (int)(R)))
#define _mm256_mask_fmsubadd_round_ph(A, U, B, C, R) \
((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
@@ -1122,7 +1125,7 @@
#define _mm256_fmaddsub_round_ps(A, B, C, R) \
((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_fmaddsub_round_ps(A, U, B, C, R) \
((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
@@ -1142,7 +1145,7 @@
#define _mm256_fmsubadd_round_ps(A, B, C, R) \
((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_fmsubadd_round_ps(A, U, B, C, R) \
((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
@@ -1231,7 +1234,7 @@
#define _mm256_mul_round_pch(A, B, R) \
((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
(__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
- (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R)))
+ (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8) - 1, (int)(R)))
#define _mm256_mask_mul_round_pch(W, U, A, B, R) \
((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
@@ -1245,7 +1248,7 @@
#define _mm256_getexp_round_pd(A, R) \
((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
- (__v4df)(__m256d)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1, \
+ (__v4df)(__m256d)(A), (__v4df)_mm256_undefined_pd(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_getexp_round_pd(W, U, A, R) \
@@ -1259,7 +1262,7 @@
#define _mm256_getexp_round_ph(A, R) \
((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
- (__v16hf)(__m256h)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, \
+ (__v16hf)(__m256h)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16) - 1, \
(int)(R)))
#define _mm256_mask_getexp_round_ph(W, U, A, R) \
@@ -1273,7 +1276,7 @@
#define _mm256_getexp_round_ps(A, R) \
((__m256)__builtin_ia32_vgetexpps256_round_mask( \
- (__v8sf)(__m256)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, \
+ (__v8sf)(__m256)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8) - 1, \
(int)(R)))
#define _mm256_mask_getexp_round_ps(W, U, A, R) \
@@ -1288,7 +1291,7 @@
#define _mm256_getmant_round_pd(A, B, C, R) \
((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
(__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), \
- (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))
+ (__v4df)_mm256_undefined_pd(), (__mmask8) - 1, (int)(R)))
#define _mm256_mask_getmant_round_pd(W, U, A, B, C, R) \
((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
@@ -1303,7 +1306,7 @@
#define _mm256_getmant_round_ph(A, B, C, R) \
((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
(__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
- (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))
+ (__v16hf)_mm256_undefined_ph(), (__mmask16) - 1, (int)(R)))
#define _mm256_mask_getmant_round_ph(W, U, A, B, C, R) \
((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
@@ -1318,7 +1321,7 @@
#define _mm256_getmant_round_ps(A, B, C, R) \
((__m256)__builtin_ia32_vgetmantps256_round_mask( \
(__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), \
- (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, (int)(R)))
+ (__v8sf)_mm256_undefined_ps(), (__mmask8) - 1, (int)(R)))
#define _mm256_mask_getmant_round_ps(W, U, A, B, C, R) \
((__m256)__builtin_ia32_vgetmantps256_round_mask( \
@@ -1459,7 +1462,7 @@
#define _mm256_range_round_pd(A, B, C, R) \
((__m256d)__builtin_ia32_vrangepd256_round_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
- (__v4df)_mm256_setzero_pd(), (__mmask8)-1, (int)(R)))
+ (__v4df)_mm256_setzero_pd(), (__mmask8) - 1, (int)(R)))
#define _mm256_mask_range_round_pd(W, U, A, B, C, R) \
((__m256d)__builtin_ia32_vrangepd256_round_mask( \
@@ -1474,7 +1477,7 @@
#define _mm256_range_round_ps(A, B, C, R) \
((__m256)__builtin_ia32_vrangeps256_round_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
- (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, (int)(R)))
+ (__v8sf)_mm256_setzero_ps(), (__mmask8) - 1, (int)(R)))
#define _mm256_mask_range_round_ps(W, U, A, B, C, R) \
((__m256)__builtin_ia32_vrangeps256_round_mask( \
@@ -1489,7 +1492,7 @@
#define _mm256_reduce_round_pd(A, B, R) \
((__m256d)__builtin_ia32_vreducepd256_round_mask( \
(__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_reduce_round_pd(W, U, A, B, R) \
((__m256d)__builtin_ia32_vreducepd256_round_mask( \
@@ -1514,12 +1517,12 @@
#define _mm256_reduce_round_ph(A, imm, R) \
((__m256h)__builtin_ia32_vreduceph256_round_mask( \
(__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(), \
- (__mmask16)-1, (int)(R)))
+ (__mmask16) - 1, (int)(R)))
#define _mm256_reduce_round_ps(A, B, R) \
((__m256)__builtin_ia32_vreduceps256_round_mask( \
(__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_reduce_round_ps(W, U, A, B, R) \
((__m256)__builtin_ia32_vreduceps256_round_mask( \
@@ -1534,7 +1537,7 @@
#define _mm256_roundscale_round_pd(A, imm, R) \
((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
(__v4df)(__m256d)(A), (int)(imm), (__v4df)_mm256_undefined_pd(), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_roundscale_round_pd(A, B, C, imm, R) \
((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
@@ -1549,7 +1552,7 @@
#define _mm256_roundscale_round_ph(A, imm, R) \
((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
(__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(), \
- (__mmask16)-1, (int)(R)))
+ (__mmask16) - 1, (int)(R)))
#define _mm256_mask_roundscale_round_ph(A, B, C, imm, R) \
((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
@@ -1564,7 +1567,7 @@
#define _mm256_roundscale_round_ps(A, imm, R) \
((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
(__v8sf)(__m256)(A), (int)(imm), (__v8sf)_mm256_undefined_ps(), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_roundscale_round_ps(A, B, C, imm, R) \
((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
@@ -1579,7 +1582,7 @@
#define _mm256_scalef_round_pd(A, B, R) \
((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), \
- (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))
+ (__v4df)_mm256_undefined_pd(), (__mmask8) - 1, (int)(R)))
#define _mm256_mask_scalef_round_pd(W, U, A, B, R) \
((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
@@ -1594,7 +1597,7 @@
#define _mm256_scalef_round_ph(A, B, R) \
((__m256h)__builtin_ia32_vscalefph256_round_mask( \
(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), \
- (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))
+ (__v16hf)_mm256_undefined_ph(), (__mmask16) - 1, (int)(R)))
#define _mm256_mask_scalef_round_ph(W, U, A, B, R) \
((__m256h)__builtin_ia32_vscalefph256_round_mask( \
@@ -1609,7 +1612,7 @@
#define _mm256_scalef_round_ps(A, B, R) \
((__m256)__builtin_ia32_vscalefps256_round_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_undefined_ps(), \
- (__mmask8)-1, (int)(R)))
+ (__mmask8) - 1, (int)(R)))
#define _mm256_mask_scalef_round_ps(W, U, A, B, R) \
((__m256)__builtin_ia32_vscalefps256_round_mask( \
The clang-format output seems problematic and isn't compatible with the previous version. It looks like a bug to me.
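For context, the spans clang-format is rewriting are casts of -1 used to build an all-ones mask. A minimal sketch of what the original spelling means, using a plain unsigned char named mask8_t as a hypothetical stand-in for __mmask8:

#include <cstdio>

typedef unsigned char mask8_t; // hypothetical stand-in for __mmask8

int main() {
  // (__mmask8)-1 is a cast applied to -1: it produces the all-ones 8-bit mask
  // 0xff, meaning "no lanes masked off". Whitespace does not change parsing,
  // so clang-format's "(__mmask8) - 1" is the same expression, but it formats
  // the cast as if it were a binary subtraction, which is what reads as a bug.
  mask8_t all_lanes = (mask8_t)-1;
  std::printf("%#x\n", (unsigned)all_lanes); // prints 0xff
  return 0;
}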
assert(sEntryNumber < -1U &&
       "Index into ModRMDecision is too large for uint32_t!");
I suggest not using the tricky -1; it's not robust b/c the suffix U can represent unsigned long long (https://en.cppreference.com/w/cpp/language/integer_literal).
-1U is widely used in the LLVM code base. I think the document means it can be automatically recognized as LLU depending on the range of the constant value, not that it shouldn't be used for small values. See https://godbolt.org/z/9W9srbYYM
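A small sketch of the point under discussion: with the u suffix the literal 1 fits in unsigned int, so -1U keeps the type unsigned int and equals UINT_MAX rather than being widened to unsigned long long (the 32-bit unsigned int and the sample value of sEntryNumber are assumptions for illustration).

#include <cassert>
#include <limits>
#include <type_traits>

// -1U is unary minus applied to the unsigned int literal 1U, so its type is
// unsigned int and its value is UINT_MAX.
static_assert(std::is_same_v<decltype(-1U), unsigned int>,
              "-1U has type unsigned int, not unsigned long long");
static_assert(-1U == std::numeric_limits<unsigned int>::max(),
              "-1U is UINT_MAX");

int main() {
  unsigned long long sEntryNumber = 42; // hypothetical value for illustration
  // In the comparison, -1U is converted to unsigned long long, so the assert
  // checks that the index fits in a 32-bit unsigned value (assuming a typical
  // target where unsigned int is 32 bits).
  assert(sEntryNumber < -1U &&
         "Index into ModRMDecision is too large for uint32_t!");
}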
…1825)" (llvm#132362) This reverts commit 0dba538. YMM rounding was removed from AVX10 whitepaper. Ref: https://cdrdv2.intel.com/v1/dl/getContent/784343 The MINMAX and SATURATING CONVERT instructions will be removed as a follow up.
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965