-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[NVPTX] use untyped loads and stores wherever possible #137698
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NVPTX] use untyped loads and stores wherever possible #137698
Conversation
@llvm/pr-subscribers-debuginfo @llvm/pr-subscribers-backend-nvptx Author: Alex MacLean (AlexMaclean) ChangesIn most cases, the type information attached to load and store instructions is meaningless and inconsistently applied. We can usually use ".b" loads and avoid the complexity of trying to assign the correct type. The one exception is sign-extending loads, which will continue to use ".s" to ensure the sign extension into a larger register is done correctly. Patch is 2.20 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137698.diff 189 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 295ed666a1902..e81448ff227be 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1077,21 +1077,6 @@ pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
}
}
-static int getLdStRegType(EVT VT) {
- if (VT.isFloatingPoint())
- switch (VT.getSimpleVT().SimpleTy) {
- case MVT::f16:
- case MVT::bf16:
- case MVT::v2f16:
- case MVT::v2bf16:
- return NVPTX::PTXLdStInstCode::Untyped;
- default:
- return NVPTX::PTXLdStInstCode::Float;
- }
- else
- return NVPTX::PTXLdStInstCode::Unsigned;
-}
-
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
MemSDNode *LD = cast<MemSDNode>(N);
assert(LD->readMem() && "Expected load");
@@ -1122,24 +1107,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
// type is integer
// Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
MVT SimpleVT = LoadedVT.getSimpleVT();
- MVT ScalarVT = SimpleVT.getScalarType();
// Read at least 8 bits (predicates are stored as 8-bit values)
- unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
- unsigned int FromType;
+ unsigned FromTypeWidth = std::max(8U, (unsigned)SimpleVT.getSizeInBits());
// Vector Setting
- unsigned VecType = NVPTX::PTXLdStInstCode::Scalar;
- if (SimpleVT.isVector()) {
- assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
- "Unexpected vector type");
- // v2f16/v2bf16/v2i16 is loaded using ld.b32
- FromTypeWidth = 32;
- }
-
- if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
- FromType = NVPTX::PTXLdStInstCode::Signed;
- else
- FromType = getLdStRegType(ScalarVT);
+ unsigned int FromType =
+ (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
+ ? NVPTX::PTXLdStInstCode::Signed
+ : NVPTX::PTXLdStInstCode::Untyped;
// Create the machine instruction DAG
SDValue Offset, Base;
@@ -1147,7 +1122,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
SDValue Ops[] = {getI32Imm(Ordering, DL),
getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL),
+ getI32Imm(NVPTX::PTXLdStInstCode::Scalar, DL),
getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL),
Base,
@@ -1214,7 +1189,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
unsigned ExtensionType = N->getConstantOperandVal(N->getNumOperands() - 1);
unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
? NVPTX::PTXLdStInstCode::Signed
- : getLdStRegType(MemVT.getScalarType());
+ : NVPTX::PTXLdStInstCode::Untyped;
unsigned VecType;
unsigned FromTypeWidth;
@@ -1232,8 +1207,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (isSubVectorPackedInI32(EltVT)) {
+ assert(ExtensionType == ISD::NON_EXTLOAD);
EltVT = MVT::i32;
- FromType = NVPTX::PTXLdStInstCode::Untyped;
}
SDValue Offset, Base;
@@ -1434,21 +1409,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
// Vector Setting
- MVT SimpleVT = StoreVT.getSimpleVT();
- unsigned VecType = NVPTX::PTXLdStInstCode::Scalar;
-
- // Type Setting: toType + toTypeWidth
- // - for integer type, always use 'u'
- MVT ScalarVT = SimpleVT.getScalarType();
- unsigned ToTypeWidth = ScalarVT.getSizeInBits();
- if (SimpleVT.isVector()) {
- assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
- "Unexpected vector type");
- // v2x16 is stored using st.b32
- ToTypeWidth = 32;
- }
-
- unsigned int ToType = getLdStRegType(ScalarVT);
+ const unsigned ToTypeWidth = StoreVT.getSimpleVT().getSizeInBits();
// Create the machine instruction DAG
SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
@@ -1460,8 +1421,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
getI32Imm(Ordering, DL),
getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL),
- getI32Imm(ToType, DL),
+ getI32Imm(NVPTX::PTXLdStInstCode::Scalar, DL),
+ getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL),
getI32Imm(ToTypeWidth, DL),
Base,
Offset,
@@ -1507,7 +1468,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits();
- unsigned ToType = getLdStRegType(StoreVT.getSimpleVT().getScalarType());
SmallVector<SDValue, 12> Ops;
SDValue N2;
@@ -1534,7 +1494,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
if (isSubVectorPackedInI32(EltVT)) {
EltVT = MVT::i32;
- ToType = NVPTX::PTXLdStInstCode::Untyped;
}
SDValue Offset, Base;
@@ -1542,8 +1501,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
- getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL), Base, Offset,
- Chain});
+ getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL),
+ getI32Imm(ToTypeWidth, DL), Base, Offset, Chain});
std::optional<unsigned> Opcode;
switch (N->getOpcode()) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 043da14bcb236..21846583a8c04 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2296,11 +2296,11 @@ def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">;
def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">;
-def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">;
-def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">;
-def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
-def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
-def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
+def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".b32">;
+def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".b64">;
+def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".b32">;
+def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".b64">;
+def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".b32">;
defm StoreParamI64 : StoreParamInst<Int64Regs, i64imm, ".b64">;
defm StoreParamI32 : StoreParamInst<Int32Regs, i32imm, ".b32">;
@@ -2319,13 +2319,13 @@ defm StoreParamV4I32 : StoreParamV4Inst<Int32Regs, i32imm, ".b32">;
defm StoreParamV4I16 : StoreParamV4Inst<Int16Regs, i16imm, ".b16">;
defm StoreParamV4I8 : StoreParamV4Inst<Int16Regs, i8imm, ".b8">;
-defm StoreParamF32 : StoreParamInst<Float32Regs, f32imm, ".f32">;
-defm StoreParamF64 : StoreParamInst<Float64Regs, f64imm, ".f64">;
+defm StoreParamF32 : StoreParamInst<Float32Regs, f32imm, ".b32">;
+defm StoreParamF64 : StoreParamInst<Float64Regs, f64imm, ".b64">;
-defm StoreParamV2F32 : StoreParamV2Inst<Float32Regs, f32imm, ".f32">;
-defm StoreParamV2F64 : StoreParamV2Inst<Float64Regs, f64imm, ".f64">;
+defm StoreParamV2F32 : StoreParamV2Inst<Float32Regs, f32imm, ".b32">;
+defm StoreParamV2F64 : StoreParamV2Inst<Float64Regs, f64imm, ".b64">;
-defm StoreParamV4F32 : StoreParamV4Inst<Float32Regs, f32imm, ".f32">;
+defm StoreParamV4F32 : StoreParamV4Inst<Float32Regs, f32imm, ".b32">;
def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
@@ -2341,11 +2341,11 @@ def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">;
def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">;
def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">;
-def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">;
-def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">;
-def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
-def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
-def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
+def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".b64">;
+def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".b32">;
+def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".b64">;
+def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".b32">;
+def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".b32">;
def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 3eedb43e4c81a..4d56cf38531e7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2302,12 +2302,12 @@ class LDU_G<string TyStr, NVPTXRegClass regclass>
"ldu.global." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDU]>;
-def INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8", Int16Regs>;
-def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16", Int16Regs>;
-def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32", Int32Regs>;
-def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64", Int64Regs>;
-def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32", Float32Regs>;
-def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64", Float64Regs>;
+def INT_PTX_LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>;
+def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
+def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
+def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
+def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"b32", Float32Regs>;
+def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"b64", Float64Regs>;
// vector
@@ -2324,19 +2324,19 @@ class VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass>
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
-def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"u8", Int16Regs>;
-def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"u16", Int16Regs>;
-def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"u32", Int32Regs>;
-def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"f32", Float32Regs>;
-def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"u64", Int64Regs>;
-def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"f64", Float64Regs>;
+def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"b8", Int16Regs>;
+def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"b16", Int16Regs>;
+def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"b32", Int32Regs>;
+def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"b32", Float32Regs>;
+def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"b64", Int64Regs>;
+def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"b64", Float64Regs>;
-def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"u8", Int16Regs>;
-def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"u16", Int16Regs>;
-def INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"u32", Int32Regs>;
+def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"b8", Int16Regs>;
+def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
+def INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
def INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
def INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
-def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"f32", Float32Regs>;
+def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>;
//-----------------------------------
@@ -2352,12 +2352,12 @@ class LDG_G<string TyStr, NVPTXRegClass regclass>
"ld.global.nc." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDG]>;
-def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"u8", Int16Regs>;
-def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"u16", Int16Regs>;
-def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"u32", Int32Regs>;
-def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"u64", Int64Regs>;
-def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"f32", Float32Regs>;
-def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"f64", Float64Regs>;
+def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"b8", Int16Regs>;
+def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"b16", Int16Regs>;
+def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"b32", Int32Regs>;
+def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"b64", Int64Regs>;
+def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"b32", Float32Regs>;
+def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"b64", Float64Regs>;
// vector
@@ -2374,17 +2374,17 @@ class VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> :
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
-def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"u8", Int16Regs>;
-def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"u16", Int16Regs>;
-def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"u32", Int32Regs>;
-def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"f32", Float32Regs>;
-def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"u64", Int64Regs>;
-def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"f64", Float64Regs>;
-
-def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"u8", Int16Regs>;
-def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"u16", Int16Regs>;
-def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>;
-def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>;
+def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"b8", Int16Regs>;
+def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"b16", Int16Regs>;
+def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"b32", Int32Regs>;
+def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"b32", Float32Regs>;
+def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"b64", Int64Regs>;
+def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"b64", Float64Regs>;
+
+def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"b8", Int16Regs>;
+def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>;
+def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"b32", Int32Regs>;
+def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>;
multiclass NG_TO_G<string Str, bit Supports32 = 1, list<Predicate> Preds = []> {
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index 8f0964c2d5eba..78b57badc06e8 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -16,8 +16,8 @@ define i32 @f(ptr %p) {
; ENABLED-NEXT: .reg .b64 %rd<2>;
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
-; ENABLED-NEXT: ld.param.u64 %rd1, [f_param_0];
-; ENABLED-NEXT: ld.v2.u32 {%r1, %r2}, [%rd1];
+; ENABLED-NEXT: ld.param.b64 %rd1, [f_param_0];
+; ENABLED-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
; ENABLED-NEXT: add.s32 %r3, %r1, %r2;
; ENABLED-NEXT: st.param.b32 [func_retval0], %r3;
; ENABLED-NEXT: ret;
@@ -28,9 +28,9 @@ define i32 @f(ptr %p) {
; DISABLED-NEXT: .reg .b64 %rd<2>;
; DISABLED-EMPTY:
; DISABLED-NEXT: // %bb.0:
-; DISABLED-NEXT: ld.param.u64 %rd1, [f_param_0];
-; DISABLED-NEXT: ld.u32 %r1, [%rd1];
-; DISABLED-NEXT: ld.u32 %r2, [%rd1+4];
+; DISABLED-NEXT: ld.param.b64 %rd1, [f_param_0];
+; DISABLED-NEXT: ld.b32 %r1, [%rd1];
+; DISABLED-NEXT: ld.b32 %r2, [%rd1+4];
; DISABLED-NEXT: add.s32 %r3, %r1, %r2;
; DISABLED-NEXT: st.param.b32 [func_retval0], %r3;
; DISABLED-NEXT: ret;
@@ -49,7 +49,7 @@ define half @fh(ptr %p) {
; ENABLED-NEXT: .reg .b64 %rd<2>;
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
-; ENABLED-NEXT: ld.param.u64 %rd1, [fh_param_0];
+; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0];
; ENABLED-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; ENABLED-NEXT: ld.b16 %rs5, [%rd1+8];
; ENABLED-NEXT: cvt.f32.f16 %f1, %rs2;
@@ -78,7 +78,7 @@ define half @fh(ptr %p) {
; DISABLED-NEXT: .reg .b64 %rd<2>;
; DISABLED-EMPTY:
; DISABLED-NEXT: // %bb.0:
-; DISABLED-NEXT: ld.param.u64 %rd1, [fh_param_0];
+; DISABLED-NEXT: ld.param.b64 %rd1, [fh_param_0];
; DISABLED-NEXT: ld.b16 %rs1, [%rd1];
; DISABLED-NEXT: ld.b16 %rs2, [%rd1+2];
; DISABLED-NEXT: ld.b16 %rs3, [%rd1+4];
@@ -125,14 +125,14 @@ define float @ff(ptr %p) {
; ENABLED-NEXT: .reg .b64 %rd<2>;
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
-; ENABLED-NEXT: ld.param.u64 %rd1, [ff_param_0];
-; ENABLED-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
-; ENABLED-NEXT: ld.f32 %f5, [%rd1+16];
+; ENABLED-NEXT: ld.param.b64 %rd1, [ff_param_0];
+; ENABLED-NEXT: ld.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
+; ENABLED-NEXT: ld.b32 %f5, [%rd1+16];
; ENABLED-NEXT: add.rn.f32 %f6, %f1, %f2;
; ENABLED-NEXT: add.rn.f32 %f7, %f3, %f4;
; ENABLED-NEXT: add.rn.f32 %f8, %f6, %f7;
; ENABLED-NEXT: add.rn.f32 %f9, %f8, %f5;
-; ENABLED-NEXT: st.param.f32 [func_retval0], %f9;
+; ENABLED-NEXT: st.param.b32 [func_retval0], %f9;
; ENABLED-NEXT: ret;
;
; DISABLED-LABEL: ff(
@@ -141,17 +141,17 @@ define float @ff(ptr %p) {
; DISABLED-NEXT: .reg .b64 %rd<2>;
; DISABLED-EMPTY:
; DISABLED-NEXT: // %bb.0:
-; DISABLED-NEXT: ld.param.u64 %rd1, [ff_param_0];
-; DISABLED-NEXT: ld.f32 %f1, [%rd1];
-; DISABLED-NEXT: ld.f32 %f2, [%rd1+4];
-; DISABLED-NEXT: ld.f32 %f3, [%rd1+8];
-; DISABLED-NEXT: ld.f32 %f4, [%rd1+12];
-; DISABLED-NEXT: ld.f32 %f5, [%rd1+16];
+; DISABLED-NEXT: ld.param.b64 %rd1, [ff_param_0];
+; DISABLED-NEXT: ld.b32 %f1, [%rd1];
+; DISABLED-NEXT: ld.b32 %f2, [%rd1+4];
+; DISABLED-NEXT: ld.b32 %f3, [%rd1+8];
+; DISABLED-NEXT: ld.b32 %f4, [%rd1+12];
+; DISABLED-NEXT: ld.b32 %f5, [%rd1+16];
; DISABLED-NEXT: add.rn.f32 %f6, %f1, %f2;
; DISABLED-NEXT: add.rn.f32 %f7, %f3, %f4;
; DISABLED-NEXT: add.rn.f32 %f8, %f6, %f7;
; DISABLED-NEXT: add.rn.f32 %f9, %f8, %f5;
-; DISABLED-NEXT: st.param.f32 [func_retval0], %f9;
+; DISABLED-NEXT: st.param.b32 [func_retval0], %f9;
; DISABLED-NEXT: ret;
%p.1 = getelementptr float, ptr %p, i32 1
%p.2 = getelementptr float, ptr %p, i32 2
@@ -176,9 +176,9 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
; ENABLED-NEXT: .reg .b64 %rd<3>;
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
-; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_param_0];
+; ENABLED-NEXT: ld.param.b64 %rd1, [combine_v16i8_param_0];
; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_param_1];
+; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_param_1];
; ENABLED-NEXT: bfe.u32 %r5, %r1, 0, 8;
; ENABLED-NEXT: bfe.u32 %r6, %r1, 8, 8;
; ENABLED-NEXT: bfe.u32 %r7, %r1, 16, 8;
@@ -210,7 +210,7 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
; ENABLED-NEXT: add.s32 %r33, %r32, %r18;
; ENABLED-NEXT: add.s32 %r34, %r33, %r19;
; ENABLED-NEXT: add.s32 %r35, %r34, %r20;
-; ENABLED-NEXT: st.u32 [%rd2], %r35;
+; ENABLED-NEXT: st.b32 [%rd2], %r35;
; ENABLED-NEXT: ret;
;
; DISABLED-LABEL: combine_v16i8(
@@ -219,24 +219,24 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
; DISABLED-NEXT: .reg .b64 %rd<3>;
; DISABLED-EMPTY:
; DISABLED-NEXT: // %bb.0:
-; DISABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_param_0];
-; DISABLED-NEXT: ld.u8 %r1, [%rd1];
-; DISABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_param_1];
-; DISABLED-NEXT: ld.u8 %r2, [%rd1+1];
-; DISABLED-NEXT: ld.u8 %r3, [%rd1+2];
-; DISABLED-NEXT: ld.u8 %r4, [%rd1+3];
-; DISABLED-NEXT: ld.u8 %r5, [%rd1+4];
-; DISABLED-NEXT: ld.u8 %r6, [%rd1+5];
-; DISABLED-NEXT: ld.u8 %r7, [%rd1+6];
-; DISABLED-NEXT: ld.u8 %r8, [%rd1+7];
-; DISABLED-NEXT: ld.u8 %r9, [%rd1+8];
-; DISABLED-NEXT: ld.u8 %r10, [%rd1+9];
-; DISABLED-NEXT: ld.u8 %r11, [%rd1+10];
-; DISABLED-NEXT: ld.u8 %r12, [%rd1+11];
-; DISABLED-NEXT: ld.u8 %r13, [%rd1+12];
-; DISABLED-NEXT: ld.u8 %r14, [%rd1+13];
-; DISABLED-NEXT: ld.u8 %r15, [%rd1+14];
-; DISABLED-NEXT: ld.u8 %r16, [%rd1+15];
+; DISABLED-NEXT: ld.param.b64 %rd1, [combine_v16i8_param_0];
+; DISABLED-NEXT: ld.b8 %r1, [%rd1];
+; DISABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_param_1];
+; DISABLED-NEXT: ld.b8 %r2, [%rd1+1];
+; DISABLED-NEXT: ld.b8 %r3, [%rd1+...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice. I think I can see where you're going and I like the direction.
LGTM.
This is awesome, LGTM! |
f0cdc61
to
36abb65
Compare
36abb65
to
c9d08db
Compare
c9d08db
to
4bb455f
Compare
In most cases, the type information attached to load and store instructions is meaningless and inconsistently applied. We can usually use ".b" loads and avoid the complexity of trying to assign the correct type. The one exception is sign-extending loads, which will continue to use ".s" to ensure the sign extension into a larger register is done correctly.