[VPlan] Add VPInstruction::StepVector and use it in VPWidenIntOrFpInductionRecipe #129508
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: Luke Lau (lukel97)

Changes: Split off from #118638, this adds a new VPInstruction for integer step vectors (0, 1, 2, ...), so that we can eventually model all the separate parts of VPWidenIntOrFpInductionRecipe in VPlan. The type of the element is specified through a sentinel value, as is done in #119284. This is then used by VPWidenIntOrFpInductionRecipe, where we add it just before execution in convertToConcreteRecipes. We need a dummy placeholder operand so we have somewhere to pass it, but this should go away when #118638 lands. (A minimal sketch of what the emitted step vector looks like follows the diff.)

Full diff: https://github.com/llvm/llvm-project/pull/129508.diff

10 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0f1fa517be000..663ab90b89892 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -881,6 +881,9 @@ class VPInstruction : public VPRecipeWithIRFlags,
// Extracts the first active lane of a vector, where the first operand is
// the predicate, and the second operand is the vector to extract.
ExtractFirstActive,
+ // Creates a step vector starting from 0 with a step of 1. The first operand
+ // is a dummy constant that should be used to specify the element type.
+ StepVector,
};
private:
@@ -1769,6 +1772,7 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
Step, IndDesc, DL),
Trunc(nullptr) {
addOperand(VF);
+ addOperand(VF); // Dummy StepVector replaced in convertToConcreteRecipes
}
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
@@ -1778,6 +1782,7 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
Step, IndDesc, DL),
Trunc(Trunc) {
addOperand(VF);
+ addOperand(VF); // Dummy StepVector replaced in convertToConcreteRecipes
}
~VPWidenIntOrFpInductionRecipe() override = default;
@@ -1803,10 +1808,14 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
VPValue *getVFValue() { return getOperand(2); }
const VPValue *getVFValue() const { return getOperand(2); }
+ VPValue *getStepVector() { return getOperand(3); }
+ const VPValue *getStepVector() const { return getOperand(3); }
+ void setStepVector(VPValue *V) { setOperand(3, V); }
+
VPValue *getSplatVFValue() {
// If the recipe has been unrolled (4 operands), return the VPValue for the
// induction increment.
- return getNumOperands() == 5 ? getOperand(3) : nullptr;
+ return getNumOperands() == 6 ? getOperand(4) : nullptr;
}
/// Returns the first defined value as TruncInst, if it is one or nullptr
@@ -1828,7 +1837,7 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
/// the last unrolled part, if it exists. Returns itself if unrolling did not
/// take place.
VPValue *getLastUnrolledPartOperand() {
- return getNumOperands() == 5 ? getOperand(4) : this;
+ return getNumOperands() == 6 ? getOperand(5) : this;
}
/// Returns true if the recipe only uses the first lane of operand \p Op.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 6f6875f0e5e0e..b4365b97639d7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -77,6 +77,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::AnyOf:
+ case VPInstruction::StepVector:
return SetResultTyFromOp();
case VPInstruction::ExtractFirstActive:
case VPInstruction::ExtractFromEnd: {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index e9f50e88867b2..cd4feda2a6493 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -713,6 +713,11 @@ Value *VPInstruction::generate(VPTransformState &State) {
Builder.getInt64Ty(), Mask, true, "first.active.lane");
return Builder.CreateExtractElement(Vec, Ctz, "early.exit.value");
}
+ case VPInstruction::StepVector: {
+ Type *EltTy = State.get(getOperand(0), true)->getType();
+ return State.Builder.CreateStepVector(VectorType::get(EltTy, State.VF));
+ }
+
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -824,6 +829,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::LogicalAnd:
case VPInstruction::Not:
case VPInstruction::PtrAdd:
+ case VPInstruction::StepVector:
return false;
default:
return true;
@@ -850,6 +856,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::BranchOnCount:
case VPInstruction::BranchOnCond:
case VPInstruction::ResumePhi:
+ case VPInstruction::StepVector:
return true;
case VPInstruction::PtrAdd:
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
@@ -947,6 +954,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractFirstActive:
O << "extract-first-active";
break;
+ case VPInstruction::StepVector:
+ O << "step-vector";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -1710,7 +1720,8 @@ InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
/// (0 * Step, 1 * Step, 2 * Step, ...)
/// to each vector element of Val.
/// \p Opcode is relevant for FP induction variable.
-static Value *getStepVector(Value *Val, Value *Step,
+/// \p InitVec is an integer step vector from 0 with a step of 1.
+static Value *getStepVector(Value *Val, Value *Step, Value *InitVec,
Instruction::BinaryOps BinOp, ElementCount VF,
IRBuilderBase &Builder) {
assert(VF.isVector() && "only vector VFs are supported");
@@ -1726,15 +1737,6 @@ static Value *getStepVector(Value *Val, Value *Step,
SmallVector<Constant *, 8> Indices;
- // Create a vector of consecutive numbers from zero to VF.
- VectorType *InitVecValVTy = ValVTy;
- if (STy->isFloatingPointTy()) {
- Type *InitVecValSTy =
- IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
- InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
- }
- Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
-
if (STy->isIntegerTy()) {
Step = Builder.CreateVectorSplat(VLen, Step);
assert(Step->getType() == Val->getType() && "Invalid step vec");
@@ -1800,8 +1802,11 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
}
Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
- Value *SteppedStart = getStepVector(SplatStart, Step, ID.getInductionOpcode(),
- State.VF, State.Builder);
+ assert(cast<VPInstruction>(getStepVector())->getOpcode() ==
+ VPInstruction::StepVector);
+ Value *SteppedStart =
+ ::getStepVector(SplatStart, Step, State.get(getStepVector()),
+ ID.getInductionOpcode(), State.VF, State.Builder);
// We create vector phi nodes for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b09933cd0e186..49bc0244f4e48 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2033,6 +2033,19 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(VPBB->phis())) {
+ if (auto *IVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+ Type *Ty = IVR->getTruncInst() ? IVR->getTruncInst()->getType()
+ : IVR->getPHINode()->getType();
+ if (Ty->isFloatingPointTy())
+ Ty = IntegerType::get(Ty->getContext(), Ty->getScalarSizeInBits());
+ VPValue *TyVal = Plan.getOrAddLiveIn(Constant::getNullValue(Ty));
+
+ VPInstruction *StepVector =
+ new VPInstruction(VPInstruction::StepVector, {TyVal});
+ Plan.getVectorPreheader()->appendRecipe(StepVector);
+ IVR->setStepVector(StepVector);
+ }
+
if (!isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(&R))
continue;
auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
index a83c62b04afc7..344696f0ac932 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -26,9 +26,9 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 2
; CHECK-NEXT: [[IND_END:%.*]] = add i32 [[IDX]], [[N_VEC]]
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[IDX]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[DOTSPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 2 x i32> [[TMP8]], splat (i32 1)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i32> [[DOTSPLAT]], [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = mul i32 1, [[TMP7]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index c890cb71d34be..74aa3da4bc544 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -1452,11 +1452,11 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2
; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i32 [[TMP11]], 2
; CHECK-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], -1
; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP15:%.*]] = shl <vscale x 4 x i64> [[TMP14]], splat (i64 1)
; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP9]], 3
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP17]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 3839b367ae08c..fbcdd51f71b59 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -33,9 +33,9 @@ define void @dead_load(ptr %p, i16 %start) {
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8
; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[N_VEC]], 3
; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[START_EXT]], [[TMP18]]
+; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[START_EXT]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
; CHECK-NEXT: [[TMP17:%.*]] = mul <vscale x 8 x i64> [[TMP15]], splat (i64 3)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> [[DOTSPLAT]], [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = mul i64 3, [[TMP14]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
index 2582882baba00..5c15660e87132 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
@@ -70,9 +70,9 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 {
; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: [[TMP50:%.*]] = mul i32 [[DOTCAST]], 3
; CHECK-NEXT: [[IND_END22:%.*]] = add i32 [[X_I32]], [[TMP50]]
+; CHECK-NEXT: [[TMP53:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[X_I64]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP53:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
; CHECK-NEXT: [[TMP55:%.*]] = mul <vscale x 8 x i64> [[TMP53]], splat (i64 3)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> [[DOTSPLAT]], [[TMP55]]
; CHECK-NEXT: [[TMP58:%.*]] = mul i64 3, [[TMP52]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
index 082386e39f3f6..46202c55555ee 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
@@ -600,8 +600,8 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = mul <vscale x 4 x i32> [[TMP12]], splat (i32 1)
; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP14]]
; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP10]] to i32
@@ -792,8 +792,8 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = mul <vscale x 4 x i32> [[TMP12]], splat (i32 1)
; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP14]]
; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP10]] to i32
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 00d8de67a3b40..f48161daf864e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -663,7 +663,7 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, vp<[[EXP_SCEV]]>, vp<[[VF]]> (truncated to i8)
+; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, vp<[[EXP_SCEV]]>, vp<[[VF]]>, vp<[[VF]]> (truncated to i8)
; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<0> + vp<[[CAN_IV]]> * vp<[[EXP_SCEV]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, vp<[[EXP_SCEV]]>
; CHECK-NEXT: WIDEN ir<%v3> = add nuw ir<%iv>, ir<1>
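To make the new opcode concrete, here is a minimal standalone sketch of what executing StepVector boils down to. emitStepVector is a hypothetical helper name; the real logic is the VPInstruction::generate case in the diff above, and IRBuilderBase::CreateStepVector is what produces the llvm.stepvector calls seen in the updated test CHECK lines.

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Hypothetical helper mirroring the StepVector case in VPInstruction::generate
    // above: build a <0, 1, 2, ...> vector with VF lanes of the given element type.
    Value *emitStepVector(IRBuilderBase &Builder, Type *EltTy, ElementCount VF) {
      // For scalable VFs this lowers to a call to the llvm.stepvector intrinsic,
      // e.g. @llvm.stepvector.nxv2i32(), matching the updated CHECK lines above.
      return Builder.CreateStepVector(VectorType::get(EltTy, VF));
    }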
…(NFC) (#129706) There are some opcodes that currently require specialized recipes, due to their result type not being implied by their operands, including casts. This leads to duplication from defining multiple full recipes. This patch introduces a new VPInstructionWithType subclass that also stores the result type. The general idea is for opcodes that need to specify a result type to use this general recipe. The current patch replaces VPScalarCastRecipe with VPInstructionWithType; a similar patch for VPWidenCastRecipe will follow soon. There are a few proposed opcodes that should also benefit, without the need for workarounds:
* llvm#129508
* llvm#119284
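As a rough illustration of that design (toy types only, not the actual VPlan classes), the point is simply that the result type travels with the instruction when the operands cannot determine it:

    #include <vector>

    struct ToyType {};   // stands in for llvm::Type
    struct ToyValue {};  // stands in for VPValue

    // A generic instruction: just an opcode and its operands.
    struct ToyInstruction {
      unsigned Opcode = 0;
      std::vector<ToyValue *> Operands;
    };

    // Variant for opcodes (casts, StepVector, ...) whose result type cannot be
    // inferred from the operands, so the type is stored explicitly.
    struct ToyInstructionWithType : ToyInstruction {
      ToyType *ResultTy = nullptr;
    };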
Force-pushed from b27f82e to 3b45321.
Rebased on top of #129706.
Force-pushed from 3b45321 to dab3f14.
Force-pushed from 1dc1eb2 to 5d307e4.
Rebased on top of main now that #119284 has landed.
@@ -1965,8 +1965,11 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
   }

   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
-  Value *SteppedStart = getStepVector(SplatStart, Step, ID.getInductionOpcode(),
-                                      State.VF, State.Builder);
+  assert(cast<VPInstruction>(getStepVector())->getOpcode() ==
&& "step vector operand must be a StepVector VPInstruction"?
Although that may be too restrictive, for fixed vectors it could just be a constant vector?
Fixed vectors would still be a VPInstruction::StepVector. Hopefully the need for this assertion would also go away if we fully expand the recipe in #118638
* Document range is to VF
* Move private method above
* Merge classof checks
* Use VPBuilder
* Add assertion method
…pe. NFC (llvm#137635) Split off from llvm#129508, this generalizes getSplatVFValue and getLastUnrolledPartOperand so they don't need to be changed if another operand is added.
…teNaryOp (#137632) Currently, if we try to create a VPInstructionWithType without FMFs via VPBuilder::createNaryOp, we will use the constructor that asserts `assert(isFPMathOp() && "this op can't take fast-math flags");`. This fixes it by checking if the FMFs have a value, similar to the other createNaryOp overloads. This is needed by #129508.
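A hedged sketch of the pattern that commit describes; the function name and the constructor signatures here are assumptions, not the exact VPBuilder or VPInstructionWithType API:

    // Sketch of the pattern only; the real createNaryOp overload and the
    // VPInstructionWithType constructors may take different parameters.
    #include <optional>

    VPInstruction *createTypedOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
                                 Type *ResultTy, std::optional<FastMathFlags> FMFs,
                                 DebugLoc DL) {
      if (FMFs)
        // Only use the flag-carrying constructor (which asserts isFPMathOp())
        // when flags are actually present.
        return new VPInstructionWithType(Opcode, Operands, ResultTy, *FMFs, DL);
      return new VPInstructionWithType(Opcode, Operands, ResultTy, DL);
    }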
@@ -1094,6 +1093,12 @@ void VPInstructionWithType::execute(VPTransformState &State) {
     State.set(this, Cast, VPLane(0));
     break;
   }
+  case VPInstruction::StepVector: {
Do we need to add this to VPInstruction::computeCost?
I think it's currently costed as zero, since we don't want to change the overall cost of VPWidenIntOrFpInductionRecipe.
Once we expand VPWidenIntOrFpInductionRecipe in #118638, I think it should be safe to add a cost to it, since the expansion will happen just before execution, after any costing.
Force-pushed from 2bca580 to 8201fa2.
auto *NewStepVector = new VPInstructionWithType(
    VPInstruction::StepVector, {}, NewIVTy, OldStepVector->getDebugLoc());
NewStepVector->insertAfter(WideIV->getStepVector()->getDefiningRecipe());
WideIV->setStepVector(NewStepVector);
StepVector has only a single user, the wide IV, right? Can we assert that there's a single user before removing the old step vector and just use OldStepVector->replaceAllUsesWith(NewStepVector), without the need for adding setStepVector?
Good idea, done now
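For reference, the resulting pattern is roughly the following fragment; it reuses the names from the quoted snippet and assumes the surrounding declarations:

    // With only one user (the wide IV), the old step vector can be replaced and
    // erased directly instead of going through a dedicated setStepVector().
    assert(OldStepVector->getNumUsers() == 1 &&
           "step vector should only be used by the wide induction");
    OldStepVector->replaceAllUsesWith(NewStepVector);
    OldStepVector->eraseFromParent();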
@@ -1918,6 +1928,18 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
   VPValue *getVFValue() { return getOperand(2); }
   const VPValue *getVFValue() const { return getOperand(2); }

+  // TODO: Remove once VPWidenIntOrFpInduction is fully expanded in
+  // convertToConcreteRecipes.
+  VPValue *getStepVector() { return getOperand(3); }
At the moment, it always must be a VPInstructionWithType, right? Would changing the return type to VPInstructionWithType remove some casts at use sites? Also assert that it is a StepVector opcode here, rather than at use sites?
Done, thanks
VPBuilder Builder(Plan.getVectorPreheader());
VPInstruction *StepVector = Builder.createNaryOp(
    VPInstruction::StepVector, {}, Ty, {}, IVR->getDebugLoc());
assert(IVR->getNumOperands() == 3);
Suggested change:
-    assert(IVR->getNumOperands() == 3);
+    assert(IVR->getNumOperands() == 3 && "can only add step vector before unrolling");
Thanks, done
// Infer an up-to-date type since
// optimizeVectorInductionWidthForTCAndVFUF may have truncated the start
// and step values.
Type *Ty = IVR->getPHINode()->getType();
if (TruncInst *Trunc = IVR->getTruncInst())
  Ty = Trunc->getType();
if (Ty->isFloatingPointTy())
  Ty = IntegerType::get(Ty->getContext(), Ty->getScalarSizeInBits());
Is this still needed? We now run before optimizeVectorInductionWidthForTCAndVFUF, AFAICT.
Woops, that comment was old from when we needed to use VPTypeAnalysis. Removed it now, since we can just work out the type from the underlying phi or trunc instead.
* Remove setStepVector
* Move asserts into getStepVector, change return type
* Remove outdated optimizeVectorInductionWidthForTCAndVFUF comment
LGTM, thanks
%t (outdated)
Looks like this empty file was added by accident? Please remove before landing.
Woops, will remove. This sometimes gets left around when calling update_test_checks.py on files that check the stderr output, e.g. `; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -mattr=+sve 2>%t`. Not sure if UTC could be taught to redirect that.
* Remove temp stepvector operand from VPWidenIntOrFpInductionRecipe
* Use VPInstruction::Broadcast
Split off from #118638, this adds VPInstruction::StepVector, which generates integer step vectors (0,1,2,...,VF). This is a step towards eventually modelling all the separate parts of VPWidenIntOrFpInductionRecipe in VPlan.
This is then used by VPWidenIntOrFpInductionRecipe, where we materialize it just before unrolling so the operands stay in a fixed position.
The need for a separate operand in VPWidenIntOrFpInductionRecipe, as well as the need to update it in optimizeVectorInductionWidthForTCAndVFUF, should be removed with #118638 when everything is expanded in convertToConcreteRecipes.