[RISCV] Remove riscv.segN.load/store in favor of their mask variants #137045


Merged: 3 commits, May 8, 2025

Conversation

mshockwave
Member

RISCVVectorPeepholePass already replaces instructions whose mask is all ones with their unmasked variants, so there is little point in keeping separate unmasked versions of these intrinsics.

Note that riscv.segN.load/store.mask does not take the pointer type (i.e. address space) as part of its overloaded type signature, because RISC-V doesn't really use address spaces other than the default one.

This PR stacks on top of #135445
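
To make the new signature concrete, here is a minimal C++ sketch (not part of this patch) that emits the masked fixed-vector segment load through IRBuilder. The helper name, the <8 x i32> segment type, and the i64 VL type (RV64) are assumptions made for the example.

```cpp
// Illustrative sketch only: emitting the masked fixed-vector segment load.
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsRISCV.h"

using namespace llvm;

// Emits roughly:
//   call {<8 x i32>, <8 x i32>}
//     @llvm.riscv.seg2.load.mask.v8i32.i64(ptr %p, <8 x i1> <all ones>, i64 8)
// Only the segment vector type and the VL type are overloaded; the pointer
// type is not, since the default address space is assumed.
static CallInst *emitSeg2LoadAllOnes(IRBuilder<> &Builder, Value *Ptr) {
  auto *SegTy = FixedVectorType::get(Builder.getInt32Ty(), 8);
  auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), 8);
  // An all-ones mask: RISCVVectorPeephole later folds the lowered vlseg into
  // its unmasked form, which is why no separate unmasked intrinsic is needed.
  Value *AllOnes = Constant::getAllOnesValue(MaskTy);
  Value *VL = Builder.getInt64(8);
  return Builder.CreateIntrinsic(Intrinsic::riscv_seg2_load_mask,
                                 {SegTy, Builder.getInt64Ty()},
                                 {Ptr, AllOnes, VL});
}
```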

@llvmbot
Member

llvmbot commented Apr 23, 2025

@llvm/pr-subscribers-backend-risc-v

Author: Min-Yih Hsu (mshockwave)

Changes

RISCVVectorPeepholePass already replaces instructions whose mask is all ones with their unmasked variants, so there is little point in keeping separate unmasked versions of these intrinsics.

Note that riscv.segN.load/store.mask does not take the pointer type (i.e. address space) as part of its overloaded type signature, because RISC-V doesn't really use address spaces other than the default one.

This PR stacks on top of #135445


Patch is 116.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137045.diff

12 Files Affected:

  • (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+5-6)
  • (modified) llvm/include/llvm/IR/IntrinsicsRISCV.td (+12-4)
  • (modified) llvm/lib/CodeGen/InterleavedAccessPass.cpp (+144-37)
  • (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+160-115)
  • (modified) llvm/lib/Target/RISCV/RISCVISelLowering.h (+5-6)
  • (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll (+502-32)
  • (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-segN-load.ll (+7-14)
  • (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-segN-store.ll (+8-16)
  • (modified) llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll (-53)
  • (modified) llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll (+26-101)
  • (modified) llvm/test/Transforms/InterleavedAccess/RISCV/zve32x.ll (+1-1)
  • (modified) llvm/test/Transforms/InterleavedAccess/RISCV/zvl32b.ll (+1-1)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 4f2f202f94841..6eaa1bae1da97 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3179,15 +3179,15 @@ class TargetLoweringBase {
     return false;
   }
 
-  /// Lower an interleaved load to target specific intrinsics. Return
+  /// Lower a deinterleaved load to target specific intrinsics. Return
   /// true on success.
   ///
   /// \p Load is a vp.load instruction.
   /// \p Mask is a mask value
   /// \p DeinterleaveRes is a list of deinterleaved results.
   virtual bool
-  lowerDeinterleavedIntrinsicToVPLoad(VPIntrinsic *Load, Value *Mask,
-                                      ArrayRef<Value *> DeinterleaveRes) const {
+  lowerDeinterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
+                           ArrayRef<Value *> DeinterleaveRes) const {
     return false;
   }
 
@@ -3197,9 +3197,8 @@ class TargetLoweringBase {
   /// \p Store is the vp.store instruction.
   /// \p Mask is a mask value
   /// \p InterleaveOps is a list of values being interleaved.
-  virtual bool
-  lowerInterleavedIntrinsicToVPStore(VPIntrinsic *Store, Value *Mask,
-                                     ArrayRef<Value *> InterleaveOps) const {
+  virtual bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
+                                       ArrayRef<Value *> InterleaveOps) const {
     return false;
   }
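For downstream targets, a minimal sketch of what overriding the renamed hooks could look like; MyTargetLowering is a hypothetical class and the bodies are placeholders, only the hook signatures come from the header change above.

```cpp
// Hypothetical target, shown only to illustrate the renamed hook signatures.
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

class MyTargetLowering : public TargetLowering {
public:
  explicit MyTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {}

  bool lowerDeinterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
                                ArrayRef<Value *> DeinterleaveRes) const override {
    // Emit a target-specific masked segment load here and replace the
    // deinterleaved results; returning false keeps the generic lowering.
    return false;
  }

  bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
                               ArrayRef<Value *> InterleaveOps) const override {
    return false;
  }
};
```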
 
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
index 99cb557d9aa09..18b2883eb00e7 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -1704,19 +1704,27 @@ let TargetPrefix = "riscv" in {
   }
 
   // Segment loads/stores for fixed vectors.
+  // Note: we only have the masked variants because RISCVVectorPeephole
+  // would lower any instruction with an all-ones mask into the unmasked
+  // version anyway.
   foreach nf = [2, 3, 4, 5, 6, 7, 8] in {
-    def int_riscv_seg # nf # _load
+    // Input: (pointer, mask, vl)
+    def int_riscv_seg # nf # _load_mask
           : DefaultAttrsIntrinsic<!listconcat([llvm_anyvector_ty],
                                               !listsplat(LLVMMatchType<0>,
                                               !add(nf, -1))),
-                                  [llvm_anyptr_ty, llvm_anyint_ty],
+                                  [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                   llvm_anyint_ty],
                                   [NoCapture<ArgIndex<0>>, IntrReadMem]>;
-    def int_riscv_seg # nf # _store
+
+    // Input: (<stored values>..., pointer, mask, vl)
+    def int_riscv_seg # nf # _store_mask
           : DefaultAttrsIntrinsic<[],
                                   !listconcat([llvm_anyvector_ty],
                                               !listsplat(LLVMMatchType<0>,
                                                           !add(nf, -1)),
-                                              [llvm_anyptr_ty, llvm_anyint_ty]),
+                                              [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                               llvm_anyint_ty]),
                                   [NoCapture<ArgIndex<nf>>, IntrWriteMem]>;
   }
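A hedged sketch of how a caller might build the masked segment store defined above, showing the (stored values..., pointer, mask, vl) operand order; the helper name and the RV64 i64 VL type are assumptions.

```cpp
// Illustrative sketch only: operand order of the masked segment store.
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsRISCV.h"

using namespace llvm;

static CallInst *emitSeg2StoreAllOnes(IRBuilder<> &Builder, Value *V0,
                                      Value *V1, Value *Ptr) {
  auto *SegTy = cast<FixedVectorType>(V0->getType()); // e.g. <8 x i32>
  auto *MaskTy =
      FixedVectorType::get(Builder.getInt1Ty(), SegTy->getNumElements());
  Value *AllOnes = Constant::getAllOnesValue(MaskTy);
  Value *VL = Builder.getInt64(SegTy->getNumElements());
  // Overloaded types are the segment vector type and the VL type; the stored
  // values, the pointer, the mask, and the VL are plain operands.
  return Builder.CreateIntrinsic(Intrinsic::riscv_seg2_store_mask,
                                 {SegTy, Builder.getInt64Ty()},
                                 {V0, V1, Ptr, AllOnes, VL});
}
```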
 
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 9e47510e9cd1a..66c1cd98ef602 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -100,11 +100,11 @@ class InterleavedAccessImpl {
   unsigned MaxFactor = 0u;
 
   /// Transform an interleaved load into target specific intrinsics.
-  bool lowerInterleavedLoad(LoadInst *LI,
+  bool lowerInterleavedLoad(Instruction *LoadOp,
                             SmallSetVector<Instruction *, 32> &DeadInsts);
 
   /// Transform an interleaved store into target specific intrinsics.
-  bool lowerInterleavedStore(StoreInst *SI,
+  bool lowerInterleavedStore(Instruction *StoreOp,
                              SmallSetVector<Instruction *, 32> &DeadInsts);
 
   /// Transform a load and a deinterleave intrinsic into target specific
@@ -131,7 +131,7 @@ class InterleavedAccessImpl {
   /// made.
   bool replaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
                             SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
-                            LoadInst *LI);
+                            Instruction *LI);
 };
 
 class InterleavedAccess : public FunctionPass {
@@ -249,11 +249,43 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
   return false;
 }
 
+/// Return true if it's an interleaving mask. For instance, 111000111000 is
+/// interleaved from three 1010 masks. \p LaneMask returns the mask of an
+/// individual lane.
+static bool isInterleavedConstantMask(unsigned Factor, ConstantVector *Mask,
+                                      SmallVectorImpl<Constant *> &LaneMask) {
+  unsigned LaneMaskLen = LaneMask.size();
+  if (auto *Splat = Mask->getSplatValue()) {
+    std::fill(LaneMask.begin(), LaneMask.end(), Splat);
+  } else {
+    for (unsigned Idx = 0U, N = LaneMaskLen * Factor; Idx < N; ++Idx) {
+      Constant *Ref = Mask->getAggregateElement(alignDown(Idx, Factor));
+      if (Ref != Mask->getAggregateElement(Idx))
+        return false;
+      LaneMask[Idx / Factor] = Ref;
+    }
+  }
+
+  return true;
+}
+
 bool InterleavedAccessImpl::lowerInterleavedLoad(
-    LoadInst *LI, SmallSetVector<Instruction *, 32> &DeadInsts) {
-  if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
+    Instruction *LoadOp, SmallSetVector<Instruction *, 32> &DeadInsts) {
+  if (isa<ScalableVectorType>(LoadOp->getType()))
     return false;
 
+  if (auto *LI = dyn_cast<LoadInst>(LoadOp)) {
+    if (!LI->isSimple())
+      return false;
+  } else if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
+    assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load);
+    // Require a constant mask and evl.
+    if (!isa<ConstantVector>(VPLoad->getArgOperand(1)))
+      return false;
+  } else {
+    llvm_unreachable("unsupported load operation");
+  }
+
   // Check if all users of this load are shufflevectors. If we encounter any
   // users that are extractelement instructions or binary operators, we save
   // them to later check if they can be modified to extract from one of the
@@ -265,7 +297,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   // binop are the same load.
   SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;
 
-  for (auto *User : LI->users()) {
+  for (auto *User : LoadOp->users()) {
     auto *Extract = dyn_cast<ExtractElementInst>(User);
     if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
       Extracts.push_back(Extract);
@@ -294,7 +326,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   unsigned Factor, Index;
 
   unsigned NumLoadElements =
-      cast<FixedVectorType>(LI->getType())->getNumElements();
+      cast<FixedVectorType>(LoadOp->getType())->getNumElements();
   auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
   // Check if the first shufflevector is DE-interleave shuffle.
   if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
@@ -327,9 +359,9 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
 
     assert(Shuffle->getShuffleMask().size() <= NumLoadElements);
 
-    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LI)
+    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LoadOp)
       Indices.push_back(Index);
-    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LI)
+    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LoadOp)
       Indices.push_back(Index);
   }
 
@@ -339,25 +371,48 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
     return false;
 
   bool BinOpShuffleChanged =
-      replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI);
-
-  LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
+      replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp);
+
+  // Check if the de-interleaved vp.load masks are the same.
+  unsigned ShuffleMaskLen = Shuffles[0]->getShuffleMask().size();
+  SmallVector<Constant *, 8> LaneMask(ShuffleMaskLen, nullptr);
+  if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
+    if (!isInterleavedConstantMask(
+            Factor, cast<ConstantVector>(VPLoad->getArgOperand(1)), LaneMask))
+      return false;
+  }
 
-  // Try to create target specific intrinsics to replace the load and shuffles.
-  if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor)) {
-    // If Extracts is not empty, tryReplaceExtracts made changes earlier.
-    return !Extracts.empty() || BinOpShuffleChanged;
+  LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n");
+
+  if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
+    auto *MaskVec = ConstantVector::get(LaneMask);
+    // Sometimes the number of Shuffles might be less than Factor, we have to
+    // fill the gaps with null. Also, lowerDeinterleavedVPLoad
+    // expects them to be sorted.
+    SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
+    for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
+      ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
+    if (!TLI->lowerDeinterleavedVPLoad(VPLoad, MaskVec, ShuffleValues))
+      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
+      return !Extracts.empty() || BinOpShuffleChanged;
+  } else {
+    // Try to create target specific intrinsics to replace the load and
+    // shuffles.
+    if (!TLI->lowerInterleavedLoad(cast<LoadInst>(LoadOp), Shuffles, Indices,
+                                   Factor))
+      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
+      return !Extracts.empty() || BinOpShuffleChanged;
   }
 
   DeadInsts.insert_range(Shuffles);
 
-  DeadInsts.insert(LI);
+  DeadInsts.insert(LoadOp);
   return true;
 }
 
 bool InterleavedAccessImpl::replaceBinOpShuffles(
     ArrayRef<ShuffleVectorInst *> BinOpShuffles,
-    SmallVectorImpl<ShuffleVectorInst *> &Shuffles, LoadInst *LI) {
+    SmallVectorImpl<ShuffleVectorInst *> &Shuffles, Instruction *LoadOp) {
   for (auto *SVI : BinOpShuffles) {
     BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
     Type *BIOp0Ty = BI->getOperand(0)->getType();
@@ -380,9 +435,9 @@ bool InterleavedAccessImpl::replaceBinOpShuffles(
                       << "\n  With    : " << *NewSVI1 << "\n    And   : "
                       << *NewSVI2 << "\n    And   : " << *NewBI << "\n");
     RecursivelyDeleteTriviallyDeadInstructions(SVI);
-    if (NewSVI1->getOperand(0) == LI)
+    if (NewSVI1->getOperand(0) == LoadOp)
       Shuffles.push_back(NewSVI1);
-    if (NewSVI2->getOperand(0) == LI)
+    if (NewSVI2->getOperand(0) == LoadOp)
       Shuffles.push_back(NewSVI2);
   }
 
@@ -454,27 +509,78 @@ bool InterleavedAccessImpl::tryReplaceExtracts(
 }
 
 bool InterleavedAccessImpl::lowerInterleavedStore(
-    StoreInst *SI, SmallSetVector<Instruction *, 32> &DeadInsts) {
-  if (!SI->isSimple())
-    return false;
+    Instruction *StoreOp, SmallSetVector<Instruction *, 32> &DeadInsts) {
+  Value *StoredValue;
+  if (auto *SI = dyn_cast<StoreInst>(StoreOp)) {
+    if (!SI->isSimple())
+      return false;
+    StoredValue = SI->getValueOperand();
+  } else if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
+    assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
+    // Require a constant mask.
+    if (!isa<ConstantVector>(VPStore->getArgOperand(2)))
+      return false;
+    StoredValue = VPStore->getArgOperand(0);
+  } else {
+    llvm_unreachable("unsupported store operation");
+  }
 
-  auto *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
+  auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue);
   if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
     return false;
 
+  unsigned NumStoredElements =
+      cast<FixedVectorType>(SVI->getType())->getNumElements();
   // Check if the shufflevector is RE-interleave shuffle.
   unsigned Factor;
   if (!isReInterleaveMask(SVI, Factor, MaxFactor))
     return false;
+  assert(NumStoredElements % Factor == 0 &&
+         "number of stored element should be a multiple of Factor");
+
+  // Check if the de-interleaved vp.store masks are the same.
+  unsigned LaneMaskLen = NumStoredElements / Factor;
+  SmallVector<Constant *, 8> LaneMask(LaneMaskLen, nullptr);
+  if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
+    if (!isInterleavedConstantMask(
+            Factor, cast<ConstantVector>(VPStore->getArgOperand(2)), LaneMask))
+      return false;
+  }
 
-  LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
+  LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp << "\n");
 
-  // Try to create target specific intrinsics to replace the store and shuffle.
-  if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
-    return false;
+  if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
+    IRBuilder<> Builder(VPStore);
+    // We need to effectively de-interleave the shufflemask
+    // because lowerInterleavedVPStore expected individual de-interleaved
+    // values.
+    SmallVector<Value *, 10> NewShuffles;
+    SmallVector<int, 16> NewShuffleMask(LaneMaskLen);
+    auto ShuffleMask = SVI->getShuffleMask();
+
+    for (unsigned i = 0; i < Factor; i++) {
+      for (unsigned j = 0; j < LaneMaskLen; j++)
+        NewShuffleMask[j] = ShuffleMask[i + Factor * j];
+
+      NewShuffles.push_back(Builder.CreateShuffleVector(
+          SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask));
+    }
+
+    // Try to create target specific intrinsics to replace the vp.store and
+    // shuffle.
+    if (!TLI->lowerInterleavedVPStore(VPStore, ConstantVector::get(LaneMask),
+                                      NewShuffles))
+      // We already created new shuffles.
+      return true;
+  } else {
+    // Try to create target specific intrinsics to replace the store and
+    // shuffle.
+    if (!TLI->lowerInterleavedStore(cast<StoreInst>(StoreOp), SVI, Factor))
+      return false;
+  }
 
   // Already have a new target specific interleaved store. Erase the old store.
-  DeadInsts.insert(SI);
+  DeadInsts.insert(StoreOp);
   DeadInsts.insert(SVI);
   return true;
 }
@@ -686,8 +792,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
 
     // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special
     // TLI function to emit target-specific interleaved instruction.
-    if (!TLI->lowerDeinterleavedIntrinsicToVPLoad(VPLoad, Mask,
-                                                  DeinterleaveValues))
+    if (!TLI->lowerDeinterleavedVPLoad(VPLoad, Mask, DeinterleaveValues))
       return false;
 
   } else {
@@ -739,8 +844,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
 
     // Since lowerInterleavedStore expects Shuffle and StoreInst, use special
     // TLI function to emit target-specific interleaved instruction.
-    if (!TLI->lowerInterleavedIntrinsicToVPStore(VPStore, Mask,
-                                                 InterleaveValues))
+    if (!TLI->lowerInterleavedVPStore(VPStore, Mask, InterleaveValues))
       return false;
   } else {
     auto *SI = cast<StoreInst>(StoredBy);
@@ -766,12 +870,15 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
   SmallSetVector<Instruction *, 32> DeadInsts;
   bool Changed = false;
 
+  using namespace PatternMatch;
   for (auto &I : instructions(F)) {
-    if (auto *LI = dyn_cast<LoadInst>(&I))
-      Changed |= lowerInterleavedLoad(LI, DeadInsts);
+    if (match(&I, m_CombineOr(m_Load(m_Value()),
+                              m_Intrinsic<Intrinsic::vp_load>())))
+      Changed |= lowerInterleavedLoad(&I, DeadInsts);
 
-    if (auto *SI = dyn_cast<StoreInst>(&I))
-      Changed |= lowerInterleavedStore(SI, DeadInsts);
+    if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()),
+                              m_Intrinsic<Intrinsic::vp_store>())))
+      Changed |= lowerInterleavedStore(&I, DeadInsts);
 
     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
       // At present, we only have intrinsics to represent (de)interleaving
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f7d192756fd56..96baa24e9665f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1724,24 +1724,24 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
                  MachineMemOperand::MOVolatile;
     return true;
-  case Intrinsic::riscv_seg2_load:
-  case Intrinsic::riscv_seg3_load:
-  case Intrinsic::riscv_seg4_load:
-  case Intrinsic::riscv_seg5_load:
-  case Intrinsic::riscv_seg6_load:
-  case Intrinsic::riscv_seg7_load:
-  case Intrinsic::riscv_seg8_load:
+  case Intrinsic::riscv_seg2_load_mask:
+  case Intrinsic::riscv_seg3_load_mask:
+  case Intrinsic::riscv_seg4_load_mask:
+  case Intrinsic::riscv_seg5_load_mask:
+  case Intrinsic::riscv_seg6_load_mask:
+  case Intrinsic::riscv_seg7_load_mask:
+  case Intrinsic::riscv_seg8_load_mask:
     return SetRVVLoadStoreInfo(/*PtrOp*/ 0, /*IsStore*/ false,
                                /*IsUnitStrided*/ false, /*UsePtrVal*/ true);
-  case Intrinsic::riscv_seg2_store:
-  case Intrinsic::riscv_seg3_store:
-  case Intrinsic::riscv_seg4_store:
-  case Intrinsic::riscv_seg5_store:
-  case Intrinsic::riscv_seg6_store:
-  case Intrinsic::riscv_seg7_store:
-  case Intrinsic::riscv_seg8_store:
-    // Operands are (vec, ..., vec, ptr, vl)
-    return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 2,
+  case Intrinsic::riscv_seg2_store_mask:
+  case Intrinsic::riscv_seg3_store_mask:
+  case Intrinsic::riscv_seg4_store_mask:
+  case Intrinsic::riscv_seg5_store_mask:
+  case Intrinsic::riscv_seg6_store_mask:
+  case Intrinsic::riscv_seg7_store_mask:
+  case Intrinsic::riscv_seg8_store_mask:
+    // Operands are (vec, ..., vec, ptr, mask, vl)
+    return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3,
                                /*IsStore*/ true,
                                /*IsUnitStrided*/ false, /*UsePtrVal*/ true);
   case Intrinsic::riscv_vle:
@@ -10444,19 +10444,19 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   switch (IntNo) {
   default:
     break;
-  case Intrinsic::riscv_seg2_load:
-  case Intrinsic::riscv_seg3_load:
-  case Intrinsic::riscv_seg4_load:
-  case Intrinsic::riscv_seg5_load:
-  case Intrinsic::riscv_seg6_load:
-  case Intrinsic::riscv_seg7_load:
-  case Intrinsic::riscv_seg8_load: {
+  case Intrinsic::riscv_seg2_load_mask:
+  case Intrinsic::riscv_seg3_load_mask:
+  case Intrinsic::riscv_seg4_load_mask:
+  case Intrinsic::riscv_seg5_load_mask:
+  case Intrinsic::riscv_seg6_load_mask:
+  case Intrinsic::riscv_seg7_load_mask:
+  case Intrinsic::riscv_seg8_load_mask: {
     SDLoc DL(Op);
     static const Intrinsic::ID VlsegInts[7] = {
-        Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3,
-        Intrinsic::riscv_vlseg4, Intrinsic::riscv_vlseg5,
-        Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7,
-        Intrinsic::riscv_vlseg8};
+        Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
+        Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
+        Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
+        Intrinsic::riscv_vlseg8_mask};
     unsigned NF = Op->getNumValues() - 1;
     assert(NF >= 2 && NF <= 8 && "Unexpected seg number");
     MVT XLenVT = Subtarget.getXLenVT();
@@ -10466,7 +10466,16 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                   ContainerVT.getScalarSizeInBits();
     EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF);
 
-    SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
+    // Operands: (chain, int_id, pointer, mask, vl)
+    SDValue VL = Op.getOperand(Op.getNumOperands() - 1);
+    SDValue Mask = Op.getOperand(3);
+    MVT MaskVT = Mask.getSimpleValueType();
+    if (MaskVT.isFixedLengthVector()) {
+      MVT MaskContainerVT =
+          ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget);
+      Mask = convertToScalabl...
[truncated]
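
As a standalone illustration of the lane-mask check added to InterleavedAccessPass above (not code from the patch), the following sketch uses plain bools instead of Constant elements: with Factor = 3, the mask 1 1 1 0 0 0 deinterleaves to the per-lane mask 1 0, while 1 0 1 0 0 0 is rejected.

```cpp
// Standalone illustration of the interleaved-constant-mask check.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

using namespace llvm;

static bool deinterleaveConstantMask(unsigned Factor, ArrayRef<bool> Mask,
                                     SmallVectorImpl<bool> &LaneMask) {
  assert(Factor != 0 && Mask.size() % Factor == 0);
  LaneMask.assign(Mask.size() / Factor, false);
  for (unsigned Idx = 0, N = Mask.size(); Idx < N; ++Idx) {
    // Every group of Factor consecutive elements must agree with the first
    // element of its group, i.e. the mask is Factor interleaved copies of
    // one per-lane mask.
    if (Mask[Idx] != Mask[(Idx / Factor) * Factor])
      return false;
    LaneMask[Idx / Factor] = Mask[Idx];
  }
  return true;
}
```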

@llvmbot
Member

llvmbot commented Apr 23, 2025

@llvm/pr-subscribers-llvm-ir


@llvmbot
Member

llvmbot commented Apr 23, 2025

@llvm/pr-subscribers-llvm-transforms

+    }
+
+    // Try to create target specific intrinsics to replace the vp.store and
+    // shuffle.
+    if (!TLI->lowerInterleavedVPStore(VPStore, ConstantVector::get(LaneMask),
+                                      NewShuffles))
+      // We already created new shuffles.
+      return true;
+  } else {
+    // Try to create target specific intrinsics to replace the store and
+    // shuffle.
+    if (!TLI->lowerInterleavedStore(cast<StoreInst>(StoreOp), SVI, Factor))
+      return false;
+  }
 
   // Already have a new target specific interleaved store. Erase the old store.
-  DeadInsts.insert(SI);
+  DeadInsts.insert(StoreOp);
   DeadInsts.insert(SVI);
   return true;
 }
@@ -686,8 +792,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
 
     // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special
     // TLI function to emit target-specific interleaved instruction.
-    if (!TLI->lowerDeinterleavedIntrinsicToVPLoad(VPLoad, Mask,
-                                                  DeinterleaveValues))
+    if (!TLI->lowerDeinterleavedVPLoad(VPLoad, Mask, DeinterleaveValues))
       return false;
 
   } else {
@@ -739,8 +844,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
 
     // Since lowerInterleavedStore expects Shuffle and StoreInst, use special
     // TLI function to emit target-specific interleaved instruction.
-    if (!TLI->lowerInterleavedIntrinsicToVPStore(VPStore, Mask,
-                                                 InterleaveValues))
+    if (!TLI->lowerInterleavedVPStore(VPStore, Mask, InterleaveValues))
       return false;
   } else {
     auto *SI = cast<StoreInst>(StoredBy);
@@ -766,12 +870,15 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
   SmallSetVector<Instruction *, 32> DeadInsts;
   bool Changed = false;
 
+  using namespace PatternMatch;
   for (auto &I : instructions(F)) {
-    if (auto *LI = dyn_cast<LoadInst>(&I))
-      Changed |= lowerInterleavedLoad(LI, DeadInsts);
+    if (match(&I, m_CombineOr(m_Load(m_Value()),
+                              m_Intrinsic<Intrinsic::vp_load>())))
+      Changed |= lowerInterleavedLoad(&I, DeadInsts);
 
-    if (auto *SI = dyn_cast<StoreInst>(&I))
-      Changed |= lowerInterleavedStore(SI, DeadInsts);
+    if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()),
+                              m_Intrinsic<Intrinsic::vp_store>())))
+      Changed |= lowerInterleavedStore(&I, DeadInsts);
 
     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
       // At present, we only have intrinsics to represent (de)interleaving
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f7d192756fd56..96baa24e9665f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1724,24 +1724,24 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
                  MachineMemOperand::MOVolatile;
     return true;
-  case Intrinsic::riscv_seg2_load:
-  case Intrinsic::riscv_seg3_load:
-  case Intrinsic::riscv_seg4_load:
-  case Intrinsic::riscv_seg5_load:
-  case Intrinsic::riscv_seg6_load:
-  case Intrinsic::riscv_seg7_load:
-  case Intrinsic::riscv_seg8_load:
+  case Intrinsic::riscv_seg2_load_mask:
+  case Intrinsic::riscv_seg3_load_mask:
+  case Intrinsic::riscv_seg4_load_mask:
+  case Intrinsic::riscv_seg5_load_mask:
+  case Intrinsic::riscv_seg6_load_mask:
+  case Intrinsic::riscv_seg7_load_mask:
+  case Intrinsic::riscv_seg8_load_mask:
     return SetRVVLoadStoreInfo(/*PtrOp*/ 0, /*IsStore*/ false,
                                /*IsUnitStrided*/ false, /*UsePtrVal*/ true);
-  case Intrinsic::riscv_seg2_store:
-  case Intrinsic::riscv_seg3_store:
-  case Intrinsic::riscv_seg4_store:
-  case Intrinsic::riscv_seg5_store:
-  case Intrinsic::riscv_seg6_store:
-  case Intrinsic::riscv_seg7_store:
-  case Intrinsic::riscv_seg8_store:
-    // Operands are (vec, ..., vec, ptr, vl)
-    return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 2,
+  case Intrinsic::riscv_seg2_store_mask:
+  case Intrinsic::riscv_seg3_store_mask:
+  case Intrinsic::riscv_seg4_store_mask:
+  case Intrinsic::riscv_seg5_store_mask:
+  case Intrinsic::riscv_seg6_store_mask:
+  case Intrinsic::riscv_seg7_store_mask:
+  case Intrinsic::riscv_seg8_store_mask:
+    // Operands are (vec, ..., vec, ptr, mask, vl)
+    return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3,
                                /*IsStore*/ true,
                                /*IsUnitStrided*/ false, /*UsePtrVal*/ true);
   case Intrinsic::riscv_vle:
@@ -10444,19 +10444,19 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   switch (IntNo) {
   default:
     break;
-  case Intrinsic::riscv_seg2_load:
-  case Intrinsic::riscv_seg3_load:
-  case Intrinsic::riscv_seg4_load:
-  case Intrinsic::riscv_seg5_load:
-  case Intrinsic::riscv_seg6_load:
-  case Intrinsic::riscv_seg7_load:
-  case Intrinsic::riscv_seg8_load: {
+  case Intrinsic::riscv_seg2_load_mask:
+  case Intrinsic::riscv_seg3_load_mask:
+  case Intrinsic::riscv_seg4_load_mask:
+  case Intrinsic::riscv_seg5_load_mask:
+  case Intrinsic::riscv_seg6_load_mask:
+  case Intrinsic::riscv_seg7_load_mask:
+  case Intrinsic::riscv_seg8_load_mask: {
     SDLoc DL(Op);
     static const Intrinsic::ID VlsegInts[7] = {
-        Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3,
-        Intrinsic::riscv_vlseg4, Intrinsic::riscv_vlseg5,
-        Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7,
-        Intrinsic::riscv_vlseg8};
+        Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
+        Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
+        Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
+        Intrinsic::riscv_vlseg8_mask};
     unsigned NF = Op->getNumValues() - 1;
     assert(NF >= 2 && NF <= 8 && "Unexpected seg number");
     MVT XLenVT = Subtarget.getXLenVT();
@@ -10466,7 +10466,16 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                   ContainerVT.getScalarSizeInBits();
     EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF);
 
-    SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
+    // Operands: (chain, int_id, pointer, mask, vl)
+    SDValue VL = Op.getOperand(Op.getNumOperands() - 1);
+    SDValue Mask = Op.getOperand(3);
+    MVT MaskVT = Mask.getSimpleValueType();
+    if (MaskVT.isFixedLengthVector()) {
+      MVT MaskContainerVT =
+          ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget);
+      Mask = convertToScalabl...
[truncated]

github-actions bot commented Apr 23, 2025

✅ With the latest revision this PR passed the undef deprecator.

Contributor

@wangpc-pp wangpc-pp left a comment

LGTM. I agree we should do some subtractions at times.

@mshockwave mshockwave force-pushed the patch/riscv/remove-unmasked-segX branch from c4d96dc to 75108c0 Compare April 25, 2025 21:22
@topperc
Collaborator

topperc commented Apr 25, 2025

I think we should not call it "deprecate" if you're deleting them. Most of the time when we say we're deprecating something in LLVM, it's still usable for a time with maybe a warning.

@mshockwave mshockwave changed the title [RISCV] Deprecate riscv.segN.load/store in favor of their mask variants [RISCV] Remove riscv.segN.load/store in favor of their mask variants Apr 25, 2025
@mshockwave
Member Author

I think we should not call it "deprecate" if you're deleting them. Most of the time when we say we're deprecating something in LLVM, it's still usable for a time with maybe a warning.

Agree. The title is updated now.

mshockwave added 2 commits May 7, 2025 15:56
…ants

RISCVVectorPeepholePass would replace instructions with all-ones mask
with their unmask variant, so there isn't really a point to keep
separate versions of intrinsics.
@mshockwave mshockwave force-pushed the patch/riscv/remove-unmasked-segX branch from 75108c0 to 5744e2c Compare May 8, 2025 00:35
MVT MaskVT = Mask.getSimpleValueType();
if (MaskVT.isFixedLengthVector()) {
Member Author

remove the check here as pointed out by @lukel97 in another PR

MVT MaskVT = Mask.getSimpleValueType();
if (MaskVT.isFixedLengthVector()) {
Member Author

similarly, remove the check here

Contributor

@lukel97 lukel97 left a comment

LGTM, just left some nits

Comment on lines 23923 to 23924
Value *StoreMask = ConstantVector::getSplat(
VTy->getElementCount(), ConstantInt::getTrue(SVI->getContext()));
Contributor

For consistency should we also use Builder.getAllOnesMask here?

Member Author

Oh right, I changed one place but forgot about the other occurrences. It's fixed now.
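For reference, a minimal sketch of the cleanup being discussed in this thread, assuming the usual LLVM C++ APIs; the function names below are illustrative and not the ones from the actual patch:

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// The hand-rolled splat of i1 true used before the review comment...
static Value *allOnesMaskBySplat(FixedVectorType *VTy, LLVMContext &Ctx) {
  return ConstantVector::getSplat(VTy->getElementCount(),
                                  ConstantInt::getTrue(Ctx));
}

// ...and the equivalent form via the IRBuilder helper suggested above.
static Value *allOnesMaskByBuilder(FixedVectorType *VTy,
                                   IRBuilder<> &Builder) {
  return Builder.getAllOnesMask(VTy->getElementCount());
}

Both produce the same <N x i1> all-ones constant; the helper just keeps the call sites shorter and consistent.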

Comment on lines 23955 to 23956
Value *Mask = ConstantVector::getSplat(
FVTy->getElementCount(), ConstantInt::getTrue(LI->getContext()));
Contributor

Same here

Member Author

Fixed

CallInst *VlsegN = Builder.CreateIntrinsic(
FixedVlsegIntrIds[Factor - 2], {VTy, LI->getPointerOperandType(), XLenTy},
{LI->getPointerOperand(), VL});
// All-ones mask.
Contributor

Nit, I don't think we need the comment since the builder function explains it in the name

Member Author

Fixed


SmallVector<Value *, 10> Ops(InterleaveValues);
Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements());
Ops.append({SI->getPointerOperand(), VL});
// All-ones mask.
Value *Mask = ConstantVector::getSplat(
Contributor

Same here about using Builder.getAllOnesMask

Member Author

Fixed

@mshockwave mshockwave merged commit 808a5f1 into llvm:main May 8, 2025
6 of 9 checks passed
@mshockwave mshockwave deleted the patch/riscv/remove-unmasked-segX branch May 8, 2025 16:27
@hvdijk
Contributor

hvdijk commented May 12, 2025

Note that riscv.segN.load/store.mask does not take pointer type (i.e. address space) as part of its overloading type signature, because RISC-V doesn't really use address spaces other than the default one.

When user code does use address spaces expecting the backend to ignore them, the fact that the masked intrinsics don't take pointer type now causes LLVM to assert when these are used unchecked (/home/harald/llvm-project/main/llvm/lib/IR/Instructions.cpp:754: void llvm::CallInst::init(llvm::FunctionType*, llvm::Value*, llvm::ArrayRef<llvm::Value*>, llvm::ArrayRef<llvm::OperandBundleDefT<llvm::Value*> >, const llvm::Twine&): Assertion `(i >= FTy->getNumParams() || FTy->getParamType(i) == Args[i]->getType()) && "Calling a function with a bad signature!"' failed.). We cannot just assume that LLVM IR does not use address spaces, even if the backend does not care about them.

Back when I had submitted #91573 for a similar issue elsewhere, it had been said that we should update the intrinsics to take pointer type. Should we include a similar check here to skip non-default address spaces so that we don't crash, or should we take this as a good opportunity to add proper support for non-default address spaces?

@lukel97
Contributor

lukel97 commented May 12, 2025

Back when I had submitted #91573 for a similar issue elsewhere, it had been said that we should update the intrinsics to take pointer type. Should we include a similar check here to skip non-default address spaces so that we don't crash, or should we take this as a good opportunity to add proper support for non-default address spaces?

Argh thanks for the reminder, sorry that this ended up being broken again.

But yes, I think we should do both. We should move the address space check in RISCVTargetLowering::isLegalInterleavedAccessType to bail for fixed-length vectors to stop the crashing for now.

What do we need to do to support non-default address spaces? If we're just "expecting the backend to ignore them", then is it just a matter of adding pointer type overloads to the memory intrinsics?
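For context, a rough sketch of the kind of stopgap check described above (bailing out when the pointer lives in a non-default address space); the helper name and its placement are assumptions for illustration, not the fix that actually landed:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// The masked riscv.segN.load/store intrinsics take a plain `ptr`
// (address space 0) operand, so interleaved-access lowering would need to
// bail out for anything else to avoid the call-signature assertion.
static bool hasDefaultAddressSpace(const Value *Ptr) {
  return cast<PointerType>(Ptr->getType())->getAddressSpace() == 0;
}

A check like this in RISCVTargetLowering::isLegalInterleavedAccessType (or at its call sites) would make the lowering fall back to the generic path instead of asserting.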

@hvdijk
Contributor

hvdijk commented May 12, 2025

If we're just "expecting the backend to ignore them", then is it just a matter of adding pointer type overloads to the memory intrinsics?

Yes, it's just a matter of updating llvm_ptr_ty to llvm_anyptr_ty and the associated IntrinsicTypes, and updating the tests. It's simple enough but it's a large number of changes. I'm trying to get something working for that, hopefully I will be able to open a PR soon enough that we can skip the stopgap measure to avoid the crashes.
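On the C++ side, a hedged sketch of what a call site could look like once the pointer operand is overloaded (the direction #139634 takes); the helper and parameter names are placeholders, and the overload order is an assumption modeled on the pre-existing unmasked form:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// Hypothetical: with an overloaded pointer operand, the pointer's own type
// (and therefore its address space) becomes part of the intrinsic
// signature instead of being fixed to addrspace(0).
static CallInst *emitMaskedSegLoad(IRBuilder<> &Builder,
                                   Intrinsic::ID SegLoadID,
                                   FixedVectorType *VTy, Type *XLenTy,
                                   Value *Ptr, Value *VL) {
  return Builder.CreateIntrinsic(
      SegLoadID, {VTy, Ptr->getType(), XLenTy},
      {Ptr, Builder.getAllOnesMask(VTy->getElementCount()), VL});
}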

@mshockwave
Member Author

mshockwave commented May 12, 2025

Note that riscv.segN.load/store.mask does not take pointer type (i.e. address space) as part of its overloading type signature, because RISC-V doesn't really use address spaces other than the default one.

When user code does use address spaces expecting the backend to ignore them, the fact that the masked intrinsics don't take pointer type now causes LLVM to assert when these are used unchecked (/home/harald/llvm-project/main/llvm/lib/IR/Instructions.cpp:754: void llvm::CallInst::init(llvm::FunctionType*, llvm::Value*, llvm::ArrayRef<llvm::Value*>, llvm::ArrayRef<llvm::OperandBundleDefT<llvm::Value*> >, const llvm::Twine&): Assertion `(i >= FTy->getNumParams() || FTy->getParamType(i) == Args[i]->getType()) && "Calling a function with a bad signature!"' failed.). We cannot just assume that LLVM IR does not use address spaces, even if the backend does not care about them.

A good point, thanks for bringing this up!

Back when I had submitted #91573 for a similar issue elsewhere, it had been said that we should update the intrinsics to take pointer type. Should we include a similar check here to skip non-default address spaces so that we don't crash, or should we take this as a good opportunity to add proper support for non-default address spaces?

I think there is a subtle difference between these two approaches: if users want to use the address space to do something "creative" that differs from the traditional sense of an address space -- for instance, as an annotation of coarse-grained capabilities -- then rejecting any address space other than the default one might lose some optimization opportunities. However, this won't be a problem if we ignore the address space by overloading the pointer types in these intrinsics.

Personally I'm more inclined to reject non-default address spaces until there is a strong use case of supporting these address spaces. Not only because updating intrinsics and test cases might create a big code churn, as you pointed out, but also due to the fact that new features should be driven by demand rather than the other way around.

@hvdijk
Contributor

hvdijk commented May 12, 2025

Personally I'm more inclined to reject non-default address spaces until there is a strong use case of supporting these address spaces. Not only because updating intrinsics and test cases might create a big code churn, as you pointed out, but also due to the fact that new features should be driven by demand rather than the other way around.

We're actively using this already, that's how I saw this crash in the first place. The use case is translating OpenCL code that uses those address spaces. This works on other architectures (at least X86 and AArch64) and this also used to work on RISC-V.

@mshockwave
Member Author

Personally I'm more inclined to reject non-default address spaces until there is a strong use case of supporting these address spaces. Not only because updating intrinsics and test cases might create a big code churn, as you pointed out, but also due to the fact that new features should be driven by demand rather than the other way around.

We're actively using this already, that's how I saw this crash in the first place. The use case is translating OpenCL code that uses those address spaces. This works on other architectures (at least X86 and AArch64) and this also used to work on RISC-V.

I think OpenCL's use case is fair. Just curious: my memory might be outdated, but I thought the host code (assuming the portion of the code handled by the RISC-V backend is the host code) uses the default address space, and only memory regions like private or local memory use non-default address spaces.

@hvdijk
Contributor

hvdijk commented May 12, 2025

Sorry, I should have been clearer. OpenCL device code can run on any device, it can run on CPU. We're translating OpenCL device code to run on CPUs, including RISC-V CPUs. You're right that the host code would not normally be expected to use address spaces, it's the device code using them.

lukel97 added a commit to lukel97/llvm-project that referenced this pull request May 13, 2025
This fixes a crash introduced in llvm#137045 (comment) where we don't have overloaded pointer types for segmented load/store intrinsics.

This should be temporary until llvm#139634 lands and overloads the pointer type for these
lukel97 added a commit that referenced this pull request May 13, 2025
…139698)

This fixes a crash introduced in
#137045 (comment)
where we don't have overloaded pointer types for segmented load/store
intrinsics.

This should be temporary until #139634 lands and overloads the pointer
type for these
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request May 13, 2025
…ss spaces (#139698)

This fixes a crash introduced in
llvm/llvm-project#137045 (comment)
where we don't have overloaded pointer types for segmented load/store
intrinsics.

This should be temporary until #139634 lands and overloads the pointer
type for these