author     Dimitry Andric <dim@FreeBSD.org>    2020-07-31 22:12:34 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2020-07-31 22:12:34 +0000
commit     899468a0006db4146d9b229234a183f499f7bcd2
tree       12efffe77267e4f6ff3b870da0e01dea0462993d
parent     2cedf089162d7ff08af2fb09e4adea44608e7233
Vendor import of llvm-project branch release/11.x
llvmorg-11.0.0-rc1-25-g903c872b169.
Notes:
svn path=/vendor/llvm-project/release-11.x/; revision=363744
svn path=/vendor/llvm-project/llvmorg-11.0.0-rc1-25-g903c872b169/; revision=363745; tag=vendor/llvm-project/llvmorg-11.0.0-rc1-25-g903c872b169
36 files changed, 1265 insertions, 234 deletions
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 8bf605e5e76b..533c5b1f6ff0 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -2244,7 +2244,11 @@ OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, [](OpenMPDirectiveKind K) { return isOpenMPTaskingDirective(K); }, Level)) { bool IsTriviallyCopyable = - D->getType().getNonReferenceType().isTriviallyCopyableType(Context); + D->getType().getNonReferenceType().isTriviallyCopyableType(Context) && + !D->getType() + .getNonReferenceType() + .getCanonicalType() + ->getAsCXXRecordDecl(); OpenMPDirectiveKind DKind = DSAStack->getDirective(Level); SmallVector<OpenMPDirectiveKind, 4> CaptureRegions; getOpenMPCaptureRegions(CaptureRegions, DKind); diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h index 72d826b8bd17..7c439176f3a4 100644 --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -140,6 +140,7 @@ struct Configuration { bool safeSEH = false; Symbol *sehTable = nullptr; Symbol *sehCount = nullptr; + bool noSEH = false; // Used for /opt:lldlto=N unsigned ltoo = 2; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 7372505bb616..9ceccef86779 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -1700,9 +1700,10 @@ void LinkerDriver::link(ArrayRef<const char *> argsArr) { config->wordsize = config->is64() ? 8 : 4; // Handle /safeseh, x86 only, on by default, except for mingw. - if (config->machine == I386 && - args.hasFlag(OPT_safeseh, OPT_safeseh_no, !config->mingw)) - config->safeSEH = true; + if (config->machine == I386) { + config->safeSEH = args.hasFlag(OPT_safeseh, OPT_safeseh_no, !config->mingw); + config->noSEH = args.hasArg(OPT_noseh); + } // Handle /functionpadmin for (auto *arg : args.filtered(OPT_functionpadmin, OPT_functionpadmin_opt)) diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 0adc2b91bd99..4346b3a2ffa7 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -348,13 +348,13 @@ void ObjFile::recordPrevailingSymbolForMingw( // of the section chunk we actually include instead of discarding it, // add the symbol to a map to allow using it for implicitly // associating .[px]data$<func> sections to it. + // Use the suffix from the .text$<func> instead of the leader symbol + // name, for cases where the names differ (i386 mangling/decorations, + // cases where the leader is a weak symbol named .weak.func.default*). 
int32_t sectionNumber = sym.getSectionNumber(); SectionChunk *sc = sparseChunks[sectionNumber]; if (sc && sc->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE) { - StringRef name; - name = check(coffObj->getSymbolName(sym)); - if (getMachineType() == I386) - name.consume_front("_"); + StringRef name = sc->getSectionName().split('$').second; prevailingSectionMap[name] = sectionNumber; } } diff --git a/lld/COFF/MinGW.cpp b/lld/COFF/MinGW.cpp index bded985f04d0..e24cdca6ee34 100644 --- a/lld/COFF/MinGW.cpp +++ b/lld/COFF/MinGW.cpp @@ -34,6 +34,11 @@ AutoExporter::AutoExporter() { "libclang_rt.builtins-arm", "libclang_rt.builtins-i386", "libclang_rt.builtins-x86_64", + "libclang_rt.profile", + "libclang_rt.profile-aarch64", + "libclang_rt.profile-arm", + "libclang_rt.profile-i386", + "libclang_rt.profile-x86_64", "libc++", "libc++abi", "libunwind", @@ -57,6 +62,10 @@ AutoExporter::AutoExporter() { "__builtin_", // Artificial symbols such as .refptr ".", + // profile generate symbols + "__profc_", + "__profd_", + "__profvp_", }; excludeSymbolSuffixes = { diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td index 212879e1d60b..087d53b5d2dd 100644 --- a/lld/COFF/Options.td +++ b/lld/COFF/Options.td @@ -204,6 +204,7 @@ def include_optional : Joined<["/", "-", "/?", "-?"], "includeoptional:">, HelpText<"Add symbol as undefined, but allow it to remain undefined">; def kill_at : F<"kill-at">; def lldmingw : F<"lldmingw">; +def noseh : F<"noseh">; def output_def : Joined<["/", "-", "/?", "-?"], "output-def:">; def pdb_source_path : P<"pdbsourcepath", "Base path used to make relative source file path absolute in PDB">; diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 3bcc1777f7ac..082de5b8c1d6 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -1393,7 +1393,7 @@ template <typename PEHeaderTy> void Writer::writeHeader() { pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_GUARD_CF; if (config->integrityCheck) pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_FORCE_INTEGRITY; - if (setNoSEHCharacteristic) + if (setNoSEHCharacteristic || config->noSEH) pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_NO_SEH; if (config->terminalServerAware) pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE; diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index c3a11b199675..d6580430daf7 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -134,6 +134,12 @@ public: /// was called). virtual unsigned getStackAlignmentSkew(const MachineFunction &MF) const; + /// This method returns whether or not it is safe for an object with the + /// given stack id to be bundled into the local area. + virtual bool isStackIdSafeForLocalArea(unsigned StackId) const { + return true; + } + /// getOffsetOfLocalArea - This method returns the offset of the local area /// from the stack pointer on entrance to a function. 
/// diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 74664098ce1d..33f122728d2a 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1648,8 +1648,32 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize, } SmallVector<Value *, 4> V1Srcs; + // For a recursive phi, that recurses through a contant gep, we can perform + // aliasing calculations using the other phi operands with an unknown size to + // specify that an unknown number of elements after the initial value are + // potentially accessed. bool isRecursive = false; - if (PV) { + auto CheckForRecPhi = [&](Value *PV) { + if (!EnableRecPhiAnalysis) + return false; + if (GEPOperator *PVGEP = dyn_cast<GEPOperator>(PV)) { + // Check whether the incoming value is a GEP that advances the pointer + // result of this PHI node (e.g. in a loop). If this is the case, we + // would recurse and always get a MayAlias. Handle this case specially + // below. We need to ensure that the phi is inbounds and has a constant + // positive operand so that we can check for alias with the initial value + // and an unknown but positive size. + if (PVGEP->getPointerOperand() == PN && PVGEP->isInBounds() && + PVGEP->getNumIndices() == 1 && isa<ConstantInt>(PVGEP->idx_begin()) && + !cast<ConstantInt>(PVGEP->idx_begin())->isNegative()) { + isRecursive = true; + return true; + } + } + return false; + }; + + if (PV) { // If we have PhiValues then use it to get the underlying phi values. const PhiValues::ValueSet &PhiValueSet = PV->getValuesForPhi(PN); // If we have more phi values than the search depth then return MayAlias @@ -1660,19 +1684,8 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize, return MayAlias; // Add the values to V1Srcs for (Value *PV1 : PhiValueSet) { - if (EnableRecPhiAnalysis) { - if (GEPOperator *PV1GEP = dyn_cast<GEPOperator>(PV1)) { - // Check whether the incoming value is a GEP that advances the pointer - // result of this PHI node (e.g. in a loop). If this is the case, we - // would recurse and always get a MayAlias. Handle this case specially - // below. - if (PV1GEP->getPointerOperand() == PN && PV1GEP->getNumIndices() == 1 && - isa<ConstantInt>(PV1GEP->idx_begin())) { - isRecursive = true; - continue; - } - } - } + if (CheckForRecPhi(PV1)) + continue; V1Srcs.push_back(PV1); } } else { @@ -1687,18 +1700,8 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize, // and 'n' are the number of PHI sources. return MayAlias; - if (EnableRecPhiAnalysis) - if (GEPOperator *PV1GEP = dyn_cast<GEPOperator>(PV1)) { - // Check whether the incoming value is a GEP that advances the pointer - // result of this PHI node (e.g. in a loop). If this is the case, we - // would recurse and always get a MayAlias. Handle this case specially - // below. 
- if (PV1GEP->getPointerOperand() == PN && PV1GEP->getNumIndices() == 1 && - isa<ConstantInt>(PV1GEP->idx_begin())) { - isRecursive = true; - continue; - } - } + if (CheckForRecPhi(PV1)) + continue; if (UniqueSrc.insert(PV1).second) V1Srcs.push_back(PV1); diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 6c5ef0255a08..204fb556d810 100644 --- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -220,6 +220,8 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { continue; if (StackProtectorFI == (int)i) continue; + if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i))) + continue; switch (MFI.getObjectSSPLayout(i)) { case MachineFrameInfo::SSPLK_None: @@ -254,6 +256,8 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { continue; if (ProtectedObjs.count(i)) continue; + if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i))) + continue; AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign); } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f14b3dba4f31..ec384d2a7c56 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11372,9 +11372,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // Stop if more than one members are non-undef. if (NumDefs > 1) break; + VTs.push_back(EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - X.getValueType().getVectorNumElements())); + X.getValueType().getVectorElementCount())); } if (NumDefs == 0) @@ -18795,6 +18796,11 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); EVT OpVT = N->getOperand(0).getValueType(); + + // We currently can't generate an appropriate shuffle for a scalable vector. + if (VT.isScalableVector()) + return SDValue(); + int NumElts = VT.getVectorNumElements(); int NumOpElts = OpVT.getVectorNumElements(); @@ -19055,11 +19061,14 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { return V; // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR - // nodes often generate nop CONCAT_VECTOR nodes. - // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that - // place the incoming vectors at the exact same location. + // nodes often generate nop CONCAT_VECTOR nodes. Scan the CONCAT_VECTOR + // operands and look for a CONCAT operations that place the incoming vectors + // at the exact same location. + // + // For scalable vectors, EXTRACT_SUBVECTOR indexes are implicitly scaled. SDValue SingleSource = SDValue(); - unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements(); + unsigned PartNumElem = + N->getOperand(0).getValueType().getVectorMinNumElements(); for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { SDValue Op = N->getOperand(i); @@ -19181,7 +19190,10 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { // The binop must be a vector type, so we can extract some fraction of it. EVT WideBVT = BinOp.getValueType(); - if (!WideBVT.isVector()) + // The optimisations below currently assume we are dealing with fixed length + // vectors. It is possible to add support for scalable vectors, but at the + // moment we've done no analysis to prove whether they are profitable or not. 
+ if (!WideBVT.isFixedLengthVector()) return SDValue(); EVT VT = Extract->getValueType(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 414ba25ffd5f..c81d03cac81b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2151,7 +2151,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) { EVT InVT = Lo.getValueType(); EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), - InVT.getVectorNumElements()); + InVT.getVectorElementCount()); if (N->isStrictFPOpcode()) { Lo = DAG.getNode(N->getOpcode(), dl, { OutVT, MVT::Other }, @@ -2197,13 +2197,19 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) { SDValue Idx = N->getOperand(1); SDLoc dl(N); SDValue Lo, Hi; + + if (SubVT.isScalableVector() != + N->getOperand(0).getValueType().isScalableVector()) + report_fatal_error("Extracting a fixed-length vector from an illegal " + "scalable vector is not yet supported"); + GetSplitVector(N->getOperand(0), Lo, Hi); - uint64_t LoElts = Lo.getValueType().getVectorNumElements(); + uint64_t LoElts = Lo.getValueType().getVectorMinNumElements(); uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); if (IdxVal < LoElts) { - assert(IdxVal + SubVT.getVectorNumElements() <= LoElts && + assert(IdxVal + SubVT.getVectorMinNumElements() <= LoElts && "Extracted subvector crosses vector split!"); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx); } else { @@ -2559,13 +2565,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) { SDValue InVec = N->getOperand(OpNo); EVT InVT = InVec->getValueType(0); EVT OutVT = N->getValueType(0); - unsigned NumElements = OutVT.getVectorNumElements(); + ElementCount NumElements = OutVT.getVectorElementCount(); bool IsFloat = OutVT.isFloatingPoint(); - // Widening should have already made sure this is a power-two vector - // if we're trying to split it at all. assert() that's true, just in case. - assert(!(NumElements & 1) && "Splitting vector, but not in half!"); - unsigned InElementSize = InVT.getScalarSizeInBits(); unsigned OutElementSize = OutVT.getScalarSizeInBits(); @@ -2595,6 +2597,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) { GetSplitVector(InVec, InLoVec, InHiVec); // Truncate them to 1/2 the element size. + // + // This assumes the number of elements is a power of two; any vector that + // isn't should be widened, not split. EVT HalfElementVT = IsFloat ? EVT::getFloatingPointVT(InElementSize/2) : EVT::getIntegerVT(*DAG.getContext(), InElementSize/2); @@ -3605,16 +3610,15 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { EVT InVT = N->getOperand(0).getValueType(); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); - unsigned WidenNumElts = WidenVT.getVectorNumElements(); - unsigned NumInElts = InVT.getVectorNumElements(); unsigned NumOperands = N->getNumOperands(); bool InputWidened = false; // Indicates we need to widen the input. if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) { - if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) { + unsigned WidenNumElts = WidenVT.getVectorMinNumElements(); + unsigned NumInElts = InVT.getVectorMinNumElements(); + if (WidenNumElts % NumInElts == 0) { // Add undef vectors to widen to correct length. 
- unsigned NumConcat = WidenVT.getVectorNumElements() / - InVT.getVectorNumElements(); + unsigned NumConcat = WidenNumElts / NumInElts; SDValue UndefVal = DAG.getUNDEF(InVT); SmallVector<SDValue, 16> Ops(NumConcat); for (unsigned i=0; i < NumOperands; ++i) @@ -3638,6 +3642,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { return GetWidenedVector(N->getOperand(0)); if (NumOperands == 2) { + assert(!WidenVT.isScalableVector() && + "Cannot use vector shuffles to widen CONCAT_VECTOR result"); + unsigned WidenNumElts = WidenVT.getVectorNumElements(); + unsigned NumInElts = InVT.getVectorNumElements(); + // Replace concat of two operands with a shuffle. SmallVector<int, 16> MaskOps(WidenNumElts, -1); for (unsigned i = 0; i < NumInElts; ++i) { @@ -3652,6 +3661,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { } } + assert(!WidenVT.isScalableVector() && + "Cannot use build vectors to widen CONCAT_VECTOR result"); + unsigned WidenNumElts = WidenVT.getVectorNumElements(); + unsigned NumInElts = InVT.getVectorNumElements(); + // Fall back to use extracts and build vector. EVT EltVT = WidenVT.getVectorElementType(); SmallVector<SDValue, 16> Ops(WidenNumElts); @@ -4913,7 +4927,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain, int LdWidth = LdVT.getSizeInBits(); int WidthDiff = WidenWidth - LdWidth; - // Allow wider loads. + // Allow wider loads if they are sufficiently aligned to avoid memory faults + // and if the original load is simple. unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment(); // Find the vector type that can load from. @@ -4965,19 +4980,6 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain, LD->getPointerInfo().getWithOffset(Offset), LD->getOriginalAlign(), MMOFlags, AAInfo); LdChain.push_back(L.getValue(1)); - if (L->getValueType(0).isVector() && NewVTWidth >= LdWidth) { - // Later code assumes the vector loads produced will be mergeable, so we - // must pad the final entry up to the previous width. Scalars are - // combined separately. - SmallVector<SDValue, 16> Loads; - Loads.push_back(L); - unsigned size = L->getValueSizeInBits(0); - while (size < LdOp->getValueSizeInBits(0)) { - Loads.push_back(DAG.getUNDEF(L->getValueType(0))); - size += L->getValueSizeInBits(0); - } - L = DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0), Loads); - } } else { L = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo().getWithOffset(Offset), @@ -5018,8 +5020,17 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain, EVT NewLdTy = LdOps[i].getValueType(); if (NewLdTy != LdTy) { // Create a larger vector. 
+ unsigned NumOps = NewLdTy.getSizeInBits() / LdTy.getSizeInBits(); + assert(NewLdTy.getSizeInBits() % LdTy.getSizeInBits() == 0); + SmallVector<SDValue, 16> WidenOps(NumOps); + unsigned j = 0; + for (; j != End-Idx; ++j) + WidenOps[j] = ConcatOps[Idx+j]; + for (; j != NumOps; ++j) + WidenOps[j] = DAG.getUNDEF(LdTy); + ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy, - makeArrayRef(&ConcatOps[Idx], End - Idx)); + WidenOps); Idx = End - 1; LdTy = NewLdTy; } diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index 4796ef531054..8e7bf1eb0169 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -375,6 +375,7 @@ void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &MCSym, COFFSymbol *Local = nullptr; if (cast<MCSymbolCOFF>(MCSym).isWeakExternal()) { Sym->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL; + Sym->Section = nullptr; COFFSymbol *WeakDefault = getLinkedSymbol(MCSym); if (!WeakDefault) { diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index efa3fd5ca9ce..4789a9f02937 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1192,7 +1192,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Process the SVE callee-saves to determine what space needs to be // allocated. - if (AFI->getSVECalleeSavedStackSize()) { + if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { // Find callee save instructions in frame. CalleeSavesBegin = MBBI; assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); @@ -1200,11 +1200,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, ++MBBI; CalleeSavesEnd = MBBI; - int64_t OffsetToFirstCalleeSaveFromSP = - MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex()); - StackOffset OffsetToCalleeSavesFromSP = - StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize; - AllocateBefore -= OffsetToCalleeSavesFromSP; + AllocateBefore = {CalleeSavedSize, MVT::nxv1i8}; AllocateAfter = SVEStackSize - AllocateBefore; } @@ -1582,7 +1578,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // deallocated. StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI; - if (AFI->getSVECalleeSavedStackSize()) { + if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { RestoreBegin = std::prev(RestoreEnd);; while (IsSVECalleeSave(RestoreBegin) && RestoreBegin != MBB.begin()) @@ -1592,23 +1588,21 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, assert(IsSVECalleeSave(RestoreBegin) && IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction"); - int64_t OffsetToFirstCalleeSaveFromSP = - MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex()); - StackOffset OffsetToCalleeSavesFromSP = - StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize; - DeallocateBefore = OffsetToCalleeSavesFromSP; - DeallocateAfter = SVEStackSize - DeallocateBefore; + StackOffset CalleeSavedSizeAsOffset = {CalleeSavedSize, MVT::nxv1i8}; + DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; + DeallocateAfter = CalleeSavedSizeAsOffset; } // Deallocate the SVE area. if (SVEStackSize) { if (AFI->isStackRealigned()) { - if (AFI->getSVECalleeSavedStackSize()) - // Set SP to start of SVE area, from which the callee-save reloads - // can be done. 
The code below will deallocate the stack space + if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) + // Set SP to start of SVE callee-save area from which they can + // be reloaded. The code below will deallocate the stack space // space by moving FP -> SP. emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP, - -SVEStackSize, TII, MachineInstr::FrameDestroy); + {-CalleeSavedSize, MVT::nxv1i8}, TII, + MachineInstr::FrameDestroy); } else { if (AFI->getSVECalleeSavedStackSize()) { // Deallocate the non-SVE locals first before we can deallocate (and @@ -2595,25 +2589,23 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex, bool AssignOffsets) { +#ifndef NDEBUG // First process all fixed stack objects. - int64_t Offset = 0; for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) - if (MFI.getStackID(I) == TargetStackID::SVEVector) { - int64_t FixedOffset = -MFI.getObjectOffset(I); - if (FixedOffset > Offset) - Offset = FixedOffset; - } + assert(MFI.getStackID(I) != TargetStackID::SVEVector && + "SVE vectors should never be passed on the stack by value, only by " + "reference."); +#endif auto Assign = [&MFI](int FI, int64_t Offset) { LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n"); MFI.setObjectOffset(FI, Offset); }; + int64_t Offset = 0; + // Then process all callee saved slots. if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { - // Make sure to align the last callee save slot. - MFI.setObjectAlignment(MaxCSFrameIndex, Align(16)); - // Assign offsets to the callee save slots. for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) { Offset += MFI.getObjectSize(I); @@ -2623,6 +2615,9 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, } } + // Ensure that the Callee-save area is aligned to 16bytes. + Offset = alignTo(Offset, Align(16U)); + // Create a buffer of SVE objects to allocate and sort it. SmallVector<int, 8> ObjectsToAllocate; for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 9d0a6d9eaf25..444740cb50ab 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -105,6 +105,12 @@ public: } } + bool isStackIdSafeForLocalArea(unsigned StackId) const override { + // We don't support putting SVE objects into the pre-allocated local + // frame block at the moment. + return StackId != TargetStackID::SVEVector; + } + private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, uint64_t StackBumpBytes) const; diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 10c477853353..7799ebfbd68e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -245,7 +245,8 @@ public: unsigned SubRegIdx); void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc); + void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale, + unsigned Opc_rr, unsigned Opc_ri); bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm); /// SVE Reg+Imm addressing mode. 
@@ -262,14 +263,12 @@ public: void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - template <unsigned Scale> - void SelectPredicatedStore(SDNode *N, unsigned NumVecs, const unsigned Opc_rr, - const unsigned Opc_ri); - template <unsigned Scale> + void SelectPredicatedStore(SDNode *N, unsigned NumVecs, unsigned Scale, + unsigned Opc_rr, unsigned Opc_ri); std::tuple<unsigned, SDValue, SDValue> - findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, - const unsigned Opc_ri, const SDValue &OldBase, - const SDValue &OldOffset); + findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, unsigned Opc_ri, + const SDValue &OldBase, const SDValue &OldOffset, + unsigned Scale); bool tryBitfieldExtractOp(SDNode *N); bool tryBitfieldExtractOpFromSExt(SDNode *N); @@ -1414,12 +1413,12 @@ void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, /// Optimize \param OldBase and \param OldOffset selecting the best addressing /// mode. Returns a tuple consisting of an Opcode, an SDValue representing the /// new Base and an SDValue representing the new offset. -template <unsigned Scale> std::tuple<unsigned, SDValue, SDValue> -AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, - const unsigned Opc_ri, +AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, + unsigned Opc_ri, const SDValue &OldBase, - const SDValue &OldOffset) { + const SDValue &OldOffset, + unsigned Scale) { SDValue NewBase = OldBase; SDValue NewOffset = OldOffset; // Detect a possible Reg+Imm addressing mode. @@ -1429,21 +1428,30 @@ AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, // Detect a possible reg+reg addressing mode, but only if we haven't already // detected a Reg+Imm one. const bool IsRegReg = - !IsRegImm && SelectSVERegRegAddrMode<Scale>(OldBase, NewBase, NewOffset); + !IsRegImm && SelectSVERegRegAddrMode(OldBase, Scale, NewBase, NewOffset); // Select the instruction. return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset); } void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs, - const unsigned Opc) { + unsigned Scale, unsigned Opc_ri, + unsigned Opc_rr) { + assert(Scale < 4 && "Invalid scaling value."); SDLoc DL(N); EVT VT = N->getValueType(0); SDValue Chain = N->getOperand(0); + // Optimize addressing mode. + SDValue Base, Offset; + unsigned Opc; + std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore( + N, Opc_rr, Opc_ri, N->getOperand(2), + CurDAG->getTargetConstant(0, DL, MVT::i64), Scale); + SDValue Ops[] = {N->getOperand(1), // Predicate - N->getOperand(2), // Memory operand - CurDAG->getTargetConstant(0, DL, MVT::i64), Chain}; + Base, // Memory operand + Offset, Chain}; const EVT ResTys[] = {MVT::Untyped, MVT::Other}; @@ -1479,10 +1487,9 @@ void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, ReplaceNode(N, St); } -template <unsigned Scale> void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs, - const unsigned Opc_rr, - const unsigned Opc_ri) { + unsigned Scale, unsigned Opc_rr, + unsigned Opc_ri) { SDLoc dl(N); // Form a REG_SEQUENCE to force register allocation. @@ -1492,9 +1499,9 @@ void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs, // Optimize addressing mode. 
unsigned Opc; SDValue Offset, Base; - std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore<Scale>( + std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore( N, Opc_rr, Opc_ri, N->getOperand(NumVecs + 3), - CurDAG->getTargetConstant(0, dl, MVT::i64)); + CurDAG->getTargetConstant(0, dl, MVT::i64), Scale); SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), // predicate Base, // address @@ -4085,63 +4092,51 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_st2: { if (VT == MVT::nxv16i8) { - SelectPredicatedStore</*Scale=*/0>(Node, 2, AArch64::ST2B, - AArch64::ST2B_IMM); + SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedStore</*Scale=*/1>(Node, 2, AArch64::ST2H, - AArch64::ST2H_IMM); + SelectPredicatedStore(Node, 2, 1, AArch64::ST2H, AArch64::ST2H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedStore</*Scale=*/2>(Node, 2, AArch64::ST2W, - AArch64::ST2W_IMM); + SelectPredicatedStore(Node, 2, 2, AArch64::ST2W, AArch64::ST2W_IMM); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedStore</*Scale=*/3>(Node, 2, AArch64::ST2D, - AArch64::ST2D_IMM); + SelectPredicatedStore(Node, 2, 3, AArch64::ST2D, AArch64::ST2D_IMM); return; } break; } case Intrinsic::aarch64_sve_st3: { if (VT == MVT::nxv16i8) { - SelectPredicatedStore</*Scale=*/0>(Node, 3, AArch64::ST3B, - AArch64::ST3B_IMM); + SelectPredicatedStore(Node, 3, 0, AArch64::ST3B, AArch64::ST3B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedStore</*Scale=*/1>(Node, 3, AArch64::ST3H, - AArch64::ST3H_IMM); + SelectPredicatedStore(Node, 3, 1, AArch64::ST3H, AArch64::ST3H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedStore</*Scale=*/2>(Node, 3, AArch64::ST3W, - AArch64::ST3W_IMM); + SelectPredicatedStore(Node, 3, 2, AArch64::ST3W, AArch64::ST3W_IMM); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedStore</*Scale=*/3>(Node, 3, AArch64::ST3D, - AArch64::ST3D_IMM); + SelectPredicatedStore(Node, 3, 3, AArch64::ST3D, AArch64::ST3D_IMM); return; } break; } case Intrinsic::aarch64_sve_st4: { if (VT == MVT::nxv16i8) { - SelectPredicatedStore</*Scale=*/0>(Node, 4, AArch64::ST4B, - AArch64::ST4B_IMM); + SelectPredicatedStore(Node, 4, 0, AArch64::ST4B, AArch64::ST4B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedStore</*Scale=*/1>(Node, 4, AArch64::ST4H, - AArch64::ST4H_IMM); + SelectPredicatedStore(Node, 4, 1, AArch64::ST4H, AArch64::ST4H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedStore</*Scale=*/2>(Node, 4, AArch64::ST4W, - AArch64::ST4W_IMM); + SelectPredicatedStore(Node, 4, 2, AArch64::ST4W, AArch64::ST4W_IMM); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedStore</*Scale=*/3>(Node, 4, AArch64::ST4D, - AArch64::ST4D_IMM); + SelectPredicatedStore(Node, 4, 3, AArch64::ST4D, AArch64::ST4D_IMM); return; } break; @@ -4741,51 +4736,51 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case AArch64ISD::SVE_LD2_MERGE_ZERO: { if (VT == MVT::nxv16i8) { - SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM); + SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B); return; } else if (VT == 
MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM); + SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM); + SelectPredicatedLoad(Node, 2, 2, AArch64::LD2W_IMM, AArch64::LD2W); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM); + SelectPredicatedLoad(Node, 2, 3, AArch64::LD2D_IMM, AArch64::LD2D); return; } break; } case AArch64ISD::SVE_LD3_MERGE_ZERO: { if (VT == MVT::nxv16i8) { - SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM); + SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM); + SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM); + SelectPredicatedLoad(Node, 3, 2, AArch64::LD3W_IMM, AArch64::LD3W); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM); + SelectPredicatedLoad(Node, 3, 3, AArch64::LD3D_IMM, AArch64::LD3D); return; } break; } case AArch64ISD::SVE_LD4_MERGE_ZERO: { if (VT == MVT::nxv16i8) { - SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM); + SelectPredicatedLoad(Node, 4, 0, AArch64::LD4B_IMM, AArch64::LD4B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM); + SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM); + SelectPredicatedLoad(Node, 4, 2, AArch64::LD4W_IMM, AArch64::LD4W); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM); + SelectPredicatedLoad(Node, 4, 3, AArch64::LD4D_IMM, AArch64::LD4D); return; } break; @@ -4805,10 +4800,14 @@ FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM, /// When \p PredVT is a scalable vector predicate in the form /// MVT::nx<M>xi1, it builds the correspondent scalable vector of -/// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. If the input +/// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. When targeting +/// structured vectors (NumVec >1), the output data type is +/// MVT::nx<M*NumVec>xi<bits> s.t. M x bits = 128. If the input /// PredVT is not in the form MVT::nx<M>xi1, it returns an invalid /// EVT. 
-static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) { +static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT, + unsigned NumVec) { + assert(NumVec > 0 && NumVec < 5 && "Invalid number of vectors."); if (!PredVT.isScalableVector() || PredVT.getVectorElementType() != MVT::i1) return EVT(); @@ -4818,7 +4817,8 @@ static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) { ElementCount EC = PredVT.getVectorElementCount(); EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min); - EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC); + EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC * NumVec); + return MemVT; } @@ -4842,6 +4842,15 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) { return cast<VTSDNode>(Root->getOperand(3))->getVT(); case AArch64ISD::ST1_PRED: return cast<VTSDNode>(Root->getOperand(4))->getVT(); + case AArch64ISD::SVE_LD2_MERGE_ZERO: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/2); + case AArch64ISD::SVE_LD3_MERGE_ZERO: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/3); + case AArch64ISD::SVE_LD4_MERGE_ZERO: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/4); default: break; } @@ -4857,7 +4866,7 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) { // We are using an SVE prefetch intrinsic. Type must be inferred // from the width of the predicate. return getPackedVectorTypeFromPredicateType( - Ctx, Root->getOperand(2)->getValueType(0)); + Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/1); } /// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 85db14ab66fe..1500da2fdfc7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -932,8 +932,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); - if (VT.getScalarType() == MVT::i1) + if (VT.getScalarType() == MVT::i1) { setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); + } } } @@ -8858,6 +8861,16 @@ SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); + if (VT.getScalarType() == MVT::i1) { + // Lower i1 truncate to `(x & 1) != 0`. 
+ SDLoc dl(Op); + EVT OpVT = Op.getOperand(0).getValueType(); + SDValue Zero = DAG.getConstant(0, dl, OpVT); + SDValue One = DAG.getConstant(1, dl, OpVT); + SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One); + return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE); + } + if (!VT.isVector() || VT.isScalableVector()) return Op; @@ -12288,6 +12301,9 @@ static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) { "Unsupported opcode."); SDLoc DL(N); EVT VT = N->getValueType(0); + if (VT == MVT::nxv8bf16 && + !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) + return SDValue(); EVT LoadVT = VT; if (VT.isFloatingPoint()) @@ -14909,6 +14925,11 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { if (isa<ScalableVectorType>(Inst.getOperand(i)->getType())) return true; + if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) { + if (isa<ScalableVectorType>(AI->getAllocatedType())) + return true; + } + return false; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 6df7970f4d82..4f4ba692c2db 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -495,6 +495,9 @@ def SImmS4XForm : SDNodeXForm<imm, [{ def SImmS16XForm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64); }]>; +def SImmS32XForm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getSExtValue() / 32, SDLoc(N), MVT::i64); +}]>; // simm6sN predicate - True if the immediate is a multiple of N in the range // [-32 * N, 31 * N]. @@ -546,7 +549,7 @@ def simm4s16 : Operand<i64>, ImmLeaf<i64, let DecoderMethod = "DecodeSImm<4>"; } def simm4s32 : Operand<i64>, ImmLeaf<i64, -[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }]> { +[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }], SImmS32XForm> { let PrintMethod = "printImmScale<32>"; let ParserMatchClass = SImm4s32Operand; let DecoderMethod = "DecodeSImm<4>"; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 5139ae5ccaf1..08f80c9aa361 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2744,6 +2744,35 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + // Copy a Z register pair by copying the individual sub-registers. + if (AArch64::ZPR2RegClass.contains(DestReg) && + AArch64::ZPR2RegClass.contains(SrcReg)) { + static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, + Indices); + return; + } + + // Copy a Z register triple by copying the individual sub-registers. + if (AArch64::ZPR3RegClass.contains(DestReg) && + AArch64::ZPR3RegClass.contains(SrcReg)) { + static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, + AArch64::zsub2}; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, + Indices); + return; + } + + // Copy a Z register quad by copying the individual sub-registers. 
+ if (AArch64::ZPR4RegClass.contains(DestReg) && + AArch64::ZPR4RegClass.contains(SrcReg)) { + static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, + AArch64::zsub2, AArch64::zsub3}; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, + Indices); + return; + } + if (AArch64::GPR64spRegClass.contains(DestReg) && (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 886158ca4490..83a488afc797 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -40,6 +40,14 @@ AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) AArch64_MC::initLLVMToCVRegMapping(this); } +static bool hasSVEArgsOrReturn(const MachineFunction *MF) { + const Function &F = MF->getFunction(); + return isa<ScalableVectorType>(F.getReturnType()) || + any_of(F.args(), [](const Argument &Arg) { + return isa<ScalableVectorType>(Arg.getType()); + }); +} + const MCPhysReg * AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); @@ -75,6 +83,8 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { // This is for OSes other than Windows; Windows is a separate case further // above. return CSR_AArch64_AAPCS_X18_SaveList; + if (hasSVEArgsOrReturn(MF)) + return CSR_AArch64_SVE_AAPCS_SaveList; return CSR_AArch64_AAPCS_SaveList; } @@ -343,6 +353,15 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { if (MFI.hasVarSizedObjects() || MF.hasEHFunclets()) { if (needsStackRealignment(MF)) return true; + + if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) { + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + // Frames that have variable sized objects and scalable SVE objects, + // should always use a basepointer. + if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE()) + return true; + } + // Conservatively estimate whether the negative offset from the frame // pointer will be sufficient to reach. If a function has a smallish // frame, it's less likely to have lots of spills and callee saved @@ -379,8 +398,15 @@ AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { // (closer to SP). // // The beginning works most reliably if we have a frame pointer. + // In the presence of any non-constant space between FP and locals, + // (e.g. in case of stack realignment or a scalable SVE area), it is + // better to use SP or BP. 
const AArch64FrameLowering &TFI = *getFrameLowering(MF); - return TFI.hasFP(MF); + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + assert((!MF.getSubtarget<AArch64Subtarget>().hasSVE() || + AFI->hasCalculatedStackSizeSVE()) && + "Expected SVE area to be calculated by this point"); + return TFI.hasFP(MF) && !needsStackRealignment(MF) && !AFI->getStackSizeSVE(); } bool AArch64RegisterInfo::requiresFrameIndexScavenging( diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 28a54e6f7d79..3449a8bd16d2 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1109,6 +1109,28 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1", AArch64trn1>; defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>; + // Extract lo/hi halves of legal predicate types. + def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))), + (ZIP1_PPP_S PPR:$Ps, (PFALSE))>; + def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))), + (ZIP2_PPP_S PPR:$Ps, (PFALSE))>; + def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))), + (ZIP1_PPP_H PPR:$Ps, (PFALSE))>; + def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))), + (ZIP2_PPP_H PPR:$Ps, (PFALSE))>; + def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), + (ZIP1_PPP_B PPR:$Ps, (PFALSE))>; + def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))), + (ZIP2_PPP_B PPR:$Ps, (PFALSE))>; + + // Concatenate two predicates. + def : Pat<(nxv4i1 (concat_vectors nxv2i1:$p1, nxv2i1:$p2)), + (UZP1_PPP_S $p1, $p2)>; + def : Pat<(nxv8i1 (concat_vectors nxv4i1:$p1, nxv4i1:$p2)), + (UZP1_PPP_H $p1, $p2)>; + def : Pat<(nxv16i1 (concat_vectors nxv8i1:$p1, nxv8i1:$p2)), + (UZP1_PPP_B $p1, $p2)>; + defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>; defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>; defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index a005d1e65abe..c56a65b9e212 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -7718,9 +7718,13 @@ multiclass sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand listty, (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>; // Base addressing mode - def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), GPR64sp:$base)), - (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, (i64 0))>; - + def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), GPR64sp:$base)), + (!cast<Instruction>(NAME) PPR3bAny:$Pg, GPR64sp:$base, (i64 0))>; + let AddedComplexity = 2 in { + // Reg + Imm addressing mode + def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), (add GPR64:$base, (i64 simm4s32:$imm)))), + (!cast<Instruction>(NAME) $Pg, $base, simm4s32:$imm)>; + } } class sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand VecList, diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 11454841cab7..5c1a4cb16568 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -9111,13 +9111,15 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { Op0.getOperand(1)); } -static const SDValue *getNormalLoadInput(const SDValue &Op) { +static const 
SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) { const SDValue *InputLoad = &Op; if (InputLoad->getOpcode() == ISD::BITCAST) InputLoad = &InputLoad->getOperand(0); if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR || - InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) + InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) { + IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED; InputLoad = &InputLoad->getOperand(0); + } if (InputLoad->getOpcode() != ISD::LOAD) return nullptr; LoadSDNode *LD = cast<LoadSDNode>(*InputLoad); @@ -9289,7 +9291,9 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, if (!BVNIsConstantSplat || SplatBitSize > 32) { - const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0)); + bool IsPermutedLoad = false; + const SDValue *InputLoad = + getNormalLoadInput(Op.getOperand(0), IsPermutedLoad); // Handle load-and-splat patterns as we have instructions that will do this // in one go. if (InputLoad && DAG.isSplatValue(Op, true)) { @@ -9912,7 +9916,8 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // If this is a load-and-splat, we can do that with a single instruction // in some cases. However if the load has multiple uses, we don't want to // combine it because that will just produce multiple loads. - const SDValue *InputLoad = getNormalLoadInput(V1); + bool IsPermutedLoad = false; + const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad); if (InputLoad && Subtarget.hasVSX() && V2.isUndef() && (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) && InputLoad->hasOneUse()) { @@ -9920,6 +9925,16 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG); + // The splat index for permuted loads will be in the left half of the vector + // which is strictly wider than the loaded value by 8 bytes. So we need to + // adjust the splat index to point to the correct address in memory. + if (IsPermutedLoad) { + assert(isLittleEndian && "Unexpected permuted load on big endian target"); + SplatIdx += IsFourByte ? 2 : 1; + assert((SplatIdx < (IsFourByte ? 4 : 2)) && + "Splat of a value outside of the loaded memory"); + } + LoadSDNode *LD = cast<LoadSDNode>(*InputLoad); // For 4-byte load-and-splat, we need Power9. if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) { @@ -9929,10 +9944,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, else Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8; - // If we are loading a partial vector, it does not make sense to adjust - // the base pointer. This happens with (splat (s_to_v_permuted (ld))). - if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64)) - Offset = 0; SDValue BasePtr = LD->getBasePtr(); if (Offset != 0) BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index a0ae05081adc..7570385e38e3 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -184,6 +184,330 @@ bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) { return false; } +// Check that it is a SLOI (Shift Left Ones Immediate). 
We first check that +// it is the right node tree: +// +// (OR (SHL RS1, VC2), VC1) +// +// and then we check that VC1, the mask used to fill with ones, is compatible +// with VC2, the shamt: +// +// VC1 == maskTrailingOnes<uint64_t>(VC2) + +bool RISCVDAGToDAGISel::SelectSLOI(SDValue N, SDValue &RS1, SDValue &Shamt) { + MVT XLenVT = Subtarget->getXLenVT(); + if (N.getOpcode() == ISD::OR) { + SDValue Or = N; + if (Or.getOperand(0).getOpcode() == ISD::SHL) { + SDValue Shl = Or.getOperand(0); + if (isa<ConstantSDNode>(Shl.getOperand(1)) && + isa<ConstantSDNode>(Or.getOperand(1))) { + if (XLenVT == MVT::i64) { + uint64_t VC1 = Or.getConstantOperandVal(1); + uint64_t VC2 = Shl.getConstantOperandVal(1); + if (VC1 == maskTrailingOnes<uint64_t>(VC2)) { + RS1 = Shl.getOperand(0); + Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), + Shl.getOperand(1).getValueType()); + return true; + } + } + if (XLenVT == MVT::i32) { + uint32_t VC1 = Or.getConstantOperandVal(1); + uint32_t VC2 = Shl.getConstantOperandVal(1); + if (VC1 == maskTrailingOnes<uint32_t>(VC2)) { + RS1 = Shl.getOperand(0); + Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), + Shl.getOperand(1).getValueType()); + return true; + } + } + } + } + } + return false; +} + +// Check that it is a SROI (Shift Right Ones Immediate). We first check that +// it is the right node tree: +// +// (OR (SRL RS1, VC2), VC1) +// +// and then we check that VC1, the mask used to fill with ones, is compatible +// with VC2, the shamt: +// +// VC1 == maskLeadingOnes<uint64_t>(VC2) + +bool RISCVDAGToDAGISel::SelectSROI(SDValue N, SDValue &RS1, SDValue &Shamt) { + MVT XLenVT = Subtarget->getXLenVT(); + if (N.getOpcode() == ISD::OR) { + SDValue Or = N; + if (Or.getOperand(0).getOpcode() == ISD::SRL) { + SDValue Srl = Or.getOperand(0); + if (isa<ConstantSDNode>(Srl.getOperand(1)) && + isa<ConstantSDNode>(Or.getOperand(1))) { + if (XLenVT == MVT::i64) { + uint64_t VC1 = Or.getConstantOperandVal(1); + uint64_t VC2 = Srl.getConstantOperandVal(1); + if (VC1 == maskLeadingOnes<uint64_t>(VC2)) { + RS1 = Srl.getOperand(0); + Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), + Srl.getOperand(1).getValueType()); + return true; + } + } + if (XLenVT == MVT::i32) { + uint32_t VC1 = Or.getConstantOperandVal(1); + uint32_t VC2 = Srl.getConstantOperandVal(1); + if (VC1 == maskLeadingOnes<uint32_t>(VC2)) { + RS1 = Srl.getOperand(0); + Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), + Srl.getOperand(1).getValueType()); + return true; + } + } + } + } + } + return false; +} + +// Check that it is a RORI (Rotate Right Immediate). We first check that +// it is the right node tree: +// +// (ROTL RS1, VC) +// +// The compiler translates immediate rotations to the right given by the call +// to the rotateright32/rotateright64 intrinsics as rotations to the left. +// Since the rotation to the left can be easily emulated as a rotation to the +// right by negating the constant, there is no encoding for ROLI. 
+// We then select these immediate left rotations as RORI, using the
+// complementary constant:
+//
+// Shamt == XLen - VC
+
+bool RISCVDAGToDAGISel::SelectRORI(SDValue N, SDValue &RS1, SDValue &Shamt) {
+  MVT XLenVT = Subtarget->getXLenVT();
+  if (N.getOpcode() == ISD::ROTL) {
+    if (isa<ConstantSDNode>(N.getOperand(1))) {
+      if (XLenVT == MVT::i64) {
+        uint64_t VC = N.getConstantOperandVal(1);
+        Shamt = CurDAG->getTargetConstant((64 - VC), SDLoc(N),
+                                          N.getOperand(1).getValueType());
+        RS1 = N.getOperand(0);
+        return true;
+      }
+      if (XLenVT == MVT::i32) {
+        uint32_t VC = N.getConstantOperandVal(1);
+        Shamt = CurDAG->getTargetConstant((32 - VC), SDLoc(N),
+                                          N.getOperand(1).getValueType());
+        RS1 = N.getOperand(0);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+
+// Check that it is a SLLIUW (Shift Logical Left Immediate Unsigned i32
+// on RV64).
+// SLLIUW is the same as SLLI except that it clears bits XLEN-1:32 of the
+// input RS1 before shifting.
+// We first check that it is the right node tree:
+//
+// (AND (SHL RS1, VC2), VC1)
+//
+// We check that VC2, the shamt, is less than 32; otherwise the pattern is
+// exactly the same as SLLI and we give priority to that.
+// Finally, we check that VC1, the mask used to clear the upper 32 bits
+// of RS1, is correct:
+//
+// VC1 == (0xFFFFFFFF << VC2)
+
+bool RISCVDAGToDAGISel::SelectSLLIUW(SDValue N, SDValue &RS1, SDValue &Shamt) {
+  if (N.getOpcode() == ISD::AND && Subtarget->getXLenVT() == MVT::i64) {
+    SDValue And = N;
+    if (And.getOperand(0).getOpcode() == ISD::SHL) {
+      SDValue Shl = And.getOperand(0);
+      if (isa<ConstantSDNode>(Shl.getOperand(1)) &&
+          isa<ConstantSDNode>(And.getOperand(1))) {
+        uint64_t VC1 = And.getConstantOperandVal(1);
+        uint64_t VC2 = Shl.getConstantOperandVal(1);
+        if (VC2 < 32 && VC1 == ((uint64_t)0xFFFFFFFF << VC2)) {
+          RS1 = Shl.getOperand(0);
+          Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
+                                            Shl.getOperand(1).getValueType());
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+// Check that it is a SLOIW (Shift Left Ones Immediate i32 on RV64).
+// We first check that it is the right node tree:
+//
+// (SIGN_EXTEND_INREG (OR (SHL RS1, VC2), VC1))
+//
+// and then we check that VC1, the mask used to fill with ones, is compatible
+// with VC2, the shamt:
+//
+// VC1 == maskTrailingOnes<uint32_t>(VC2)
+
+bool RISCVDAGToDAGISel::SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
+  if (Subtarget->getXLenVT() == MVT::i64 &&
+      N.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+      cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) {
+    if (N.getOperand(0).getOpcode() == ISD::OR) {
+      SDValue Or = N.getOperand(0);
+      if (Or.getOperand(0).getOpcode() == ISD::SHL) {
+        SDValue Shl = Or.getOperand(0);
+        if (isa<ConstantSDNode>(Shl.getOperand(1)) &&
+            isa<ConstantSDNode>(Or.getOperand(1))) {
+          uint32_t VC1 = Or.getConstantOperandVal(1);
+          uint32_t VC2 = Shl.getConstantOperandVal(1);
+          if (VC1 == maskTrailingOnes<uint32_t>(VC2)) {
+            RS1 = Shl.getOperand(0);
+            Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
+                                              Shl.getOperand(1).getValueType());
+            return true;
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+// Check that it is a SROIW (Shift Right Ones Immediate i32 on RV64).
+// We first check that it is the right node tree:
+//
+// (OR (SRL RS1, VC2), VC1)
+//
+// and then we check that VC1, the mask used to fill with ones, is compatible
+// with VC2, the shamt:
+//
+// VC1 == maskLeadingOnes<uint32_t>(VC2)
+
+bool RISCVDAGToDAGISel::SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
+  if (N.getOpcode() == ISD::OR && Subtarget->getXLenVT() == MVT::i64) {
+    SDValue Or = N;
+    if (Or.getOperand(0).getOpcode() == ISD::SRL) {
+      SDValue Srl = Or.getOperand(0);
+      if (isa<ConstantSDNode>(Srl.getOperand(1)) &&
+          isa<ConstantSDNode>(Or.getOperand(1))) {
+        uint32_t VC1 = Or.getConstantOperandVal(1);
+        uint32_t VC2 = Srl.getConstantOperandVal(1);
+        if (VC1 == maskLeadingOnes<uint32_t>(VC2)) {
+          RS1 = Srl.getOperand(0);
+          Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
+                                            Srl.getOperand(1).getValueType());
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+// Check that it is a RORIW (i32 Right Rotate Immediate on RV64).
+// We first check that it is the right node tree:
+//
+// (SIGN_EXTEND_INREG (OR (SHL (AssertSext RS1, i32), VC2),
+//                        (SRL (AND (AssertSext RS2, i32), VC3), VC1)))
+//
+// Then we check that the constant operands respect these constraints:
+//
+// VC2 == 32 - VC1
+// VC3 == maskLeadingOnes<uint32_t>(VC2)
+//
+// where VC1 is the Shamt we need, VC2 is the complement of Shamt with
+// respect to 32, and VC3 is a 32-bit mask of (32 - VC1) leading ones.
+
+bool RISCVDAGToDAGISel::SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
+  if (N.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+      Subtarget->getXLenVT() == MVT::i64 &&
+      cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) {
+    if (N.getOperand(0).getOpcode() == ISD::OR) {
+      SDValue Or = N.getOperand(0);
+      if (Or.getOperand(0).getOpcode() == ISD::SHL &&
+          Or.getOperand(1).getOpcode() == ISD::SRL) {
+        SDValue Shl = Or.getOperand(0);
+        SDValue Srl = Or.getOperand(1);
+        if (Srl.getOperand(0).getOpcode() == ISD::AND) {
+          SDValue And = Srl.getOperand(0);
+          if (isa<ConstantSDNode>(Srl.getOperand(1)) &&
+              isa<ConstantSDNode>(Shl.getOperand(1)) &&
+              isa<ConstantSDNode>(And.getOperand(1))) {
+            uint32_t VC1 = Srl.getConstantOperandVal(1);
+            uint32_t VC2 = Shl.getConstantOperandVal(1);
+            uint32_t VC3 = And.getConstantOperandVal(1);
+            if (VC2 == (32 - VC1) &&
+                VC3 == maskLeadingOnes<uint32_t>(VC2)) {
+              RS1 = Shl.getOperand(0);
+              Shamt = CurDAG->getTargetConstant(VC1, SDLoc(N),
+                                                Srl.getOperand(1).getValueType());
+              return true;
+            }
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+// Check that it is a FSRIW (i32 Funnel Shift Right Immediate on RV64).
+// We first check that it is the right node tree:
+//
+// (SIGN_EXTEND_INREG (OR (SHL (AssertSext RS1, i32), VC2),
+//                        (SRL (AND (AssertSext RS2, i32), VC3), VC1)))
+//
+// Then we check that the constant operands respect these constraints:
+//
+// VC2 == 32 - VC1
+// VC3 == maskLeadingOnes<uint32_t>(VC2)
+//
+// where VC1 is the Shamt we need, VC2 is the complement of Shamt with
+// respect to 32, and VC3 is a 32-bit mask of (32 - VC1) leading ones.
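// Illustrative example (editorial note, not part of the patch): for a funnel
// shift right by VC1 = 12, the tree above is expected to carry
// VC2 = 32 - 12 = 20 and VC3 = maskLeadingOnes<uint32_t>(20) = 0xFFFFF000,
// and the RORIW/FSRIW selectors then emit a target constant Shamt of 12.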
+ +bool RISCVDAGToDAGISel::SelectFSRIW(SDValue N, SDValue &RS1, SDValue &RS2, + SDValue &Shamt) { + if (N.getOpcode() == ISD::SIGN_EXTEND_INREG && + Subtarget->getXLenVT() == MVT::i64 && + cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) { + if (N.getOperand(0).getOpcode() == ISD::OR) { + SDValue Or = N.getOperand(0); + if (Or.getOperand(0).getOpcode() == ISD::SHL && + Or.getOperand(1).getOpcode() == ISD::SRL) { + SDValue Shl = Or.getOperand(0); + SDValue Srl = Or.getOperand(1); + if (Srl.getOperand(0).getOpcode() == ISD::AND) { + SDValue And = Srl.getOperand(0); + if (isa<ConstantSDNode>(Srl.getOperand(1)) && + isa<ConstantSDNode>(Shl.getOperand(1)) && + isa<ConstantSDNode>(And.getOperand(1))) { + uint32_t VC1 = Srl.getConstantOperandVal(1); + uint32_t VC2 = Shl.getConstantOperandVal(1); + uint32_t VC3 = And.getConstantOperandVal(1); + if (VC2 == (32 - VC1) && + VC3 == maskLeadingOnes<uint32_t>(VC2)) { + RS1 = Shl.getOperand(0); + RS2 = And.getOperand(0); + Shamt = CurDAG->getTargetConstant(VC1, SDLoc(N), + Srl.getOperand(1).getValueType()); + return true; + } + } + } + } + } + } + return false; +} + // Merge an ADDI into the offset of a load/store instruction where possible. // (load (addi base, off1), off2) -> (load base, off1+off2) // (store val, (addi base, off1), off2) -> (store val, base, off1+off2) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index dcf733ec3675..0ca12510a230 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -45,6 +45,15 @@ public: bool SelectAddrFI(SDValue Addr, SDValue &Base); + bool SelectSLOI(SDValue N, SDValue &RS1, SDValue &Shamt); + bool SelectSROI(SDValue N, SDValue &RS1, SDValue &Shamt); + bool SelectRORI(SDValue N, SDValue &RS1, SDValue &Shamt); + bool SelectSLLIUW(SDValue N, SDValue &RS1, SDValue &Shamt); + bool SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt); + bool SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt); + bool SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt); + bool SelectFSRIW(SDValue N, SDValue &RS1, SDValue &RS2, SDValue &Shamt); + // Include the pieces autogenerated from the target description. 
#include "RISCVGenDAGISel.inc" diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 91fc69b5bc10..03d9eefd59d0 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -149,12 +149,27 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRL_PARTS, XLenVT, Custom); setOperationAction(ISD::SRA_PARTS, XLenVT, Custom); - setOperationAction(ISD::ROTL, XLenVT, Expand); - setOperationAction(ISD::ROTR, XLenVT, Expand); - setOperationAction(ISD::BSWAP, XLenVT, Expand); - setOperationAction(ISD::CTTZ, XLenVT, Expand); - setOperationAction(ISD::CTLZ, XLenVT, Expand); - setOperationAction(ISD::CTPOP, XLenVT, Expand); + if (!(Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp())) { + setOperationAction(ISD::ROTL, XLenVT, Expand); + setOperationAction(ISD::ROTR, XLenVT, Expand); + } + + if (!Subtarget.hasStdExtZbp()) + setOperationAction(ISD::BSWAP, XLenVT, Expand); + + if (!Subtarget.hasStdExtZbb()) { + setOperationAction(ISD::CTTZ, XLenVT, Expand); + setOperationAction(ISD::CTLZ, XLenVT, Expand); + setOperationAction(ISD::CTPOP, XLenVT, Expand); + } + + if (Subtarget.hasStdExtZbp()) + setOperationAction(ISD::BITREVERSE, XLenVT, Legal); + + if (Subtarget.hasStdExtZbt()) { + setOperationAction(ISD::FSHL, XLenVT, Legal); + setOperationAction(ISD::FSHR, XLenVT, Legal); + } ISD::CondCode FPCCToExtend[] = { ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td index 34a463626e29..afac509f743d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td @@ -632,3 +632,432 @@ let Predicates = [HasStdExtZbproposedc, HasStdExtZbbOrZbp, HasStdExtC, IsRV64] i def : CompressPat<(PACK GPRC:$rs1, GPRC:$rs1, X0), (C_ZEXTW GPRC:$rs1)>; } // Predicates = [HasStdExtZbproposedc, HasStdExtC, IsRV64] + +//===----------------------------------------------------------------------===// +// Codegen patterns +//===----------------------------------------------------------------------===// +def SLOIPat : ComplexPattern<XLenVT, 2, "SelectSLOI", [or]>; +def SROIPat : ComplexPattern<XLenVT, 2, "SelectSROI", [or]>; +def RORIPat : ComplexPattern<XLenVT, 2, "SelectRORI", [rotl]>; +def SLLIUWPat : ComplexPattern<i64, 2, "SelectSLLIUW", [and]>; +def SLOIWPat : ComplexPattern<i64, 2, "SelectSLOIW", [sext_inreg]>; +def SROIWPat : ComplexPattern<i64, 2, "SelectSROIW", [or]>; +def RORIWPat : ComplexPattern<i64, 2, "SelectRORIW", [sext_inreg]>; +def FSRIWPat : ComplexPattern<i64, 3, "SelectFSRIW", [sext_inreg]>; + +let Predicates = [HasStdExtZbbOrZbp] in { +def : Pat<(and GPR:$rs1, (not GPR:$rs2)), (ANDN GPR:$rs1, GPR:$rs2)>; +def : Pat<(or GPR:$rs1, (not GPR:$rs2)), (ORN GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor GPR:$rs1, (not GPR:$rs2)), (XNOR GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbbOrZbp] + +let Predicates = [HasStdExtZbb] in { +def : Pat<(xor (shl (xor GPR:$rs1, -1), GPR:$rs2), -1), + (SLO GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor (srl (xor GPR:$rs1, -1), GPR:$rs2), -1), + (SRO GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbb] + +let Predicates = [HasStdExtZbbOrZbp] in { +def : Pat<(rotl GPR:$rs1, GPR:$rs2), (ROL GPR:$rs1, GPR:$rs2)>; +def : Pat<(fshl GPR:$rs1, GPR:$rs1, GPR:$rs2), (ROL GPR:$rs1, GPR:$rs2)>; +def : Pat<(rotr GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>; +def : Pat<(fshr GPR:$rs1, GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, 
GPR:$rs2)>; +} // Predicates = [HasStdExtZbbOrZbp] + +let Predicates = [HasStdExtZbs, IsRV32] in +def : Pat<(and (xor (shl 1, (and GPR:$rs2, 31)), -1), GPR:$rs1), + (SBCLR GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbs, IsRV64] in +def : Pat<(and (xor (shl 1, (and GPR:$rs2, 63)), -1), GPR:$rs1), + (SBCLR GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs] in +def : Pat<(and (rotl -2, GPR:$rs2), GPR:$rs1), (SBCLR GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs, IsRV32] in +def : Pat<(or (shl 1, (and GPR:$rs2, 31)), GPR:$rs1), + (SBSET GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbs, IsRV64] in +def : Pat<(or (shl 1, (and GPR:$rs2, 63)), GPR:$rs1), + (SBSET GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs, IsRV32] in +def : Pat<(xor (shl 1, (and GPR:$rs2, 31)), GPR:$rs1), + (SBINV GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbs, IsRV64] in +def : Pat<(xor (shl 1, (and GPR:$rs2, 63)), GPR:$rs1), + (SBINV GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs, IsRV32] in +def : Pat<(and (srl GPR:$rs1, (and GPR:$rs2, 31)), 1), + (SBEXT GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs, IsRV64] in +def : Pat<(and (srl GPR:$rs1, (and GPR:$rs2, 63)), 1), + (SBEXT GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbb] in { +def : Pat<(SLOIPat GPR:$rs1, uimmlog2xlen:$shamt), + (SLOI GPR:$rs1, uimmlog2xlen:$shamt)>; +def : Pat<(SROIPat GPR:$rs1, uimmlog2xlen:$shamt), + (SROI GPR:$rs1, uimmlog2xlen:$shamt)>; +} // Predicates = [HasStdExtZbb] + +// There's no encoding for roli in the current version of the 'B' extension +// (v0.92) as it can be implemented with rori by negating the immediate. +// For this reason we pattern-match only against rori[w]. +let Predicates = [HasStdExtZbbOrZbp] in +def : Pat<(RORIPat GPR:$rs1, uimmlog2xlen:$shamt), + (RORI GPR:$rs1, uimmlog2xlen:$shamt)>; + +// We don't pattern-match sbclri[w], sbseti[w], sbinvi[w] because they are +// pattern-matched by simple andi, ori, and xori. 
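// Illustrative example (editorial note, not part of the patch): setting bit 5
// of a register is (or GPR:$rs1, 32), which the existing ORI patterns already
// match, so a dedicated sbseti pattern would be redundant; clearing or
// flipping a low bit maps onto ANDI/XORI immediates in the same way.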
+let Predicates = [HasStdExtZbs] in +def : Pat<(and (srl GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1)), + (SBEXTI GPR:$rs1, uimmlog2xlen:$shamt)>; + +let Predicates = [HasStdExtZbp, IsRV32] in { +def : Pat<(or (or (and (srl GPR:$rs1, (i32 1)), (i32 0x55555555)), GPR:$rs1), + (and (shl GPR:$rs1, (i32 1)), (i32 0xAAAAAAAA))), + (GORCI GPR:$rs1, (i32 1))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i32 2)), (i32 0x33333333)), GPR:$rs1), + (and (shl GPR:$rs1, (i32 2)), (i32 0xCCCCCCCC))), + (GORCI GPR:$rs1, (i32 2))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i32 4)), (i32 0x0F0F0F0F)), GPR:$rs1), + (and (shl GPR:$rs1, (i32 4)), (i32 0xF0F0F0F0))), + (GORCI GPR:$rs1, (i32 4))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i32 8)), (i32 0x00FF00FF)), GPR:$rs1), + (and (shl GPR:$rs1, (i32 8)), (i32 0xFF00FF00))), + (GORCI GPR:$rs1, (i32 8))>; +def : Pat<(or (or (srl GPR:$rs1, (i32 16)), GPR:$rs1), + (shl GPR:$rs1, (i32 16))), + (GORCI GPR:$rs1, (i32 16))>; +} // Predicates = [HasStdExtZbp, IsRV32] + +let Predicates = [HasStdExtZbp, IsRV64] in { +def : Pat<(or (or (and (srl GPR:$rs1, (i64 1)), (i64 0x5555555555555555)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAAAAAAAAAA))), + (GORCI GPR:$rs1, (i64 1))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i64 2)), (i64 0x3333333333333333)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCCCCCCCCCC))), + (GORCI GPR:$rs1, (i64 2))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F0F0F0F0F)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0F0F0F0F0))), + (GORCI GPR:$rs1, (i64 4))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF00FF00FF)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00FF00FF00))), + (GORCI GPR:$rs1, (i64 8))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF0000FFFF)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000FFFF0000))), + (GORCI GPR:$rs1, (i64 16))>; +def : Pat<(or (or (srl GPR:$rs1, (i64 32)), GPR:$rs1), + (shl GPR:$rs1, (i64 32))), + (GORCI GPR:$rs1, (i64 32))>; +} // Predicates = [HasStdExtZbp, IsRV64] + +let Predicates = [HasStdExtZbp, IsRV32] in { +def : Pat<(or (and (shl GPR:$rs1, (i32 1)), (i32 0xAAAAAAAA)), + (and (srl GPR:$rs1, (i32 1)), (i32 0x55555555))), + (GREVI GPR:$rs1, (i32 1))>; +def : Pat<(or (and (shl GPR:$rs1, (i32 2)), (i32 0xCCCCCCCC)), + (and (srl GPR:$rs1, (i32 2)), (i32 0x33333333))), + (GREVI GPR:$rs1, (i32 2))>; +def : Pat<(or (and (shl GPR:$rs1, (i32 4)), (i32 0xF0F0F0F0)), + (and (srl GPR:$rs1, (i32 4)), (i32 0x0F0F0F0F))), + (GREVI GPR:$rs1, (i32 4))>; +def : Pat<(or (and (shl GPR:$rs1, (i32 8)), (i32 0xFF00FF00)), + (and (srl GPR:$rs1, (i32 8)), (i32 0x00FF00FF))), + (GREVI GPR:$rs1, (i32 8))>; +def : Pat<(rotr (bswap GPR:$rs1), (i32 16)), (GREVI GPR:$rs1, (i32 8))>; +def : Pat<(or (shl GPR:$rs1, (i32 16)), (srl GPR:$rs1, (i32 16))), + (GREVI GPR:$rs1, (i32 16))>; +def : Pat<(rotl GPR:$rs1, (i32 16)), (GREVI GPR:$rs1, (i32 16))>; +def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i32 24))>; +def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i32 31))>; +} // Predicates = [HasStdExtZbp, IsRV32] + +let Predicates = [HasStdExtZbp, IsRV64] in { +def : Pat<(or (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAAAAAAAAAA)), + (and (srl GPR:$rs1, (i64 1)), (i64 0x5555555555555555))), + (GREVI GPR:$rs1, (i64 1))>; +def : Pat<(or (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCCCCCCCCCC)), + (and (srl GPR:$rs1, (i64 2)), (i64 0x3333333333333333))), + (GREVI GPR:$rs1, (i64 2))>; +def : Pat<(or (and (shl GPR:$rs1, (i64 4)), 
(i64 0xF0F0F0F0F0F0F0F0)), + (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F0F0F0F0F))), + (GREVI GPR:$rs1, (i64 4))>; +def : Pat<(or (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00FF00FF00)), + (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF00FF00FF))), + (GREVI GPR:$rs1, (i64 8))>; +def : Pat<(or (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000FFFF0000)), + (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF0000FFFF))), + (GREVI GPR:$rs1, (i64 16))>; +def : Pat<(or (shl GPR:$rs1, (i64 32)), (srl GPR:$rs1, (i64 32))), + (GREVI GPR:$rs1, (i64 32))>; +def : Pat<(rotl GPR:$rs1, (i64 32)), (GREVI GPR:$rs1, (i64 32))>; +def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i64 56))>; +def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i64 63))>; +} // Predicates = [HasStdExtZbp, IsRV64] + +let Predicates = [HasStdExtZbt] in { +def : Pat<(or (and (xor GPR:$rs2, -1), GPR:$rs3), (and GPR:$rs2, GPR:$rs1)), + (CMIX GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(riscv_selectcc GPR:$rs2, (XLenVT 0), (XLenVT 17), GPR:$rs3, GPR:$rs1), + (CMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(fshl GPR:$rs1, GPR:$rs2, GPR:$rs3), + (FSL GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(fshr GPR:$rs1, GPR:$rs2, GPR:$rs3), + (FSR GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(fshr GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt), + (FSRI GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt)>; +} // Predicates = [HasStdExtZbt] + +let Predicates = [HasStdExtZbb] in { +def : Pat<(ctlz GPR:$rs1), (CLZ GPR:$rs1)>; +def : Pat<(cttz GPR:$rs1), (CTZ GPR:$rs1)>; +def : Pat<(ctpop GPR:$rs1), (PCNT GPR:$rs1)>; +} // Predicates = [HasStdExtZbb] + +let Predicates = [HasStdExtZbb, IsRV32] in +def : Pat<(sra (shl GPR:$rs1, (i32 24)), (i32 24)), (SEXTB GPR:$rs1)>; +let Predicates = [HasStdExtZbb, IsRV64] in +def : Pat<(sra (shl GPR:$rs1, (i64 56)), (i64 56)), (SEXTB GPR:$rs1)>; + +let Predicates = [HasStdExtZbb, IsRV32] in +def : Pat<(sra (shl GPR:$rs1, (i32 16)), (i32 16)), (SEXTH GPR:$rs1)>; +let Predicates = [HasStdExtZbb, IsRV64] in +def : Pat<(sra (shl GPR:$rs1, (i64 48)), (i64 48)), (SEXTH GPR:$rs1)>; + +let Predicates = [HasStdExtZbb] in { +def : Pat<(smin GPR:$rs1, GPR:$rs2), (MIN GPR:$rs1, GPR:$rs2)>; +def : Pat<(riscv_selectcc GPR:$rs1, GPR:$rs2, (XLenVT 20), GPR:$rs1, GPR:$rs2), + (MIN GPR:$rs1, GPR:$rs2)>; +def : Pat<(smax GPR:$rs1, GPR:$rs2), (MAX GPR:$rs1, GPR:$rs2)>; +def : Pat<(riscv_selectcc GPR:$rs2, GPR:$rs1, (XLenVT 20), GPR:$rs1, GPR:$rs2), + (MAX GPR:$rs1, GPR:$rs2)>; +def : Pat<(umin GPR:$rs1, GPR:$rs2), (MINU GPR:$rs1, GPR:$rs2)>; +def : Pat<(riscv_selectcc GPR:$rs1, GPR:$rs2, (XLenVT 12), GPR:$rs1, GPR:$rs2), + (MINU GPR:$rs1, GPR:$rs2)>; +def : Pat<(umax GPR:$rs1, GPR:$rs2), (MAXU GPR:$rs1, GPR:$rs2)>; +def : Pat<(riscv_selectcc GPR:$rs2, GPR:$rs1, (XLenVT 12), GPR:$rs1, GPR:$rs2), + (MAXU GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbb] + +let Predicates = [HasStdExtZbbOrZbp, IsRV32] in +def : Pat<(or (and GPR:$rs1, 0x0000FFFF), (shl GPR:$rs2, (i32 16))), + (PACK GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in +def : Pat<(or (and GPR:$rs1, 0x00000000FFFFFFFF), (shl GPR:$rs2, (i64 32))), + (PACK GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbbOrZbp, IsRV32] in +def : Pat<(or (and GPR:$rs2, 0xFFFF0000), (srl GPR:$rs1, (i32 16))), + (PACKU GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in +def : Pat<(or (and GPR:$rs2, 0xFFFFFFFF00000000), (srl GPR:$rs1, (i64 32))), + (PACKU GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbbOrZbp] in +def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFF00), + (and 
GPR:$rs1, 0x00FF)), + (PACKH GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbp, IsRV32] in { +def : Pat<(or (or (and (shl GPR:$rs1, (i32 8)), (i32 0x00FF0000)), + (and GPR:$rs1, (i32 0xFF0000FF))), + (and (srl GPR:$rs1, (i32 8)), (i32 0x0000FF00))), + (SHFLI GPR:$rs1, (i32 8))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i32 4)), (i32 0x0F000F00)), + (and GPR:$rs1, (i32 0xF00FF00F))), + (and (srl GPR:$rs1, (i32 4)), (i32 0x00F000F0))), + (SHFLI GPR:$rs1, (i32 4))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i32 2)), (i32 0x30303030)), + (and GPR:$rs1, (i32 0xC3C3C3C3))), + (and (srl GPR:$rs1, (i32 2)), (i32 0x0C0C0C0C))), + (SHFLI GPR:$rs1, (i32 2))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i32 1)), (i32 0x44444444)), + (and GPR:$rs1, (i32 0x99999999))), + (and (srl GPR:$rs1, (i32 1)), (i32 0x22222222))), + (SHFLI GPR:$rs1, (i32 1))>; +} // Predicates = [HasStdExtZbp, IsRV32] + +let Predicates = [HasStdExtZbp, IsRV64] in { +def : Pat<(or (or (and (shl GPR:$rs1, (i64 16)), (i64 0x0000FFFF00000000)), + (and GPR:$rs1, (i64 0xFFFF00000000FFFF))), + (and (srl GPR:$rs1, (i64 16)), (i64 0x00000000FFFF0000))), + (SHFLI GPR:$rs1, (i64 16))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i64 8)), (i64 0x00FF000000FF0000)), + (and GPR:$rs1, (i64 0xFF0000FFFF0000FF))), + (and (srl GPR:$rs1, (i64 8)), (i64 0x0000FF000000FF00))), + (SHFLI GPR:$rs1, (i64 8))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i64 4)), (i64 0x0F000F000F000F00)), + (and GPR:$rs1, (i64 0xF00FF00FF00FF00F))), + (and (srl GPR:$rs1, (i64 4)), (i64 0x00F000F000F000F0))), + (SHFLI GPR:$rs1, (i64 4))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i64 2)), (i64 0x3030303030303030)), + (and GPR:$rs1, (i64 0xC3C3C3C3C3C3C3C3))), + (and (srl GPR:$rs1, (i64 2)), (i64 0x0C0C0C0C0C0C0C0C))), + (SHFLI GPR:$rs1, (i64 2))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i64 1)), (i64 0x4444444444444444)), + (and GPR:$rs1, (i64 0x9999999999999999))), + (and (srl GPR:$rs1, (i64 1)), (i64 0x2222222222222222))), + (SHFLI GPR:$rs1, (i64 1))>; +} // Predicates = [HasStdExtZbp, IsRV64] + +let Predicates = [HasStdExtZbb, IsRV64] in { +def : Pat<(and (add GPR:$rs, simm12:$simm12), (i64 0xFFFFFFFF)), + (ADDIWU GPR:$rs, simm12:$simm12)>; +def : Pat<(SLLIUWPat GPR:$rs1, uimmlog2xlen:$shamt), + (SLLIUW GPR:$rs1, uimmlog2xlen:$shamt)>; +def : Pat<(and (add GPR:$rs1, GPR:$rs2), (i64 0xFFFFFFFF)), + (ADDWU GPR:$rs1, GPR:$rs2)>; +def : Pat<(and (sub GPR:$rs1, GPR:$rs2), (i64 0xFFFFFFFF)), + (SUBWU GPR:$rs1, GPR:$rs2)>; +def : Pat<(add GPR:$rs1, (and GPR:$rs2, (i64 0xFFFFFFFF))), + (ADDUW GPR:$rs1, GPR:$rs2)>; +def : Pat<(sub GPR:$rs1, (and GPR:$rs2, (i64 0xFFFFFFFF))), + (SUBUW GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor (riscv_sllw (xor GPR:$rs1, -1), GPR:$rs2), -1), + (SLOW GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor (riscv_srlw (xor GPR:$rs1, -1), GPR:$rs2), -1), + (SROW GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbb, IsRV64] + +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in { +def : Pat<(or (riscv_sllw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2)), + (riscv_srlw (assertsexti32 GPR:$rs1), + (sub (i64 0), (assertsexti32 GPR:$rs2)))), + (ROLW GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (riscv_sllw (assertsexti32 GPR:$rs1), + (sub (i64 0), (assertsexti32 GPR:$rs2))), + (riscv_srlw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2))), + (RORW GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbbOrZbp, IsRV64] + +let Predicates = [HasStdExtZbs, IsRV64] in { +def : Pat<(and (xor (riscv_sllw 1, (assertsexti32 GPR:$rs2)), -1), + (assertsexti32 GPR:$rs1)), + (SBCLRW GPR:$rs1, GPR:$rs2)>; +def 
: Pat<(or (riscv_sllw 1, (assertsexti32 GPR:$rs2)), + (assertsexti32 GPR:$rs1)), + (SBSETW GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor (riscv_sllw 1, (assertsexti32 GPR:$rs2)), + (assertsexti32 GPR:$rs1)), + (SBINVW GPR:$rs1, GPR:$rs2)>; +def : Pat<(and (riscv_srlw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2)), + 1), + (SBEXTW GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbs, IsRV64] + +let Predicates = [HasStdExtZbb, IsRV64] in { +def : Pat<(SLOIWPat GPR:$rs1, uimmlog2xlen:$shamt), + (SLOIW GPR:$rs1, uimmlog2xlen:$shamt)>; +def : Pat<(SROIWPat GPR:$rs1, uimmlog2xlen:$shamt), + (SROIW GPR:$rs1, uimmlog2xlen:$shamt)>; +} // Predicates = [HasStdExtZbb, IsRV64] + +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in +def : Pat<(RORIWPat GPR:$rs1, uimmlog2xlen:$shamt), + (RORIW GPR:$rs1, uimmlog2xlen:$shamt)>; + +let Predicates = [HasStdExtZbp, IsRV64] in { +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 1)), (i64 0x55555555)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAA))), + i32), + (GORCIW GPR:$rs1, (i64 1))>; +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 2)), (i64 0x33333333)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCC))), + i32), + (GORCIW GPR:$rs1, (i64 2))>; +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0))), + i32), + (GORCIW GPR:$rs1, (i64 4))>; +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00))), + i32), + (GORCIW GPR:$rs1, (i64 8))>; +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000))), + i32), + (GORCIW GPR:$rs1, (i64 16))>; +def : Pat<(sext_inreg (or (or (srl (and GPR:$rs1, (i64 0xFFFF0000)), (i64 16)), + GPR:$rs1), + (shl GPR:$rs1, (i64 16))), i32), + (GORCIW GPR:$rs1, (i64 16))>; + +def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAA)), + (and (srl GPR:$rs1, (i64 1)), (i64 0x55555555))), + i32), + (GREVIW GPR:$rs1, (i64 1))>; +def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCC)), + (and (srl GPR:$rs1, (i64 2)), (i64 0x33333333))), + i32), + (GREVIW GPR:$rs1, (i64 2))>; +def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0)), + (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F))), + i32), + (GREVIW GPR:$rs1, (i64 4))>; +def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00)), + (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF))), + i32), + (GREVIW GPR:$rs1, (i64 8))>; +def : Pat<(sext_inreg (or (shl GPR:$rs1, (i64 16)), + (srl (and GPR:$rs1, 0xFFFF0000), (i64 16))), i32), + (GREVIW GPR:$rs1, (i64 16))>; +def : Pat<(sra (bswap GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 24))>; +def : Pat<(sra (bitreverse GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 31))>; +} // Predicates = [HasStdExtZbp, IsRV64] + +let Predicates = [HasStdExtZbt, IsRV64] in { +def : Pat<(riscv_selectcc (and (assertsexti32 GPR:$rs3), 31), + (i64 0), + (i64 17), + (assertsexti32 GPR:$rs1), + (or (riscv_sllw (assertsexti32 GPR:$rs1), + (and (assertsexti32 GPR:$rs3), 31)), + (riscv_srlw (assertsexti32 GPR:$rs2), + (sub (i64 32), + (assertsexti32 GPR:$rs3))))), + (FSLW GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(riscv_selectcc (and (assertsexti32 GPR:$rs3), 31), + (i64 0), + (i64 17), + (assertsexti32 GPR:$rs2), + (or (riscv_sllw (assertsexti32 GPR:$rs1), + (sub (i64 32), + (assertsexti32 GPR:$rs3))), + (riscv_srlw 
(assertsexti32 GPR:$rs2), + (and (assertsexti32 GPR:$rs3), 31)))), + (FSRW GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(FSRIWPat GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt), + (FSRIW GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt)>; +} // Predicates = [HasStdExtZbt, IsRV64] + +let Predicates = [HasStdExtZbb, IsRV64] in { +def : Pat<(add (ctlz (and GPR:$rs1, (i64 0xFFFFFFFF))), (i64 -32)), + (CLZW GPR:$rs1)>; +// We don't pattern-match CTZW here as it has the same pattern and result as +// RV64 CTZ +def : Pat<(ctpop (and GPR:$rs1, (i64 0xFFFFFFFF))), (PCNTW GPR:$rs1)>; +} // Predicates = [HasStdExtZbb, IsRV64] + +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in { +def : Pat<(sext_inreg (or (shl (assertsexti32 GPR:$rs2), (i64 16)), + (and (assertsexti32 GPR:$rs1), 0x000000000000FFFF)), + i32), + (PACKW GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000), + (srl (and (assertsexti32 GPR:$rs1), 0x00000000FFFF0000), + (i64 16))), + (PACKUW GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbbOrZbp, IsRV64] diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f8b6b7eb3aff..86aa85e965f6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30953,6 +30953,34 @@ bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { // X86 Scheduler Hooks //===----------------------------------------------------------------------===// +// Returns true if EFLAG is consumed after this iterator in the rest of the +// basic block or any successors of the basic block. +static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, + MachineBasicBlock *BB) { + // Scan forward through BB for a use/def of EFLAGS. + for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end(); + miI != miE; ++miI) { + const MachineInstr& mi = *miI; + if (mi.readsRegister(X86::EFLAGS)) + return true; + // If we found a def, we can stop searching. + if (mi.definesRegister(X86::EFLAGS)) + return false; + } + + // If we hit the end of the block, check whether EFLAGS is live into a + // successor. + for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), + sEnd = BB->succ_end(); + sItr != sEnd; ++sItr) { + MachineBasicBlock* succ = *sItr; + if (succ->isLiveIn(X86::EFLAGS)) + return true; + } + + return false; +} + /// Utility function to emit xbegin specifying the start of an RTM region. static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII) { @@ -30985,6 +31013,12 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, MF->insert(I, fallMBB); MF->insert(I, sinkMBB); + if (isEFLAGSLiveAfter(MI, MBB)) { + mainMBB->addLiveIn(X86::EFLAGS); + fallMBB->addLiveIn(X86::EFLAGS); + sinkMBB->addLiveIn(X86::EFLAGS); + } + // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); @@ -31373,27 +31407,8 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock* BB, const TargetRegisterInfo* TRI) { - // Scan forward through BB for a use/def of EFLAGS. 
- MachineBasicBlock::iterator miI(std::next(SelectItr)); - for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { - const MachineInstr& mi = *miI; - if (mi.readsRegister(X86::EFLAGS)) - return false; - if (mi.definesRegister(X86::EFLAGS)) - break; // Should have kill-flag - update below. - } - - // If we hit the end of the block, check whether EFLAGS is live into a - // successor. - if (miI == BB->end()) { - for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), - sEnd = BB->succ_end(); - sItr != sEnd; ++sItr) { - MachineBasicBlock* succ = *sItr; - if (succ->isLiveIn(X86::EFLAGS)) - return false; - } - } + if (isEFLAGSLiveAfter(SelectItr, BB)) + return false; // We found a def, or hit the end of the basic block and EFLAGS wasn't live // out. SelectMI should have a kill flag on EFLAGS. @@ -44349,8 +44364,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - bool IsCommutative) { + const X86Subtarget &Subtarget, bool IsCommutative, + SmallVectorImpl<int> &PostShuffleMask) { // If either operand is undef, bail out. The binop should be simplified. if (LHS.isUndef() || RHS.isUndef()) return false; @@ -44443,6 +44458,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, RMask.push_back(i); } + // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split). + if (!Subtarget.hasAVX2() && VT.isFloatingPoint() && + (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) || + isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask))) + return false; + // If A and B occur in reverse order in RHS, then canonicalize by commuting // RHS operands and shuffle mask. if (A != C) { @@ -44453,6 +44474,9 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, if (!(A == C && B == D)) return false; + PostShuffleMask.clear(); + PostShuffleMask.append(NumElts, SM_SentinelUndef); + // LHS and RHS are now: // LHS = shuffle A, B, LMask // RHS = shuffle A, B, RMask @@ -44461,6 +44485,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, // so we just repeat the inner loop if this is a 256-bit op. unsigned Num128BitChunks = VT.getSizeInBits() / 128; unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks; + unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; assert((NumEltsPer128BitChunk % 2 == 0) && "Vector type should have an even number of elements in each lane"); for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) { @@ -44472,25 +44497,40 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; + // Check that successive odd/even elements are being operated on. If not, + // this is not a horizontal operation. + if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) && + !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative)) + return false; + + // Compute the post-shuffle mask index based on where the element + // is stored in the HOP result, and where it needs to be moved to. + int Base = LIdx & ~1u; + int Index = ((Base % NumEltsPer128BitChunk) / 2) + + ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1)); + // The low half of the 128-bit result must choose from A. 
// The high half of the 128-bit result must choose from B, // unless B is undef. In that case, we are always choosing from A. - unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; - unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0; - - // Check that successive elements are being operated on. If not, this is - // not a horizontal operation. - int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j; - if (!(LIdx == Index && RIdx == Index + 1) && - !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) - return false; + if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk)) + Index += NumEltsPer64BitChunk; + PostShuffleMask[i + j] = Index; } } LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. - if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget)) + bool IsIdentityPostShuffle = + isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0); + if (IsIdentityPostShuffle) + PostShuffleMask.clear(); + + // Assume a SingleSource HOP if we only shuffle one input and don't need to + // shuffle the result. + if (!shouldUseHorizontalOp(LHS == RHS && + (NumShuffles < 2 || !IsIdentityPostShuffle), + DAG, Subtarget)) return false; LHS = DAG.getBitcast(VT, LHS); @@ -44509,10 +44549,16 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode"); // Try to synthesize horizontal add/sub from adds/subs of shuffles. + SmallVector<int, 8> PostShuffleMask; if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd)) - return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); + isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) { + SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); + if (!PostShuffleMask.empty()) + HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, + DAG.getUNDEF(VT), PostShuffleMask); + return HorizBinOp; + } // NOTE: isHorizontalBinOp may have changed LHS/RHS variables. @@ -47605,17 +47651,22 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG, bool IsAdd = N->getOpcode() == ISD::ADD; assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode"); + SmallVector<int, 8> PostShuffleMask; if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasSSSE3() && - isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) { + isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) { auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) { - return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, - DL, Ops[0].getValueType(), Ops); + return DAG.getNode(IsAdd ? 
X86ISD::HADD : X86ISD::HSUB, DL, + Ops[0].getValueType(), Ops); }; - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, - HOpBuilder); + SDValue HorizBinOp = + SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder); + if (!PostShuffleMask.empty()) + HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, + DAG.getUNDEF(VT), PostShuffleMask); + return HorizBinOp; } return SDValue(); diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp index c40901255424..cd39428b9c38 100644 --- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -191,9 +191,11 @@ static void appendFile(std::vector<NewArchiveMember> &Members, file_magic Magic = identify_magic(MB.getBuffer()); if (Magic != file_magic::coff_object && Magic != file_magic::bitcode && - Magic != file_magic::archive && Magic != file_magic::windows_resource) { + Magic != file_magic::archive && Magic != file_magic::windows_resource && + Magic != file_magic::coff_import_library) { llvm::errs() << MB.getBufferIdentifier() - << ": not a COFF object, bitcode, archive or resource file\n"; + << ": not a COFF object, bitcode, archive, import library or " + "resource file\n"; exit(1); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 7cfe4c8b5892..c7f2f4ec3ca1 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1030,12 +1030,6 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, APInt DemandedElts, int DMaskIdx) { - // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported. - if (DMaskIdx < 0 && - II->getType()->getScalarSizeInBits() != 32 && - DemandedElts.getActiveBits() == 3) - return nullptr; - auto *IIVTy = cast<VectorType>(II->getType()); unsigned VWidth = IIVTy->getNumElements(); if (VWidth == 1) @@ -1124,6 +1118,11 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, if (!NewNumElts) return UndefValue::get(II->getType()); + // FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are + // fully supported. 
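// Editorial note (not part of the patch): returning nullptr here means "no
// simplification", so a call whose demanded elements would shrink the result
// to e.g. <3 x half> is simply left unchanged until such types are fully
// supported, as the FIXME above says.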
+ if (II->getType()->getScalarSizeInBits() == 16 && NewNumElts == 3) + return nullptr; + if (NewNumElts >= VWidth && DemandedElts.isMask()) { if (DMaskIdx >= 0) II->setArgOperand(DMaskIdx, Args[DMaskIdx]); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index b3254c10a0b2..17a5ec3f87fa 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1543,7 +1543,7 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) { Constant *C; if (match(&Inst, m_c_BinOp(m_OneUse(m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))), - m_Constant(C))) && + m_Constant(C))) && !isa<ConstantExpr>(C) && cast<FixedVectorType>(V1->getType())->getNumElements() <= NumElts) { assert(Inst.getType()->getScalarType() == V1->getType()->getScalarType() && "Shuffle should not change scalar type"); diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 9d0500419a7f..2f379b7f6160 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1859,6 +1859,14 @@ bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) { return true; } + // If any of predecessors end with an indirect goto, we can't change its + // destination. Same for CallBr. + if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) { + return isa<IndirectBrInst>(Pred->getTerminator()) || + isa<CallBrInst>(Pred->getTerminator()); + })) + return false; + // Try to duplicate BB into PredBB. return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto); } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5bc35aa4695f..f950d0d4eb2b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7397,8 +7397,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Look for the next elements with the same type. SmallVector<Value *, 4>::iterator SameTypeIt = IncIt; Type *EltTy = (*IncIt)->getType(); - unsigned EltSize = EltTy->isSized() ? DL->getTypeSizeInBits(EltTy) - : MaxVecRegSize; + + assert(EltTy->isSized() && + "Instructions should all be sized at this point"); + TypeSize EltTS = DL->getTypeSizeInBits(EltTy); + if (EltTS.isScalable()) { + // For now, just ignore vectorizing scalable types. + ++IncIt; + continue; + } + + unsigned EltSize = EltTS.getFixedSize(); unsigned MaxNumElts = MaxVecRegSize / EltSize; if (MaxNumElts < 2) { ++IncIt; diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h index ab57907e088e..b4b0dea0d1af 100644 --- a/openmp/runtime/src/kmp_ftn_entry.h +++ b/openmp/runtime/src/kmp_ftn_entry.h @@ -939,7 +939,7 @@ void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_DEFAULT_DEVICE)(int KMP_DEREF arg) { // Get number of NON-HOST devices. // libomptarget, if loaded, provides this function in api.cpp. -int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) KMP_WEAK_ATTRIBUTE; +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) { #if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB) return 0; @@ -957,13 +957,13 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) { // This function always returns true when called on host device. 
// Compiler/libomptarget should handle when it is called inside target region. -int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) KMP_WEAK_ATTRIBUTE; +int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) { return 1; // This is the host } // libomptarget, if loaded, provides this function -int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) KMP_WEAK_ATTRIBUTE; +int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) { #if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB) return KMP_HOST_DEVICE; @@ -1318,7 +1318,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_TASK_PRIORITY)(void) { // This function will be defined in libomptarget. When libomptarget is not // loaded, we assume we are on the host and return KMP_HOST_DEVICE. // Compiler/libomptarget will handle this if called inside target. -int FTN_STDCALL FTN_GET_DEVICE_NUM(void) KMP_WEAK_ATTRIBUTE; +int FTN_STDCALL FTN_GET_DEVICE_NUM(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; int FTN_STDCALL FTN_GET_DEVICE_NUM(void) { return KMP_HOST_DEVICE; } // Compiler will ensure that this is only called from host in sequential region diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index bfe7765b2a96..d1511904e94b 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -338,10 +338,16 @@ extern "C" { #define KMP_ALIAS(alias_of) __attribute__((alias(alias_of))) #endif +#if KMP_HAVE_WEAK_ATTRIBUTE && !KMP_DYNAMIC_LIB +#define KMP_WEAK_ATTRIBUTE_EXTERNAL __attribute__((weak)) +#else +#define KMP_WEAK_ATTRIBUTE_EXTERNAL /* Nothing */ +#endif + #if KMP_HAVE_WEAK_ATTRIBUTE -#define KMP_WEAK_ATTRIBUTE __attribute__((weak)) +#define KMP_WEAK_ATTRIBUTE_INTERNAL __attribute__((weak)) #else -#define KMP_WEAK_ATTRIBUTE /* Nothing */ +#define KMP_WEAK_ATTRIBUTE_INTERNAL /* Nothing */ #endif // Define KMP_VERSION_SYMBOL and KMP_EXPAND_NAME diff --git a/openmp/runtime/src/ompt-specific.cpp b/openmp/runtime/src/ompt-specific.cpp index a7288f08a661..9be699110fc6 100644 --- a/openmp/runtime/src/ompt-specific.cpp +++ b/openmp/runtime/src/ompt-specific.cpp @@ -27,7 +27,7 @@ #define THREAD_LOCAL __thread #endif -#define OMPT_WEAK_ATTRIBUTE KMP_WEAK_ATTRIBUTE +#define OMPT_WEAK_ATTRIBUTE KMP_WEAK_ATTRIBUTE_INTERNAL //****************************************************************************** // macros |
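The kmp_os.h hunk above splits KMP_WEAK_ATTRIBUTE into KMP_WEAK_ATTRIBUTE_EXTERNAL (weak only when the attribute is available and the runtime is not built as a dynamic library) and KMP_WEAK_ATTRIBUTE_INTERNAL (weak whenever the attribute is available), with kmp_ftn_entry.h moving the entry points that libomptarget may override to the _EXTERNAL form. As a minimal editorial sketch of the mechanism these macros rely on (not taken from the patch; the function name below is hypothetical and not an OpenMP runtime entry point), a weak definition only survives linking until a strong definition of the same symbol appears:

/* weak_sketch.c -- illustrative only */
#include <stdio.h>

/* Weak default, analogous to the host-only fallbacks in kmp_ftn_entry.h:
   kept by the linker only if no strong definition is present. */
__attribute__((weak)) int get_num_devices_fallback(void) {
  return 0; /* "no offload devices" host answer */
}

int main(void) {
  /* Linking another object that provides a strong get_num_devices_fallback()
     (the role libomptarget plays for omp_get_num_devices) replaces the weak
     body above without any change to this call site. */
  printf("devices: %d\n", get_num_devices_fallback());
  return 0;
}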