Diffstat (limited to 'lib/Target/NVPTX'): 42 files changed, 3175 insertions, 2312 deletions
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt index 05fe06dbc07c..b67c40500861 100644 --- a/lib/Target/NVPTX/CMakeLists.txt +++ b/lib/Target/NVPTX/CMakeLists.txt @@ -18,6 +18,7 @@ set(NVPTXCodeGen_sources NVPTXISelDAGToDAG.cpp NVPTXISelLowering.cpp NVPTXImageOptimizer.cpp + NVPTXInferAddressSpaces.cpp NVPTXInstrInfo.cpp NVPTXLowerAggrCopies.cpp NVPTXLowerKernelArgs.cpp @@ -31,6 +32,7 @@ set(NVPTXCodeGen_sources NVPTXTargetMachine.cpp NVPTXTargetTransformInfo.cpp NVPTXUtilities.cpp + NVVMIntrRange.cpp NVVMReflect.cpp ) diff --git a/lib/Target/NVPTX/InstPrinter/Makefile b/lib/Target/NVPTX/InstPrinter/Makefile deleted file mode 100644 index 7b7865436bf3..000000000000 --- a/lib/Target/NVPTX/InstPrinter/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/NVPTX/AsmPrinter/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMNVPTXAsmPrinter - -# Hack: we need to include 'main' ptx target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/NVPTX/MCTargetDesc/Makefile b/lib/Target/NVPTX/MCTargetDesc/Makefile deleted file mode 100644 index 31d06cb5948d..000000000000 --- a/lib/Target/NVPTX/MCTargetDesc/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/NVPTX/TargetDesc/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMNVPTXDesc - -# Hack: we need to include 'main' target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp index ef36c13b49f1..78bdf4e698d8 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp @@ -34,13 +34,16 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) { HasSingleParameterDotFile = false; - InlineAsmStart = " inline asm"; - InlineAsmEnd = " inline asm"; + InlineAsmStart = " begin inline asm"; + InlineAsmEnd = " end inline asm"; SupportsDebugInformation = CompileForDebugging; // PTX does not allow .align on functions. 
HasFunctionAlignment = false; HasDotTypeDotSizeDirective = false; + // PTX does not allow .hidden or .protected + HiddenDeclarationVisibilityAttr = HiddenVisibilityAttr = MCSA_Invalid; + ProtectedVisibilityAttr = MCSA_Invalid; Data8bitsDirective = " .b8 "; Data16bitsDirective = " .b16 "; diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp index ad7302037cad..e356a965a04b 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp @@ -14,7 +14,6 @@ #include "NVPTXMCTargetDesc.h" #include "InstPrinter/NVPTXInstPrinter.h" #include "NVPTXMCAsmInfo.h" -#include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -49,18 +48,6 @@ createNVPTXMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { return createNVPTXMCSubtargetInfoImpl(TT, CPU, FS); } -static MCCodeGenInfo *createNVPTXMCCodeGenInfo(const Triple &TT, - Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { - MCCodeGenInfo *X = new MCCodeGenInfo(); - - // The default relocation model is used regardless of what the client has - // specified, as it is the only relocation model currently supported. - X->initMCCodeGenInfo(Reloc::Default, CM, OL); - return X; -} - static MCInstPrinter *createNVPTXMCInstPrinter(const Triple &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, @@ -77,9 +64,6 @@ extern "C" void LLVMInitializeNVPTXTargetMC() { // Register the MC asm info. RegisterMCAsmInfo<NVPTXMCAsmInfo> X(*T); - // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(*T, createNVPTXMCCodeGenInfo); - // Register the MC instruction info. TargetRegistry::RegisterMCInstrInfo(*T, createNVPTXMCInstrInfo); diff --git a/lib/Target/NVPTX/Makefile b/lib/Target/NVPTX/Makefile deleted file mode 100644 index 8db20ebed2c2..000000000000 --- a/lib/Target/NVPTX/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -##===- lib/Target/NVPTX/Makefile ---------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. -LIBRARYNAME = LLVMNVPTXCodeGen -TARGET = NVPTX - -# Make sure that tblgen is run, first thing. 
-BUILT_SOURCES = NVPTXGenAsmWriter.inc \ - NVPTXGenDAGISel.inc \ - NVPTXGenInstrInfo.inc \ - NVPTXGenRegisterInfo.inc \ - NVPTXGenSubtargetInfo.inc - -DIRS = InstPrinter TargetInfo MCTargetDesc - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h index e5fae85bacf2..e91385ac13f2 100644 --- a/lib/Target/NVPTX/NVPTX.h +++ b/lib/Target/NVPTX/NVPTX.h @@ -46,8 +46,10 @@ FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM, ModulePass *createNVPTXAssignValidGlobalNamesPass(); ModulePass *createGenericToNVVMPass(); FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass(); -ModulePass *createNVVMReflectPass(); -ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping); +FunctionPass *createNVPTXInferAddressSpacesPass(); +FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion); +FunctionPass *createNVVMReflectPass(); +FunctionPass *createNVVMReflectPass(const StringMap<int> &Mapping); MachineFunctionPass *createNVPTXPrologEpilogPass(); MachineFunctionPass *createNVPTXReplaceImageHandlesPass(); FunctionPass *createNVPTXImageOptimizerPass(); @@ -55,8 +57,6 @@ FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM); BasicBlockPass *createNVPTXLowerAllocaPass(); MachineFunctionPass *createNVPTXPeephole(); -bool isImageOrSamplerVal(const Value *, const Module *); - extern Target TheNVPTXTarget32; extern Target TheNVPTXTarget64; diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td index 96abfa859119..032991a20cc9 100644 --- a/lib/Target/NVPTX/NVPTX.td +++ b/lib/Target/NVPTX/NVPTX.td @@ -44,6 +44,12 @@ def SM52 : SubtargetFeature<"sm_52", "SmVersion", "52", "Target SM 5.2">; def SM53 : SubtargetFeature<"sm_53", "SmVersion", "53", "Target SM 5.3">; +def SM60 : SubtargetFeature<"sm_60", "SmVersion", "60", + "Target SM 6.0">; +def SM61 : SubtargetFeature<"sm_61", "SmVersion", "61", + "Target SM 6.1">; +def SM62 : SubtargetFeature<"sm_62", "SmVersion", "62", + "Target SM 6.2">; // PTX Versions def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32", @@ -54,6 +60,10 @@ def PTX41 : SubtargetFeature<"ptx41", "PTXVersion", "41", "Use PTX version 4.1">; def PTX42 : SubtargetFeature<"ptx42", "PTXVersion", "42", "Use PTX version 4.2">; +def PTX43 : SubtargetFeature<"ptx43", "PTXVersion", "43", + "Use PTX version 4.3">; +def PTX50 : SubtargetFeature<"ptx50", "PTXVersion", "50", + "Use PTX version 5.0">; //===----------------------------------------------------------------------===// // NVPTX supported processors. 
@@ -71,7 +81,9 @@ def : Proc<"sm_37", [SM37, PTX41]>; def : Proc<"sm_50", [SM50, PTX40]>; def : Proc<"sm_52", [SM52, PTX41]>; def : Proc<"sm_53", [SM53, PTX42]>; - +def : Proc<"sm_60", [SM60, PTX50]>; +def : Proc<"sm_61", [SM61, PTX50]>; +def : Proc<"sm_62", [SM62, PTX50]>; def NVPTXInstrInfo : InstrInfo { } diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index e8c36089a779..660016bfcd05 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -117,7 +117,7 @@ void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) { if (ignoreLoc(MI)) return; - DebugLoc curLoc = MI.getDebugLoc(); + const DebugLoc &curLoc = MI.getDebugLoc(); if (!prevDebugLoc && !curLoc) return; @@ -277,7 +277,7 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO, break; case MachineOperand::MO_FPImmediate: { const ConstantFP *Cnt = MO.getFPImm(); - APFloat Val = Cnt->getValueAPF(); + const APFloat &Val = Cnt->getValueAPF(); switch (Cnt->getType()->getTypeID()) { default: report_fatal_error("Unsupported FP type"); break; @@ -432,7 +432,8 @@ bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll( continue; } if (const BasicBlock *PBB = PMBB->getBasicBlock()) { - if (MDNode *LoopID = PBB->getTerminator()->getMetadata("llvm.loop")) { + if (MDNode *LoopID = + PBB->getTerminator()->getMetadata(LLVMContext::MD_loop)) { if (GetUnrollMetadata(LoopID, "llvm.loop.unroll.disable")) return true; } @@ -798,10 +799,18 @@ void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) { if (filenameMap.find(Filename) != filenameMap.end()) continue; filenameMap[Filename] = i; + OutStreamer->EmitDwarfFileDirective(i, "", Filename); ++i; } } +static bool isEmptyXXStructor(GlobalVariable *GV) { + if (!GV) return true; + const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer()); + if (!InitList) return true; // Not an array; we don't know how to parse. + return InitList->getNumOperands() == 0; +} + bool NVPTXAsmPrinter::doInitialization(Module &M) { // Construct a default subtarget off of the TargetMachine defaults. The // rest of NVPTX isn't friendly to change subtargets per function and @@ -812,6 +821,21 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM); const NVPTXSubtarget STI(TT, CPU, FS, NTM); + if (M.alias_size()) { + report_fatal_error("Module has aliases, which NVPTX does not support."); + return true; // error + } + if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_ctors"))) { + report_fatal_error( + "Module has a nontrivial global ctor, which NVPTX does not support."); + return true; // error + } + if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_dtors"))) { + report_fatal_error( + "Module has a nontrivial global dtor, which NVPTX does not support."); + return true; // error + } + SmallString<128> Str1; raw_svector_ostream OS1(Str1); @@ -1017,7 +1041,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, // Skip meta data if (GVar->hasSection()) { - if (GVar->getSection() == StringRef("llvm.metadata")) + if (GVar->getSection() == "llvm.metadata") return; } @@ -1030,7 +1054,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, // GlobalVariables are always constant pointers themselves. 
PointerType *PTy = GVar->getType(); - Type *ETy = PTy->getElementType(); + Type *ETy = GVar->getValueType(); if (GVar->hasExternalLinkage()) { if (GVar->hasInitializer()) @@ -1341,11 +1365,10 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, const DataLayout &DL = getDataLayout(); // GlobalVariables are always constant pointers themselves. - PointerType *PTy = GVar->getType(); - Type *ETy = PTy->getElementType(); + Type *ETy = GVar->getValueType(); O << "."; - emitPTXAddressSpace(PTy->getAddressSpace(), O); + emitPTXAddressSpace(GVar->getType()->getAddressSpace(), O); if (GVar->getAlignment() == 0) O << " .align " << (int)DL.getPrefTypeAlignment(ETy); else @@ -1429,6 +1452,11 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { bool isABI = (nvptxSubtarget->getSmVersion() >= 20); MVT thePointerTy = TLI->getPointerTy(DL); + if (F->arg_empty()) { + O << "()\n"; + return; + } + O << "(\n"; for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, paramIndex++) { @@ -1715,9 +1743,8 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) { return; } if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { - PointerType *PTy = dyn_cast<PointerType>(GVar->getType()); bool IsNonGenericPointer = false; - if (PTy && PTy->getAddressSpace() != 0) { + if (GVar->getType()->getAddressSpace() != 0) { IsNonGenericPointer = true; } if (EmitGeneric && !isa<Function>(CPV) && !IsNonGenericPointer) { @@ -1883,8 +1910,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, case Type::ArrayTyID: case Type::VectorTyID: case Type::StructTyID: { - if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) || - isa<ConstantStruct>(CPV) || isa<ConstantDataSequential>(CPV)) { + if (isa<ConstantAggregate>(CPV) || isa<ConstantDataSequential>(CPV)) { int ElementSize = DL.getTypeAllocSize(CPV->getType()); bufferAggregateConstant(CPV, aggBuffer); if (Bytes > ElementSize) @@ -2315,7 +2341,7 @@ void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) { this->OutStreamer->EmitRawText(temp.str()); } -LineReader *NVPTXAsmPrinter::getReader(std::string filename) { +LineReader *NVPTXAsmPrinter::getReader(const std::string &filename) { if (!reader) { reader = new LineReader(filename); } diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index 76bf179896a8..85660fbdb26e 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -18,14 +18,14 @@ #include "NVPTX.h" #include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetMachine.h" #include <fstream> @@ -293,7 +293,7 @@ private: bool isLoopHeaderOfNoUnroll(const MachineBasicBlock &MBB) const; LineReader *reader; - LineReader *getReader(std::string); + LineReader *getReader(const std::string &); // Used to control the need to emit .generic() in the initializer of // module scope variables. 
diff --git a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp index 95813c8430d1..7c5a54162d77 100644 --- a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp +++ b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp @@ -7,6 +7,9 @@ // //===----------------------------------------------------------------------===// // +// FIXME: This pass is deprecated in favor of NVPTXInferAddressSpaces, which +// uses a new algorithm that handles pointer induction variables. +// // When a load/store accesses the generic address space, checks whether the // address is casted from a non-generic address space. If so, remove this // addrspacecast because accessing non-generic address spaces is typically @@ -164,8 +167,8 @@ Value *NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP( GEP->getSourceElementType(), Cast->getOperand(0), Indices, "", GEPI); NewGEP->setIsInBounds(GEP->isInBounds()); + NewGEP->takeName(GEP); NewASC = new AddrSpaceCastInst(NewGEP, GEP->getType(), "", GEPI); - NewASC->takeName(GEP); // Without RAUWing GEP, the compiler would visit GEP again and emit // redundant instructions. This is exercised in test @rauw in // access-non-generic.ll. @@ -263,7 +266,7 @@ bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI, } bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) { - if (DisableFavorNonGeneric) + if (DisableFavorNonGeneric || skipFunction(F)) return false; bool Changed = false; diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp index 9b34aef3fdec..bbcb497ead9d 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp +++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp @@ -16,7 +16,6 @@ #include "NVPTXRegisterInfo.h" #include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" -#include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -35,7 +34,7 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { if (MF.getFrameInfo()->hasStackObjects()) { assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); - MachineInstr *MI = MBB.begin(); + MachineInstr *MI = &MBB.front(); MachineRegisterInfo &MR = MF.getRegInfo(); // This instruction really occurs before first instruction @@ -70,10 +69,10 @@ void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF, // This function eliminates ADJCALLSTACKDOWN, // ADJCALLSTACKUP pseudo instructions -void NVPTXFrameLowering::eliminateCallFramePseudoInstr( +MachineBasicBlock::iterator NVPTXFrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { // Simply discard ADJCALLSTACKDOWN, // ADJCALLSTACKUP instructions. 
- MBB.erase(I); + return MBB.erase(I); } diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h index 14f8bb7b98fe..320ca9a2f095 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.h +++ b/lib/Target/NVPTX/NVPTXFrameLowering.h @@ -26,7 +26,7 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - void + MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; }; diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index 62ca5e9f9f62..66a964082c5f 100644 --- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -86,7 +86,7 @@ bool GenericToNVVM::runOnModule(Module &M) { !llvm::isTexture(*GV) && !llvm::isSurface(*GV) && !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) { GlobalVariable *NewGV = new GlobalVariable( - M, GV->getType()->getElementType(), GV->isConstant(), + M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), GV->hasInitializer() ? GV->getInitializer() : nullptr, "", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL); @@ -172,7 +172,7 @@ Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F, // See if the address space conversion requires the operand to be bitcast // to i8 addrspace(n)* first. - EVT ExtendedGVType = EVT::getEVT(GVType->getElementType(), true); + EVT ExtendedGVType = EVT::getEVT(GV->getValueType(), true); if (!ExtendedGVType.isInteger() && !ExtendedGVType.isFloatingPoint()) { // A bitcast to i8 addrspace(n)* on the operand is needed. LLVMContext &Context = M->getContext(); @@ -182,21 +182,18 @@ Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F, // Insert the address space conversion. Type *ResultType = PointerType::get(Type::getInt8Ty(Context), llvm::ADDRESS_SPACE_GENERIC); - SmallVector<Type *, 2> ParamTypes; - ParamTypes.push_back(ResultType); - ParamTypes.push_back(DestTy); Function *CVTAFunction = Intrinsic::getDeclaration( - M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes); + M, Intrinsic::nvvm_ptr_global_to_gen, {ResultType, DestTy}); CVTA = Builder.CreateCall(CVTAFunction, CVTA, "cvta"); // Another bitcast from i8 * to <the element type of GVType> * is // required. DestTy = - PointerType::get(GVType->getElementType(), llvm::ADDRESS_SPACE_GENERIC); + PointerType::get(GV->getValueType(), llvm::ADDRESS_SPACE_GENERIC); CVTA = Builder.CreateBitCast(CVTA, DestTy, "cvta"); } else { // A simple CVTA is enough. SmallVector<Type *, 2> ParamTypes; - ParamTypes.push_back(PointerType::get(GVType->getElementType(), + ParamTypes.push_back(PointerType::get(GV->getValueType(), llvm::ADDRESS_SPACE_GENERIC)); ParamTypes.push_back(GVType); Function *CVTAFunction = Intrinsic::getDeclaration( @@ -230,8 +227,7 @@ Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C, if (I != GVMap.end()) { NewValue = getOrInsertCVTA(M, F, I->second, Builder); } - } else if (isa<ConstantVector>(C) || isa<ConstantArray>(C) || - isa<ConstantStruct>(C)) { + } else if (isa<ConstantAggregate>(C)) { // If any element in the constant vector or aggregate C is or uses a global // variable in GVMap, the constant C needs to be reconstructed, using a set // of instructions. 
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 2d0098b392f4..61c6758ef118 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -105,57 +105,66 @@ bool NVPTXDAGToDAGISel::allowFMA() const { /// Select - Select instructions not customized! Used for /// expanded, promoted and normal instructions. -SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { +void NVPTXDAGToDAGISel::Select(SDNode *N) { if (N->isMachineOpcode()) { N->setNodeId(-1); - return nullptr; // Already selected. + return; // Already selected. } - SDNode *ResNode = nullptr; switch (N->getOpcode()) { case ISD::LOAD: - ResNode = SelectLoad(N); + if (tryLoad(N)) + return; break; case ISD::STORE: - ResNode = SelectStore(N); + if (tryStore(N)) + return; break; case NVPTXISD::LoadV2: case NVPTXISD::LoadV4: - ResNode = SelectLoadVector(N); + if (tryLoadVector(N)) + return; break; case NVPTXISD::LDGV2: case NVPTXISD::LDGV4: case NVPTXISD::LDUV2: case NVPTXISD::LDUV4: - ResNode = SelectLDGLDU(N); + if (tryLDGLDU(N)) + return; break; case NVPTXISD::StoreV2: case NVPTXISD::StoreV4: - ResNode = SelectStoreVector(N); + if (tryStoreVector(N)) + return; break; case NVPTXISD::LoadParam: case NVPTXISD::LoadParamV2: case NVPTXISD::LoadParamV4: - ResNode = SelectLoadParam(N); + if (tryLoadParam(N)) + return; break; case NVPTXISD::StoreRetval: case NVPTXISD::StoreRetvalV2: case NVPTXISD::StoreRetvalV4: - ResNode = SelectStoreRetval(N); + if (tryStoreRetval(N)) + return; break; case NVPTXISD::StoreParam: case NVPTXISD::StoreParamV2: case NVPTXISD::StoreParamV4: case NVPTXISD::StoreParamS32: case NVPTXISD::StoreParamU32: - ResNode = SelectStoreParam(N); + if (tryStoreParam(N)) + return; break; case ISD::INTRINSIC_WO_CHAIN: - ResNode = SelectIntrinsicNoChain(N); + if (tryIntrinsicNoChain(N)) + return; break; case ISD::INTRINSIC_W_CHAIN: - ResNode = SelectIntrinsicChain(N); + if (tryIntrinsicChain(N)) + return; break; case NVPTXISD::Tex1DFloatS32: case NVPTXISD::Tex1DFloatFloat: @@ -325,7 +334,8 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { case NVPTXISD::Tld4UnifiedG2DU64Float: case NVPTXISD::Tld4UnifiedB2DU64Float: case NVPTXISD::Tld4UnifiedA2DU64Float: - ResNode = SelectTextureIntrinsic(N); + if (tryTextureIntrinsic(N)) + return; break; case NVPTXISD::Suld1DI8Clamp: case NVPTXISD::Suld1DI16Clamp: @@ -492,37 +502,37 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { case NVPTXISD::Suld3DV4I8Zero: case NVPTXISD::Suld3DV4I16Zero: case NVPTXISD::Suld3DV4I32Zero: - ResNode = SelectSurfaceIntrinsic(N); + if (trySurfaceIntrinsic(N)) + return; break; case ISD::AND: case ISD::SRA: case ISD::SRL: // Try to select BFE - ResNode = SelectBFE(N); + if (tryBFE(N)) + return; break; case ISD::ADDRSPACECAST: - ResNode = SelectAddrSpaceCast(N); - break; + SelectAddrSpaceCast(N); + return; default: break; } - if (ResNode) - return ResNode; - return SelectCode(N); + SelectCode(N); } -SDNode *NVPTXDAGToDAGISel::SelectIntrinsicChain(SDNode *N) { +bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) { unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); switch (IID) { default: - return NULL; + return false; case Intrinsic::nvvm_ldg_global_f: case Intrinsic::nvvm_ldg_global_i: case Intrinsic::nvvm_ldg_global_p: case Intrinsic::nvvm_ldu_global_f: case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_p: - return SelectLDGLDU(N); + return tryLDGLDU(N); } } @@ -579,25 +589,26 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, 
return true; } -SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) { +bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) { unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); switch (IID) { default: - return nullptr; + return false; case Intrinsic::nvvm_texsurf_handle_internal: - return SelectTexSurfHandle(N); + SelectTexSurfHandle(N); + return true; } } -SDNode *NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) { +void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) { // Op 0 is the intrinsic ID SDValue Wrapper = N->getOperand(1); SDValue GlobalVal = Wrapper.getOperand(0); - return CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N), MVT::i64, - GlobalVal); + ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N), + MVT::i64, GlobalVal)); } -SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { +void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { SDValue Src = N->getOperand(0); AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N); unsigned SrcAddrSpace = CastN->getSrcAddressSpace(); @@ -624,7 +635,9 @@ SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { Opc = TM.is64Bit() ? NVPTX::cvta_local_yes_64 : NVPTX::cvta_local_yes; break; } - return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src); + ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), + Src)); + return; } else { // Generic to specific if (SrcAddrSpace != 0) @@ -653,11 +666,13 @@ SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { : NVPTX::nvvm_ptr_gen_to_param; break; } - return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src); + ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), + Src)); + return; } } -SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { +bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { SDLoc dl(N); LoadSDNode *LD = cast<LoadSDNode>(N); EVT LoadedVT = LD->getMemoryVT(); @@ -665,16 +680,16 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { // do not support pre/post inc/dec if (LD->isIndexed()) - return nullptr; + return false; if (!LoadedVT.isSimple()) - return nullptr; + return false; // Address Space Setting unsigned int codeAddrSpace = getCodeAddrSpace(LD); if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) { - return SelectLDGLDU(N); + return tryLDGLDU(N); } // Volatile Setting @@ -695,7 +710,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { else if (num == 4) vecType = NVPTX::PTXLdStInstCode::V4; else - return nullptr; + return false; } // Type Setting: fromType + fromTypeWidth @@ -744,7 +759,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_avar; break; default: - return nullptr; + return false; } SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), @@ -772,7 +787,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_asi; break; default: - return nullptr; + return false; } SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), @@ -801,7 +816,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_ari_64; break; default: - return nullptr; + return false; } } else { switch (TargetVT) { @@ -824,7 +839,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_ari; break; default: - return nullptr; + return false; } } SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl), @@ -853,7 +868,7 @@ SDNode 
*NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_areg_64; break; default: - return nullptr; + return false; } } else { switch (TargetVT) { @@ -876,7 +891,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_areg; break; default: - return nullptr; + return false; } } SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl), @@ -885,16 +900,18 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops); } - if (NVPTXLD) { - MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); - MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); - cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1); - } + if (!NVPTXLD) + return false; + + MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); + MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1); - return NVPTXLD; + ReplaceNode(N, NVPTXLD); + return true; } -SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { +bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { SDValue Chain = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -906,13 +923,13 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { EVT LoadedVT = MemSD->getMemoryVT(); if (!LoadedVT.isSimple()) - return nullptr; + return false; // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD); if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) { - return SelectLDGLDU(N); + return tryLDGLDU(N); } // Volatile Setting @@ -956,7 +973,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { VecType = NVPTX::PTXLdStInstCode::V4; break; default: - return nullptr; + return false; } EVT EltVT = N->getValueType(0); @@ -964,11 +981,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_avar; break; @@ -992,7 +1009,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_avar; break; @@ -1017,11 +1034,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) { switch (N->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_asi; break; @@ -1045,7 +1062,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_asi; break; @@ -1071,11 +1088,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { if (TM.is64Bit()) { switch (N->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_ari_64; break; @@ -1099,7 +1116,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_ari_64; break; @@ -1118,11 
+1135,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_ari; break; @@ -1146,7 +1163,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_ari; break; @@ -1173,11 +1190,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { if (TM.is64Bit()) { switch (N->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_areg_64; break; @@ -1201,7 +1218,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_areg_64; break; @@ -1220,11 +1237,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_areg; break; @@ -1248,7 +1265,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_areg; break; @@ -1276,17 +1293,18 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1); - return LD; + ReplaceNode(N, LD); + return true; } -SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { +bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { SDValue Chain = N->getOperand(0); SDValue Op1; MemSDNode *Mem; bool IsLDG = true; - // If this is an LDG intrinsic, the address is the third operand. Its its an + // If this is an LDG intrinsic, the address is the third operand. If its an // LDG/LDU SD node (from custom vector handling), then its the second operand if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { Op1 = N->getOperand(2); @@ -1294,7 +1312,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); switch (IID) { default: - return NULL; + return false; case Intrinsic::nvvm_ldg_global_f: case Intrinsic::nvvm_ldg_global_i: case Intrinsic::nvvm_ldg_global_p: @@ -1317,19 +1335,32 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { SDValue Base, Offset, Addr; EVT EltVT = Mem->getMemoryVT(); + unsigned NumElts = 1; if (EltVT.isVector()) { + NumElts = EltVT.getVectorNumElements(); EltVT = EltVT.getVectorElementType(); } + // Build the "promoted" result VTList for the load. If we are really loading + // i8s, then the return type will be promoted to i16 since we do not expose + // 8-bit registers in NVPTX. + EVT NodeVT = (EltVT == MVT::i8) ? 
MVT::i16 : EltVT; + SmallVector<EVT, 5> InstVTs; + for (unsigned i = 0; i != NumElts; ++i) { + InstVTs.push_back(NodeVT); + } + InstVTs.push_back(MVT::Other); + SDVTList InstVTList = CurDAG->getVTList(InstVTs); + if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: - return nullptr; + return false; case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar; break; @@ -1352,7 +1383,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } else { switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar; break; @@ -1377,7 +1408,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar; break; @@ -1401,7 +1432,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar; break; @@ -1425,7 +1456,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar; break; @@ -1443,7 +1474,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar; break; @@ -1461,19 +1492,19 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } SDValue Ops[] = { Addr, Chain }; - LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops); + LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops); } else if (TM.is64Bit() ? 
SelectADDRri64(Op1.getNode(), Op1, Base, Offset) : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { if (TM.is64Bit()) { switch (N->getOpcode()) { default: - return nullptr; + return false; case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64; break; @@ -1496,7 +1527,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } else { switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64; break; @@ -1522,7 +1553,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64; break; @@ -1546,7 +1577,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64; break; @@ -1571,7 +1602,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64; break; @@ -1589,7 +1620,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64; break; @@ -1608,13 +1639,13 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } else { switch (N->getOpcode()) { default: - return nullptr; + return false; case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari; break; @@ -1637,7 +1668,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } else { switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari; break; @@ -1663,7 +1694,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32; break; @@ -1687,7 +1718,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32; break; @@ -1712,7 +1743,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32; break; @@ -1730,7 +1761,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32; break; @@ -1750,18 +1781,18 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { SDValue Ops[] = { Base, Offset, Chain }; - LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops); + LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops); } else { if (TM.is64Bit()) { switch (N->getOpcode()) { default: - return nullptr; + return false; case ISD::LOAD: case 
ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64; break; @@ -1784,7 +1815,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } else { switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64; break; @@ -1810,7 +1841,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64; break; @@ -1834,7 +1865,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64; break; @@ -1859,7 +1890,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64; break; @@ -1877,7 +1908,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64; break; @@ -1896,13 +1927,13 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } else { switch (N->getOpcode()) { default: - return nullptr; + return false; case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg; break; @@ -1925,7 +1956,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } else { switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg; break; @@ -1951,7 +1982,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32; break; @@ -1975,7 +2006,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32; break; @@ -2000,7 +2031,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32; break; @@ -2018,7 +2049,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32; break; @@ -2037,17 +2068,54 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } SDValue Ops[] = { Op1, Chain }; - LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops); + LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops); } MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); MemRefs0[0] = Mem->getMemOperand(); cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1); - return LD; + // For automatic generation of LDG (through SelectLoad[Vector], not the + // intrinsics), we may have an extending load like: + // + // i32,ch 
= load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64 + // + // In this case, the matching logic above will select a load for the original + // memory type (in this case, i8) and our types will not match (the node needs + // to return an i32 in this case). Our LDG/LDU nodes do not support the + // concept of sign-/zero-extension, so emulate it here by adding an explicit + // CVT instruction. Ptxas should clean up any redundancies here. + + EVT OrigType = N->getValueType(0); + LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N); + + if (OrigType != EltVT && LdNode) { + // We have an extending-load. The instruction we selected operates on the + // smaller type, but the SDNode we are replacing has the larger type. We + // need to emit a CVT to make the types match. + bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD; + unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(), + EltVT.getSimpleVT(), IsSigned); + + // For each output value, apply the manual sign/zero-extension and make sure + // all users of the load go through that CVT. + for (unsigned i = 0; i != NumElts; ++i) { + SDValue Res(LD, i); + SDValue OrigVal(N, i); + + SDNode *CvtNode = + CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res, + CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, + DL, MVT::i32)); + ReplaceUses(OrigVal, SDValue(CvtNode, 0)); + } + } + + ReplaceNode(N, LD); + return true; } -SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { +bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { SDLoc dl(N); StoreSDNode *ST = cast<StoreSDNode>(N); EVT StoreVT = ST->getMemoryVT(); @@ -2055,10 +2123,10 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { // do not support pre/post inc/dec if (ST->isIndexed()) - return nullptr; + return false; if (!StoreVT.isSimple()) - return nullptr; + return false; // Address Space Setting unsigned int codeAddrSpace = getCodeAddrSpace(ST); @@ -2081,7 +2149,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { else if (num == 4) vecType = NVPTX::PTXLdStInstCode::V4; else - return nullptr; + return false; } // Type Setting: toType + toTypeWidth @@ -2125,7 +2193,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_avar; break; default: - return nullptr; + return false; } SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl), @@ -2154,7 +2222,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_asi; break; default: - return nullptr; + return false; } SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl), @@ -2184,7 +2252,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_ari_64; break; default: - return nullptr; + return false; } } else { switch (SourceVT) { @@ -2207,7 +2275,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_ari; break; default: - return nullptr; + return false; } } SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), @@ -2237,7 +2305,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_areg_64; break; default: - return nullptr; + return false; } } else { switch (SourceVT) { @@ -2260,7 +2328,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_areg; break; default: - return nullptr; + return false; } } SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), @@ -2270,16 +2338,17 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops); } - if (NVPTXST) { - 
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); - MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); - cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1); - } + if (!NVPTXST) + return false; - return NVPTXST; + MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); + MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1); + ReplaceNode(N, NVPTXST); + return true; } -SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { +bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { SDValue Chain = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue Addr, Offset, Base; @@ -2337,7 +2406,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { N2 = N->getOperand(5); break; default: - return nullptr; + return false; } StOps.push_back(getI32Imm(IsVolatile, DL)); @@ -2349,11 +2418,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { if (SelectDirectAddr(N2, Addr)) { switch (N->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::STV_i8_v2_avar; break; @@ -2377,7 +2446,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::STV_i8_v4_avar; break; @@ -2398,11 +2467,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { : SelectADDRsi(N2.getNode(), N2, Base, Offset)) { switch (N->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::STV_i8_v2_asi; break; @@ -2426,7 +2495,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::STV_i8_v4_asi; break; @@ -2449,11 +2518,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { if (TM.is64Bit()) { switch (N->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::STV_i8_v2_ari_64; break; @@ -2477,7 +2546,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::STV_i8_v4_ari_64; break; @@ -2496,11 +2565,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::STV_i8_v2_ari; break; @@ -2524,7 +2593,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::STV_i8_v4_ari; break; @@ -2547,11 +2616,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { if (TM.is64Bit()) { switch (N->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::STV_i8_v2_areg_64; break; @@ -2575,7 +2644,7 
@@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::STV_i8_v4_areg_64; break; @@ -2594,11 +2663,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::STV_i8_v2_areg; break; @@ -2622,7 +2691,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i8: Opcode = NVPTX::STV_i8_v4_areg; break; @@ -2650,10 +2719,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); cast<MachineSDNode>(ST)->setMemRefs(MemRefs0, MemRefs0 + 1); - return ST; + ReplaceNode(N, ST); + return true; } -SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { +bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { SDValue Chain = Node->getOperand(0); SDValue Offset = Node->getOperand(2); SDValue Flag = Node->getOperand(3); @@ -2663,7 +2733,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { unsigned VecSize; switch (Node->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::LoadParam: VecSize = 1; break; @@ -2682,11 +2752,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { switch (VecSize) { default: - return nullptr; + return false; case 1: switch (MemVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i1: Opc = NVPTX::LoadParamMemI8; break; @@ -2713,7 +2783,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { case 2: switch (MemVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i1: Opc = NVPTX::LoadParamMemV2I8; break; @@ -2740,7 +2810,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { case 4: switch (MemVT.getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i1: Opc = NVPTX::LoadParamMemV4I8; break; @@ -2777,10 +2847,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { Ops.push_back(Chain); Ops.push_back(Flag); - return CurDAG->getMachineNode(Opc, DL, VTs, Ops); + ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, VTs, Ops)); + return true; } -SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { +bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { SDLoc DL(N); SDValue Chain = N->getOperand(0); SDValue Offset = N->getOperand(1); @@ -2791,7 +2862,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { unsigned NumElts = 1; switch (N->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::StoreRetval: NumElts = 1; break; @@ -2816,11 +2887,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { unsigned Opcode = 0; switch (NumElts) { default: - return nullptr; + return false; case 1: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i1: Opcode = NVPTX::StoreRetvalI8; break; @@ -2847,7 +2918,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { case 2: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i1: Opcode = NVPTX::StoreRetvalV2I8; break; @@ -2874,7 +2945,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { case 4: switch 
(Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i1: Opcode = NVPTX::StoreRetvalV4I8; break; @@ -2900,10 +2971,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1); - return Ret; + ReplaceNode(N, Ret); + return true; } -SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { +bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { SDLoc DL(N); SDValue Chain = N->getOperand(0); SDValue Param = N->getOperand(1); @@ -2917,7 +2989,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { unsigned NumElts = 1; switch (N->getOpcode()) { default: - return nullptr; + return false; case NVPTXISD::StoreParamU32: case NVPTXISD::StoreParamS32: case NVPTXISD::StoreParam: @@ -2948,11 +3020,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { default: switch (NumElts) { default: - return nullptr; + return false; case 1: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i1: Opcode = NVPTX::StoreParamI8; break; @@ -2979,7 +3051,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { case 2: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i1: Opcode = NVPTX::StoreParamV2I8; break; @@ -3006,7 +3078,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { case 4: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return nullptr; + return false; case MVT::i1: Opcode = NVPTX::StoreParamV4I8; break; @@ -3056,17 +3128,17 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1); - return Ret; + ReplaceNode(N, Ret); + return true; } -SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) { +bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) { SDValue Chain = N->getOperand(0); - SDNode *Ret = nullptr; unsigned Opc = 0; SmallVector<SDValue, 8> Ops; switch (N->getOpcode()) { - default: return nullptr; + default: return false; case NVPTXISD::Tex1DFloatS32: Opc = NVPTX::TEX_1D_F32_S32; break; @@ -3579,18 +3651,17 @@ SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) { } Ops.push_back(Chain); - Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops); - return Ret; + ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops)); + return true; } -SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) { +bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) { SDValue Chain = N->getOperand(0); SDValue TexHandle = N->getOperand(1); - SDNode *Ret = nullptr; unsigned Opc = 0; SmallVector<SDValue, 8> Ops; switch (N->getOpcode()) { - default: return nullptr; + default: return false; case NVPTXISD::Suld1DI8Clamp: Opc = NVPTX::SULD_1D_I8_CLAMP; Ops.push_back(TexHandle); @@ -4780,14 +4851,14 @@ SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) { Ops.push_back(Chain); break; } - Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops); - return Ret; + ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops)); + return true; } /// SelectBFE - Look for instruction sequences that can be made more efficient /// by using the 'bfe' (bit-field extract) PTX instruction -SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) { +bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) { SDLoc DL(N); SDValue LHS = N->getOperand(0); SDValue RHS = 
N->getOperand(1); @@ -4806,7 +4877,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) { ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS); if (!Mask) { // We need a constant mask on the RHS of the AND - return NULL; + return false; } // Extract the mask bits @@ -4815,7 +4886,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) { // We *could* handle shifted masks here, but doing so would require an // 'and' operation to fix up the low-order bits so we would trade // shr+and for bfe+and, which has the same throughput - return NULL; + return false; } // How many bits are in our mask? @@ -4836,7 +4907,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) { // Do not handle the case where bits have been shifted in. In theory // we could handle this, but the cost is likely higher than just // emitting the srl/and pair. - return NULL; + return false; } Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32); } else { @@ -4844,20 +4915,20 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) { // was found) is not constant. We could handle this case, but it would // require run-time logic that would be more expensive than just // emitting the srl/and pair. - return NULL; + return false; } } else { // Do not handle the case where the LHS of the and is not a shift. While // it would be trivial to handle this case, it would just transform // 'and' -> 'bfe', but 'and' has higher-throughput. - return NULL; + return false; } } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) { if (LHS->getOpcode() == ISD::AND) { ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS); if (!ShiftCnst) { // Shift amount must be constant - return NULL; + return false; } uint64_t ShiftAmt = ShiftCnst->getZExtValue(); @@ -4873,7 +4944,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) { ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS); if (!MaskCnst) { // Mask must be constant - return NULL; + return false; } uint64_t MaskVal = MaskCnst->getZExtValue(); @@ -4893,13 +4964,13 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) { NumBits = NumZeros + NumOnes - ShiftAmt; } else { // This is not a mask we can handle - return NULL; + return false; } if (ShiftAmt < NumZeros) { // Handling this case would require extra logic that would make this // transformation non-profitable - return NULL; + return false; } Val = AndLHS; @@ -4919,7 +4990,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) { ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS); if (!ShlCnst) { // Shift amount must be constant - return NULL; + return false; } uint64_t InnerShiftAmt = ShlCnst->getZExtValue(); @@ -4927,20 +4998,20 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) { ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS); if (!ShrCnst) { // Shift amount must be constant - return NULL; + return false; } uint64_t OuterShiftAmt = ShrCnst->getZExtValue(); // To avoid extra codegen and be profitable, we need Outer >= Inner if (OuterShiftAmt < InnerShiftAmt) { - return NULL; + return false; } // If the outer shift is more than the type size, we have no bitfield to // extract (since we also check that the inner shift is <= the outer shift // then this also implies that the inner shift is < the type size) if (OuterShiftAmt >= Val.getValueType().getSizeInBits()) { - return NULL; + return false; } Start = @@ -4956,11 +5027,11 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) { } } else { // No can do... - return NULL; + return false; } } else { // No can do... 
- return NULL; + return false; } @@ -4981,14 +5052,15 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) { } } else { // We cannot handle this type - return NULL; + return false; } SDValue Ops[] = { Val, Start, Len }; - return CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops); + ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops)); + return true; } // SelectDirectAddr - Match a direct address for DAG. @@ -5122,3 +5194,57 @@ bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand( } return true; } + +/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a +/// conversion from \p SrcTy to \p DestTy. +unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy, + bool IsSigned) { + switch (SrcTy.SimpleTy) { + default: + llvm_unreachable("Unhandled source type"); + case MVT::i8: + switch (DestTy.SimpleTy) { + default: + llvm_unreachable("Unhandled dest type"); + case MVT::i16: + return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8; + case MVT::i32: + return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8; + case MVT::i64: + return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8; + } + case MVT::i16: + switch (DestTy.SimpleTy) { + default: + llvm_unreachable("Unhandled dest type"); + case MVT::i8: + return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16; + case MVT::i32: + return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16; + case MVT::i64: + return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16; + } + case MVT::i32: + switch (DestTy.SimpleTy) { + default: + llvm_unreachable("Unhandled dest type"); + case MVT::i8: + return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32; + case MVT::i16: + return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32; + case MVT::i64: + return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32; + } + case MVT::i64: + switch (DestTy.SimpleTy) { + default: + llvm_unreachable("Unhandled dest type"); + case MVT::i8: + return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64; + case MVT::i16: + return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64; + case MVT::i32: + return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64; + } + } +} diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index fe20580c83a2..d53c92f1eff3 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -21,9 +21,8 @@ #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Compiler.h" -using namespace llvm; -namespace { +namespace llvm { class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { const NVPTXTargetMachine &TM; @@ -54,24 +53,24 @@ private: // Include the pieces autogenerated from the target description. 
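// (Editor's note, not part of the patch: the SelectFoo -> tryFoo renames and
// the SDNode* -> bool/void signature changes below track the SelectionDAGISel
// interface change in which Select() no longer hands back a replacement node;
// selected nodes are installed with ReplaceNode(N, NewNode), as the .cpp
// changes above do, and the try* helpers only report whether they matched.)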
#include "NVPTXGenDAGISel.inc" - SDNode *Select(SDNode *N) override; - SDNode *SelectIntrinsicNoChain(SDNode *N); - SDNode *SelectIntrinsicChain(SDNode *N); - SDNode *SelectTexSurfHandle(SDNode *N); - SDNode *SelectLoad(SDNode *N); - SDNode *SelectLoadVector(SDNode *N); - SDNode *SelectLDGLDU(SDNode *N); - SDNode *SelectStore(SDNode *N); - SDNode *SelectStoreVector(SDNode *N); - SDNode *SelectLoadParam(SDNode *N); - SDNode *SelectStoreRetval(SDNode *N); - SDNode *SelectStoreParam(SDNode *N); - SDNode *SelectAddrSpaceCast(SDNode *N); - SDNode *SelectTextureIntrinsic(SDNode *N); - SDNode *SelectSurfaceIntrinsic(SDNode *N); - SDNode *SelectBFE(SDNode *N); - - inline SDValue getI32Imm(unsigned Imm, SDLoc DL) { + void Select(SDNode *N) override; + bool tryIntrinsicNoChain(SDNode *N); + bool tryIntrinsicChain(SDNode *N); + void SelectTexSurfHandle(SDNode *N); + bool tryLoad(SDNode *N); + bool tryLoadVector(SDNode *N); + bool tryLDGLDU(SDNode *N); + bool tryStore(SDNode *N); + bool tryStoreVector(SDNode *N); + bool tryLoadParam(SDNode *N); + bool tryStoreRetval(SDNode *N); + bool tryStoreParam(SDNode *N); + void SelectAddrSpaceCast(SDNode *N); + bool tryTextureIntrinsic(SDNode *N); + bool trySurfaceIntrinsic(SDNode *N); + bool tryBFE(SDNode *N); + + inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } @@ -94,7 +93,8 @@ private: bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const; + static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, bool IsSigned); }; -} +} // end namespace llvm #endif diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index be735f6c1bce..f28c89cd976a 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -257,15 +257,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::CTLZ, MVT::i16, Legal); setOperationAction(ISD::CTLZ, MVT::i32, Legal); setOperationAction(ISD::CTLZ, MVT::i64, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Legal); setOperationAction(ISD::CTTZ, MVT::i16, Expand); setOperationAction(ISD::CTTZ, MVT::i32, Expand); setOperationAction(ISD::CTTZ, MVT::i64, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); setOperationAction(ISD::CTPOP, MVT::i16, Legal); setOperationAction(ISD::CTPOP, MVT::i32, Legal); setOperationAction(ISD::CTPOP, MVT::i64, Legal); @@ -273,6 +267,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // PTX does not directly support SELP of i1, so promote to i32 first setOperationAction(ISD::SELECT, MVT::i1, Custom); + // PTX cannot multiply two i64s in a single instruction. 
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + // We have some custom DAG combine patterns for these nodes setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::AND); @@ -310,8 +308,12 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { return "NVPTXISD::DeclareRetParam"; case NVPTXISD::PrintCall: return "NVPTXISD::PrintCall"; + case NVPTXISD::PrintConvergentCall: + return "NVPTXISD::PrintConvergentCall"; case NVPTXISD::PrintCallUni: return "NVPTXISD::PrintCallUni"; + case NVPTXISD::PrintConvergentCallUni: + return "NVPTXISD::PrintConvergentCallUni"; case NVPTXISD::LoadParam: return "NVPTXISD::LoadParam"; case NVPTXISD::LoadParamV2: @@ -1309,9 +1311,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag }; unsigned opcode = NVPTXISD::StoreParam; - if (Outs[OIdx].Flags.isZExt()) + if (Outs[OIdx].Flags.isZExt() && VT.getSizeInBits() < 32) opcode = NVPTXISD::StoreParamU32; - else if (Outs[OIdx].Flags.isSExt()) + else if (Outs[OIdx].Flags.isSExt() && VT.getSizeInBits() < 32) opcode = NVPTXISD::StoreParamS32; Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps, VT, MachinePointerInfo()); @@ -1351,8 +1353,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx], DAG.getConstant(curOffset, dl, PtrVT)); SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, - MachinePointerInfo(), false, false, false, - PartAlign); + MachinePointerInfo(), PartAlign); if (elemtype.getSizeInBits() < 16) { theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); } @@ -1435,8 +1436,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue PrintCallOps[] = { Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag }; - Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall), - dl, PrintCallVTs, PrintCallOps); + // We model convergent calls as separate opcodes. + unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall; + if (CLI.IsConvergent) + Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni + : NVPTXISD::PrintConvergentCall; + Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); InFlag = Chain.getValue(1); // Ops to print out the function name @@ -1608,9 +1613,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, for (unsigned i = 0, e = Ins.size(); i != e; ++i) { unsigned sz = VTs[i].getSizeInBits(); unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]); - bool needTruncate = sz < 8; - if (VTs[i].isInteger() && (sz < 8)) + bool needTruncate = false; + if (VTs[i].isInteger() && sz < 8) { sz = 8; + needTruncate = true; + } SmallVector<EVT, 4> LoadRetVTs; EVT TheLoadType = VTs[i]; @@ -1619,10 +1626,16 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // aggregates. LoadRetVTs.push_back(MVT::i32); TheLoadType = MVT::i32; + needTruncate = true; } else if (sz < 16) { // If loading i1/i8 result, generate // load i8 (-> i16) // trunc i16 to i1/i8 + + // FIXME: Do we need to set needTruncate to true here, too? We could + // not figure out what this branch is for in D17872, so we left it + // alone. The comment above about loading i1/i8 may be wrong, as the + // branch above seems to cover integers of size < 32. 
LoadRetVTs.push_back(MVT::i16); } else LoadRetVTs.push_back(Ins[i].VT); @@ -1678,7 +1691,7 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { DAG.getIntPtrConstant(j, dl))); } } - return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops); + return DAG.getBuildVector(Node->getValueType(0), dl, Ops); } /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which @@ -1872,10 +1885,9 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { assert(LD->getExtensionType() == ISD::NON_EXTLOAD); assert(Node->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only"); - SDValue newLD = - DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), - LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(), - LD->isInvariant(), LD->getAlignment()); + SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), + LD->getPointerInfo(), LD->getAlignment(), + LD->getMemOperand()->getFlags()); SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); // The legalizer (the caller) is expecting two values from the legalized // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() @@ -2002,13 +2014,10 @@ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { SDValue Tmp2 = ST->getBasePtr(); SDValue Tmp3 = ST->getValue(); assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); - unsigned Alignment = ST->getAlignment(); - bool isVolatile = ST->isVolatile(); - bool isNonTemporal = ST->isNonTemporal(); Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); - SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, - ST->getPointerInfo(), MVT::i8, isNonTemporal, - isVolatile, Alignment); + SDValue Result = + DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, + ST->getAlignment(), ST->getMemOperand()->getFlags()); return Result; } @@ -2027,7 +2036,7 @@ NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { // Check to see if the kernel argument is image*_t or sampler_t -bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { +static bool isImageOrSamplerVal(const Value *arg, const Module *context) { static const char *const specialTypes[] = { "struct._image2d_t", "struct._image3d_t", "struct._sampler_t" }; @@ -2042,16 +2051,17 @@ bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { return false; auto *STy = dyn_cast<StructType>(PTy->getElementType()); - const std::string TypeName = STy && !STy->isLiteral() ? STy->getName() : ""; + if (!STy || STy->isLiteral()) + return false; return std::find(std::begin(specialTypes), std::end(specialTypes), - TypeName) != std::end(specialTypes); + STy->getName()) != std::end(specialTypes); } SDValue NVPTXTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); const DataLayout &DL = DAG.getDataLayout(); auto PtrVT = getPointerTy(DAG.getDataLayout()); @@ -2171,12 +2181,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr, - MachinePointerInfo(srcValue), partVT, false, - false, false, partAlign); + MachinePointerInfo(srcValue), partVT, partAlign); } else { p = DAG.getLoad(partVT, dl, Root, srcAddr, - MachinePointerInfo(srcValue), false, false, false, - partAlign); + MachinePointerInfo(srcValue), partAlign); } if (p.getNode()) p.getNode()->setIROrder(idx + 1); @@ -2202,9 +2210,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( Value *SrcValue = Constant::getNullValue(PointerType::get( EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); SDValue P = DAG.getLoad( - EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false, - true, - DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext()))); + EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), + DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())), + MachineMemOperand::MOInvariant); if (P.getNode()) P.getNode()->setIROrder(idx + 1); @@ -2219,9 +2227,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( Value *SrcValue = Constant::getNullValue(PointerType::get( VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); SDValue P = DAG.getLoad( - VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false, - true, - DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext()))); + VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), + DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())), + MachineMemOperand::MOInvariant); if (P.getNode()) P.getNode()->setIROrder(idx + 1); @@ -2241,10 +2249,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( } else { // V4 loads // We have at least 4 elements (<3 x Ty> expands to 4 elements) and - // the - // vector will be expanded to a power of 2 elements, so we know we can - // always round up to the next multiple of 4 when creating the vector - // loads. + // the vector will be expanded to a power of 2 elements, so we know we + // can always round up to the next multiple of 4 when creating the + // vector loads. // e.g. 
4 elem => 1 ld.v4 // 6 elem => 2 ld.v4 // 8 elem => 2 ld.v4 @@ -2262,9 +2269,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, DAG.getConstant(Ofst, dl, PtrVT)); SDValue P = DAG.getLoad( - VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false, - false, true, - DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext()))); + VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), + DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())), + MachineMemOperand::MOInvariant); if (P.getNode()) P.getNode()->setIROrder(idx + 1); @@ -2298,12 +2305,11 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( ISD::SEXTLOAD : ISD::ZEXTLOAD; p = DAG.getExtLoad( ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue), - ObjectVT, false, false, false, + ObjectVT, DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); } else { p = DAG.getLoad( - Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue), false, - false, false, + Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue), DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); } if (p.getNode()) @@ -2350,13 +2356,12 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( return Chain; } - SDValue NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, - SDLoc dl, SelectionDAG &DAG) const { + const SDLoc &dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const Function *F = MF.getFunction(); Type *RetTy = F->getReturnType(); @@ -3940,9 +3945,8 @@ static SDValue PerformADDCombine(SDNode *N, SDValue N1 = N->getOperand(1); // First try with the default operand order. - SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, - OptLevel); - if (Result.getNode()) + if (SDValue Result = + PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel)) return Result; // If that didn't work, try again with the operands commuted. 
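(Editorial aside, not part of the patch.) The change just above is purely stylistic: the early return now initializes Result inside the if-condition. For readers unfamiliar with this combine, the DAG shape an ADD combine on this target is typically after is an add fed by a mul, which can then presumably be contracted into a single multiply-add. A minimal LLVM IR sketch, my own illustration rather than anything taken from the patch or its tests:

    define i32 @mad_candidate(i32 %a, i32 %b, i32 %c) {
      ; a mul feeding an add -- the add(mul(a, b), c) shape a combine like
      ; PerformADDCombineWithOperands would try to contract into one mad
      %m = mul i32 %a, %b
      %s = add i32 %m, %c
      ret i32 %s
    }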
@@ -4139,7 +4143,7 @@ static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, // The RHS can be a demotable op or a constant if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { - APInt Val = CI->getAPIntValue(); + const APInt &Val = CI->getAPIntValue(); if (LHSSign == Unsigned) { return Val.isIntN(OptSize); } else { @@ -4230,8 +4234,7 @@ static SDValue PerformMULCombine(SDNode *N, CodeGenOpt::Level OptLevel) { if (OptLevel > 0) { // Try mul.wide combining at OptLevel > 0 - SDValue Ret = TryMULWIDECombine(N, DCI); - if (Ret.getNode()) + if (SDValue Ret = TryMULWIDECombine(N, DCI)) return Ret; } @@ -4244,8 +4247,7 @@ static SDValue PerformSHLCombine(SDNode *N, CodeGenOpt::Level OptLevel) { if (OptLevel > 0) { // Try mul.wide combining at OptLevel > 0 - SDValue Ret = TryMULWIDECombine(N, DCI); - if (Ret.getNode()) + if (SDValue Ret = TryMULWIDECombine(N, DCI)) return Ret; } @@ -4368,7 +4370,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SDValue LoadChain = NewLD.getValue(NumElts); - SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes); + SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes); Results.push_back(BuildVec); Results.push_back(LoadChain); @@ -4481,7 +4483,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SDValue LoadChain = NewLD.getValue(NumElts); SDValue BuildVec = - DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes); + DAG.getBuildVector(ResVT, DL, ScalarRes); Results.push_back(BuildVec); Results.push_back(LoadChain); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h index 60914c1d09b4..1c32232024d1 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -34,7 +34,9 @@ enum NodeType : unsigned { DeclareRet, DeclareScalarRet, PrintCall, + PrintConvergentCall, PrintCallUni, + PrintConvergentCallUni, CallArgBegin, CallArg, LastCallArg, @@ -475,10 +477,11 @@ public: getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; - SDValue LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + const SDLoc &dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; @@ -488,11 +491,10 @@ public: unsigned retAlignment, const ImmutableCallSite *CS) const; - SDValue - LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, - SelectionDAG &DAG) const override; + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl, + SelectionDAG &DAG) const override; void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp index aa36b6be7250..8d00bbb5e9c2 100644 --- a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp +++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp @@ -50,6 +50,9 @@ NVPTXImageOptimizer::NVPTXImageOptimizer() : FunctionPass(ID) {} bool 
NVPTXImageOptimizer::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + bool Changed = false; InstrToDelete.clear(); diff --git a/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp b/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp new file mode 100644 index 000000000000..e451d273cf44 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp @@ -0,0 +1,586 @@ +//===-- NVPTXInferAddressSpace.cpp - ---------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// CUDA C/C++ includes memory space designation as variable type qualifiers (such +// as __global__ and __shared__). Knowing the space of a memory access allows +// CUDA compilers to emit faster PTX loads and stores. For example, a load from +// shared memory can be translated to `ld.shared` which is roughly 10% faster +// than a generic `ld` on an NVIDIA Tesla K40c. +// +// Unfortunately, type qualifiers only apply to variable declarations, so CUDA +// compilers must infer the memory space of an address expression from +// type-qualified variables. +// +// LLVM IR uses non-zero (so-called) specific address spaces to represent memory +// spaces (e.g. addrspace(3) means shared memory). The Clang frontend +// places only type-qualified variables in specific address spaces, and then +// conservatively `addrspacecast`s each type-qualified variable to addrspace(0) +// (the so-called generic address space) for other instructions to use. +// +// For example, Clang translates the following CUDA code +// __shared__ float a[10]; +// float v = a[i]; +// to +// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]* +// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i +// %v = load float, float* %1 ; emits ld.f32 +// @a is in addrspace(3) since it's type-qualified, but its use from %1 is +// redirected to %0 (the generic version of @a). +// +// The optimization implemented in this file propagates specific address spaces +// from type-qualified variable declarations to their users. For example, it +// optimizes the above IR to +// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i +// %v = load float addrspace(3)* %1 ; emits ld.shared.f32 +// propagating the addrspace(3) from @a to %1. As a result, the NVPTX +// codegen is able to emit ld.shared.f32 for %v. +// +// Address space inference works in two steps. First, it uses a data-flow +// analysis to infer as many generic pointers as possible to point to only one +// specific address space. In the above example, it can prove that %1 only +// points to addrspace(3). This algorithm was published in +// CUDA: Compiling and optimizing for a GPU platform +// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang +// ICCS 2012 +// +// Then, address space inference replaces all refinable generic pointers with +// equivalent specific pointers. +// +// The major challenge of implementing this optimization is handling PHINodes, +// which may create loops in the data flow graph. This brings two complications. +// +// First, the data flow analysis in Step 1 needs to be circular. For example, +// %generic.input = addrspacecast float addrspace(3)* %input to float* +// loop: +// %y = phi [ %generic.input, %y2 ] +// %y2 = getelementptr %y, 1 +// %v = load %y2 +// br ..., label %loop, ...
+// proving %y specific requires proving both %generic.input and %y2 specific, +// but proving %y2 specific circles back to %y. To address this complication, +// the data flow analysis operates on a lattice: +// uninitialized > specific address spaces > generic. +// All address expressions (our implementation only considers phi, bitcast, +// addrspacecast, and getelementptr) start with the uninitialized address space. +// The monotone transfer function moves the address space of a pointer down a +// lattice path from uninitialized to specific and then to generic. A join +// operation of two different specific address spaces pushes the expression down +// to the generic address space. The analysis completes once it reaches a fixed +// point. +// +// Second, IR rewriting in Step 2 also needs to be circular. For example, +// converting %y to addrspace(3) requires the compiler to know the converted +// %y2, but converting %y2 needs the converted %y. To address this complication, +// we break these cycles using "undef" placeholders. When converting an +// instruction `I` to a new address space, if its operand `Op` is not converted +// yet, we let `I` temporarily use `undef` and fix all the uses of undef later. +// For instance, our algorithm first converts %y to +// %y' = phi float addrspace(3)* [ %input, undef ] +// Then, it converts %y2 to +// %y2' = getelementptr %y', 1 +// Finally, it fixes the undef in %y' so that +// %y' = phi float addrspace(3)* [ %input, %y2' ] +// +// TODO: This pass is experimental and not enabled by default. Users can turn it +// on by setting the -nvptx-use-infer-addrspace flag of llc. We plan to replace +// NVPTXNonFavorGenericAddrSpaces with this pass shortly. +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "nvptx-infer-addrspace" + +#include "NVPTX.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + +using namespace llvm; + +namespace { +const unsigned ADDRESS_SPACE_UNINITIALIZED = (unsigned)-1; + +using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>; + +/// \brief NVPTXInferAddressSpaces +class NVPTXInferAddressSpaces: public FunctionPass { +public: + static char ID; + + NVPTXInferAddressSpaces() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override; + +private: + // Returns the new address space of V if updated; otherwise, returns None. + Optional<unsigned> + updateAddressSpace(const Value &V, + const ValueToAddrSpaceMapTy &InferredAddrSpace); + + // Tries to infer the specific address space of each address expression in + // Postorder. + void inferAddressSpaces(const std::vector<Value *> &Postorder, + ValueToAddrSpaceMapTy *InferredAddrSpace); + + // Changes the generic address expressions in function F to point to specific + // address spaces if InferredAddrSpace says so. Postorder is the postorder of + // all generic address expressions in the use-def graph of function F. 
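  // (Editor's illustration, not part of the patch: for the example in the
  //  file header, if InferredAddrSpace maps the generic GEP %1 to
  //  addrspace(3), the rewrite clones %1 as a GEP whose pointer operand is
  //  @a in addrspace(3) and then redirects the loads and stores that used
  //  %1 to that clone.)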
+ bool + rewriteWithNewAddressSpaces(const std::vector<Value *> &Postorder, + const ValueToAddrSpaceMapTy &InferredAddrSpace, + Function *F); +}; +} // end anonymous namespace + +char NVPTXInferAddressSpaces::ID = 0; + +namespace llvm { +void initializeNVPTXInferAddressSpacesPass(PassRegistry &); +} +INITIALIZE_PASS(NVPTXInferAddressSpaces, "nvptx-infer-addrspace", + "Infer address spaces", + false, false) + +// Returns true if V is an address expression. +// TODO: Currently, we consider only phi, bitcast, addrspacecast, and +// getelementptr operators. +static bool isAddressExpression(const Value &V) { + if (!isa<Operator>(V)) + return false; + + switch (cast<Operator>(V).getOpcode()) { + case Instruction::PHI: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + case Instruction::GetElementPtr: + return true; + default: + return false; + } +} + +// Returns the pointer operands of V. +// +// Precondition: V is an address expression. +static SmallVector<Value *, 2> getPointerOperands(const Value &V) { + assert(isAddressExpression(V)); + const Operator& Op = cast<Operator>(V); + switch (Op.getOpcode()) { + case Instruction::PHI: { + auto IncomingValues = cast<PHINode>(Op).incoming_values(); + return SmallVector<Value *, 2>(IncomingValues.begin(), + IncomingValues.end()); + } + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + case Instruction::GetElementPtr: + return {Op.getOperand(0)}; + default: + llvm_unreachable("Unexpected instruction type."); + } +} + +// If V is an unvisited generic address expression, appends V to PostorderStack +// and marks it as visited. +static void appendsGenericAddressExpressionToPostorderStack( + Value *V, std::vector<std::pair<Value *, bool>> *PostorderStack, + DenseSet<Value *> *Visited) { + assert(V->getType()->isPointerTy()); + if (isAddressExpression(*V) && + V->getType()->getPointerAddressSpace() == + AddressSpace::ADDRESS_SPACE_GENERIC) { + if (Visited->insert(V).second) + PostorderStack->push_back(std::make_pair(V, false)); + } +} + +// Returns all generic address expressions in function F. The elements are +// ordered in postorder. +static std::vector<Value *> collectGenericAddressExpressions(Function &F) { + // This function implements a non-recursive postorder traversal of a partial + // use-def graph of function F. + std::vector<std::pair<Value*, bool>> PostorderStack; + // The set of visited expressions. + DenseSet<Value*> Visited; + // We only explore address expressions that are reachable from loads and + // stores for now because we aim at generating faster loads and stores. + for (Instruction &I : instructions(F)) { + if (isa<LoadInst>(I)) { + appendsGenericAddressExpressionToPostorderStack( + I.getOperand(0), &PostorderStack, &Visited); + } else if (isa<StoreInst>(I)) { + appendsGenericAddressExpressionToPostorderStack( + I.getOperand(1), &PostorderStack, &Visited); + } + } + + std::vector<Value *> Postorder; // The resultant postorder. + while (!PostorderStack.empty()) { + // If the operands of the expression on the top are already explored, + // adds that expression to the resultant postorder. + if (PostorderStack.back().second) { + Postorder.push_back(PostorderStack.back().first); + PostorderStack.pop_back(); + continue; + } + // Otherwise, adds its operands to the stack and explores them. 
+ PostorderStack.back().second = true; + for (Value *PtrOperand : getPointerOperands(*PostorderStack.back().first)) { + appendsGenericAddressExpressionToPostorderStack( + PtrOperand, &PostorderStack, &Visited); + } + } + return Postorder; +} + +// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone +// of OperandUse.get() in the new address space. If the clone is not ready yet, +// returns an undef in the new address space as a placeholder. +static Value *operandWithNewAddressSpaceOrCreateUndef( + const Use &OperandUse, unsigned NewAddrSpace, + const ValueToValueMapTy &ValueWithNewAddrSpace, + SmallVectorImpl<const Use *> *UndefUsesToFix) { + Value *Operand = OperandUse.get(); + if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) + return NewOperand; + + UndefUsesToFix->push_back(&OperandUse); + return UndefValue::get( + Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace)); +} + +// Returns a clone of `I` with its operands converted to those specified in +// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an +// operand whose address space needs to be modified might not exist in +// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and +// adds that operand use to UndefUsesToFix so that caller can fix them later. +// +// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast +// from a pointer whose type already matches. Therefore, this function returns a +// Value* instead of an Instruction*. +static Value *cloneInstructionWithNewAddressSpace( + Instruction *I, unsigned NewAddrSpace, + const ValueToValueMapTy &ValueWithNewAddrSpace, + SmallVectorImpl<const Use *> *UndefUsesToFix) { + Type *NewPtrType = + I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace); + + if (I->getOpcode() == Instruction::AddrSpaceCast) { + Value *Src = I->getOperand(0); + // Because `I` is generic, the source address space must be specific. + // Therefore, the inferred address space must be the source space, according + // to our algorithm. + assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace); + if (Src->getType() != NewPtrType) + return new BitCastInst(Src, NewPtrType); + return Src; + } + + // Computes the converted pointer operands. 
+ SmallVector<Value *, 4> NewPointerOperands; + for (const Use &OperandUse : I->operands()) { + if (!OperandUse.get()->getType()->isPointerTy()) + NewPointerOperands.push_back(nullptr); + else + NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef( + OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix)); + } + + switch (I->getOpcode()) { + case Instruction::BitCast: + return new BitCastInst(NewPointerOperands[0], NewPtrType); + case Instruction::PHI: { + assert(I->getType()->isPointerTy()); + PHINode *PHI = cast<PHINode>(I); + PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues()); + for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) { + unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index); + NewPHI->addIncoming(NewPointerOperands[OperandNo], + PHI->getIncomingBlock(Index)); + } + return NewPHI; + } + case Instruction::GetElementPtr: { + GetElementPtrInst *GEP = cast<GetElementPtrInst>(I); + GetElementPtrInst *NewGEP = GetElementPtrInst::Create( + GEP->getSourceElementType(), NewPointerOperands[0], + SmallVector<Value *, 4>(GEP->idx_begin(), GEP->idx_end())); + NewGEP->setIsInBounds(GEP->isInBounds()); + return NewGEP; + } + default: + llvm_unreachable("Unexpected opcode"); + } +} + +// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the +// constant expression `CE` with its operands replaced as specified in +// ValueWithNewAddrSpace. +static Value *cloneConstantExprWithNewAddressSpace( + ConstantExpr *CE, unsigned NewAddrSpace, + const ValueToValueMapTy &ValueWithNewAddrSpace) { + Type *TargetType = + CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace); + + if (CE->getOpcode() == Instruction::AddrSpaceCast) { + // Because CE is generic, the source address space must be specific. + // Therefore, the inferred address space must be the source space according + // to our algorithm. + assert(CE->getOperand(0)->getType()->getPointerAddressSpace() == + NewAddrSpace); + return ConstantExpr::getBitCast(CE->getOperand(0), TargetType); + } + + // Computes the operands of the new constant expression. + SmallVector<Constant *, 4> NewOperands; + for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) { + Constant *Operand = CE->getOperand(Index); + // If the address space of `Operand` needs to be modified, the new operand + // with the new address space should already be in ValueWithNewAddrSpace + // because (1) the constant expressions we consider (i.e. addrspacecast, + // bitcast, and getelementptr) do not incur cycles in the data flow graph + // and (2) this function is called on constant expressions in postorder. + if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) { + NewOperands.push_back(cast<Constant>(NewOperand)); + } else { + // Otherwise, reuses the old operand. + NewOperands.push_back(Operand); + } + } + + if (CE->getOpcode() == Instruction::GetElementPtr) { + // Needs to specify the source type while constructing a getelementptr + // constant expression. + return CE->getWithOperands( + NewOperands, TargetType, /*OnlyIfReduced=*/false, + NewOperands[0]->getType()->getPointerElementType()); + } + + return CE->getWithOperands(NewOperands, TargetType); +} + +// Returns a clone of the value `V`, with its operands replaced as specified in +// ValueWithNewAddrSpace. This function is called on every generic address +// expression whose address space needs to be modified, in postorder. 
+// +// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix. +static Value * +cloneValueWithNewAddressSpace(Value *V, unsigned NewAddrSpace, + const ValueToValueMapTy &ValueWithNewAddrSpace, + SmallVectorImpl<const Use *> *UndefUsesToFix) { + // All values in Postorder are generic address expressions. + assert(isAddressExpression(*V) && + V->getType()->getPointerAddressSpace() == + AddressSpace::ADDRESS_SPACE_GENERIC); + + if (Instruction *I = dyn_cast<Instruction>(V)) { + Value *NewV = cloneInstructionWithNewAddressSpace( + I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix); + if (Instruction *NewI = dyn_cast<Instruction>(NewV)) { + if (NewI->getParent() == nullptr) { + NewI->insertBefore(I); + NewI->takeName(I); + } + } + return NewV; + } + + return cloneConstantExprWithNewAddressSpace( + cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace); +} + +// Defines the join operation on the address space lattice (see the file header +// comments). +static unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) { + if (AS1 == AddressSpace::ADDRESS_SPACE_GENERIC || + AS2 == AddressSpace::ADDRESS_SPACE_GENERIC) + return AddressSpace::ADDRESS_SPACE_GENERIC; + + if (AS1 == ADDRESS_SPACE_UNINITIALIZED) + return AS2; + if (AS2 == ADDRESS_SPACE_UNINITIALIZED) + return AS1; + + // The join of two different specific address spaces is generic. + return AS1 == AS2 ? AS1 : (unsigned)AddressSpace::ADDRESS_SPACE_GENERIC; +} + +bool NVPTXInferAddressSpaces::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + // Collects all generic address expressions in postorder. + std::vector<Value *> Postorder = collectGenericAddressExpressions(F); + + // Runs a data-flow analysis to refine the address spaces of every expression + // in Postorder. + ValueToAddrSpaceMapTy InferredAddrSpace; + inferAddressSpaces(Postorder, &InferredAddrSpace); + + // Changes the address spaces of the generic address expressions who are + // inferred to point to a specific address space. + return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, &F); +} + +void NVPTXInferAddressSpaces::inferAddressSpaces( + const std::vector<Value *> &Postorder, + ValueToAddrSpaceMapTy *InferredAddrSpace) { + SetVector<Value *> Worklist(Postorder.begin(), Postorder.end()); + // Initially, all expressions are in the uninitialized address space. + for (Value *V : Postorder) + (*InferredAddrSpace)[V] = ADDRESS_SPACE_UNINITIALIZED; + + while (!Worklist.empty()) { + Value* V = Worklist.pop_back_val(); + + // Tries to update the address space of the stack top according to the + // address spaces of its operands. + DEBUG(dbgs() << "Updating the address space of\n" + << " " << *V << "\n"); + Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace); + if (!NewAS.hasValue()) + continue; + // If any updates are made, grabs its users to the worklist because + // their address spaces can also be possibly updated. + DEBUG(dbgs() << " to " << NewAS.getValue() << "\n"); + (*InferredAddrSpace)[V] = NewAS.getValue(); + + for (Value *User : V->users()) { + // Skip if User is already in the worklist. + if (Worklist.count(User)) + continue; + + auto Pos = InferredAddrSpace->find(User); + // Our algorithm only updates the address spaces of generic address + // expressions, which are those in InferredAddrSpace. + if (Pos == InferredAddrSpace->end()) + continue; + + // Function updateAddressSpace moves the address space down a lattice + // path. 
Therefore, nothing to do if User is already inferred as + // generic (the bottom element in the lattice). + if (Pos->second == AddressSpace::ADDRESS_SPACE_GENERIC) + continue; + + Worklist.insert(User); + } + } +} + +Optional<unsigned> NVPTXInferAddressSpaces::updateAddressSpace( + const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) { + assert(InferredAddrSpace.count(&V)); + + // The new inferred address space equals the join of the address spaces + // of all its pointer operands. + unsigned NewAS = ADDRESS_SPACE_UNINITIALIZED; + for (Value *PtrOperand : getPointerOperands(V)) { + unsigned OperandAS; + if (InferredAddrSpace.count(PtrOperand)) + OperandAS = InferredAddrSpace.lookup(PtrOperand); + else + OperandAS = PtrOperand->getType()->getPointerAddressSpace(); + NewAS = joinAddressSpaces(NewAS, OperandAS); + // join(generic, *) = generic. So we can break if NewAS is already generic. + if (NewAS == AddressSpace::ADDRESS_SPACE_GENERIC) + break; + } + + unsigned OldAS = InferredAddrSpace.lookup(&V); + assert(OldAS != AddressSpace::ADDRESS_SPACE_GENERIC); + if (OldAS == NewAS) + return None; + return NewAS; +} + +bool NVPTXInferAddressSpaces::rewriteWithNewAddressSpaces( + const std::vector<Value *> &Postorder, + const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) { + // For each address expression to be modified, creates a clone of it with its + // pointer operands converted to the new address space. Since the pointer + // operands are converted, the clone is naturally in the new address space by + // construction. + ValueToValueMapTy ValueWithNewAddrSpace; + SmallVector<const Use *, 32> UndefUsesToFix; + for (Value* V : Postorder) { + unsigned NewAddrSpace = InferredAddrSpace.lookup(V); + if (V->getType()->getPointerAddressSpace() != NewAddrSpace) { + ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace( + V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix); + } + } + + if (ValueWithNewAddrSpace.empty()) + return false; + + // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace. + for (const Use* UndefUse : UndefUsesToFix) { + User *V = UndefUse->getUser(); + User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V)); + unsigned OperandNo = UndefUse->getOperandNo(); + assert(isa<UndefValue>(NewV->getOperand(OperandNo))); + NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get())); + } + + // Replaces the uses of the old address expressions with the new ones. + for (Value *V : Postorder) { + Value *NewV = ValueWithNewAddrSpace.lookup(V); + if (NewV == nullptr) + continue; + + SmallVector<Use *, 4> Uses; + for (Use &U : V->uses()) + Uses.push_back(&U); + DEBUG(dbgs() << "Replacing the uses of " << *V << "\n to\n " << *NewV + << "\n"); + for (Use *U : Uses) { + if (isa<LoadInst>(U->getUser()) || + (isa<StoreInst>(U->getUser()) && U->getOperandNo() == 1)) { + // If V is used as the pointer operand of a load/store, sets the pointer + // operand to NewV. This replacement does not change the element type, + // so the resultant load/store is still valid. + U->set(NewV); + } else if (isa<Instruction>(U->getUser())) { + // Otherwise, replaces the use with generic(NewV). + // TODO: Some optimization opportunities are missed. 
For example, in + // %0 = icmp eq float* %p, %q + // if both p and q are inferred to be shared, we can rewrite %0 as + // %0 = icmp eq float addrspace(3)* %new_p, %new_q + // instead of currently + // %generic_p = addrspacecast float addrspace(3)* %new_p to float* + // %generic_q = addrspacecast float addrspace(3)* %new_q to float* + // %0 = icmp eq float* %generic_p, %generic_q + if (Instruction *I = dyn_cast<Instruction>(V)) { + BasicBlock::iterator InsertPos = std::next(I->getIterator()); + while (isa<PHINode>(InsertPos)) + ++InsertPos; + U->set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos)); + } else { + U->set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV), + V->getType())); + } + } + } + if (V->use_empty()) + RecursivelyDeleteTriviallyDeadInstructions(V); + } + + return true; +} + +FunctionPass *llvm::createNVPTXInferAddressSpacesPass() { + return new NVPTXInferAddressSpaces(); +} diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 9f3cf4551955..0c7c6cbc4512 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -30,9 +30,10 @@ void NVPTXInstrInfo::anchor() {} NVPTXInstrInfo::NVPTXInstrInfo() : NVPTXGenInstrInfo(), RegInfo() {} -void NVPTXInstrInfo::copyPhysReg( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, bool KillSrc) const { +void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); @@ -111,7 +112,7 @@ bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI, bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const { unsigned addrspace = 0; - if (MI->getOpcode() == NVPTX::INT_CUDA_SYNCTHREADS) + if (MI->getOpcode() == NVPTX::INT_BARRIER0) return false; if (isLoadInstr(*MI, addrspace)) if (addrspace == NVPTX::PTXLdStInstCode::SHARED) @@ -145,26 +146,28 @@ bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const { /// Note that RemoveBranch and InsertBranch must be implemented to support /// cases where this method returns success. /// -bool NVPTXInstrInfo::AnalyzeBranch( - MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const { +bool NVPTXInstrInfo::analyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) return false; // Get the last instruction in the block. - MachineInstr *LastInst = I; + MachineInstr &LastInst = *I; // If there is only one terminator instruction, process it. 
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - if (LastInst->getOpcode() == NVPTX::GOTO) { - TBB = LastInst->getOperand(0).getMBB(); + if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { + if (LastInst.getOpcode() == NVPTX::GOTO) { + TBB = LastInst.getOperand(0).getMBB(); return false; - } else if (LastInst->getOpcode() == NVPTX::CBranch) { + } else if (LastInst.getOpcode() == NVPTX::CBranch) { // Block ends with fall-through condbranch. - TBB = LastInst->getOperand(1).getMBB(); - Cond.push_back(LastInst->getOperand(0)); + TBB = LastInst.getOperand(1).getMBB(); + Cond.push_back(LastInst.getOperand(0)); return false; } // Otherwise, don't know what this is. @@ -172,26 +175,26 @@ bool NVPTXInstrInfo::AnalyzeBranch( } // Get the instruction before it if it's a terminator. - MachineInstr *SecondLastInst = I; + MachineInstr &SecondLastInst = *I; // If there are three terminators, we don't know what sort of block this is. - if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + if (I != MBB.begin() && isUnpredicatedTerminator(*--I)) return true; // If the block ends with NVPTX::GOTO and NVPTX:CBranch, handle it. - if (SecondLastInst->getOpcode() == NVPTX::CBranch && - LastInst->getOpcode() == NVPTX::GOTO) { - TBB = SecondLastInst->getOperand(1).getMBB(); - Cond.push_back(SecondLastInst->getOperand(0)); - FBB = LastInst->getOperand(0).getMBB(); + if (SecondLastInst.getOpcode() == NVPTX::CBranch && + LastInst.getOpcode() == NVPTX::GOTO) { + TBB = SecondLastInst.getOperand(1).getMBB(); + Cond.push_back(SecondLastInst.getOperand(0)); + FBB = LastInst.getOperand(0).getMBB(); return false; } // If the block ends with two NVPTX:GOTOs, handle it. The second one is not // executed, so remove it. - if (SecondLastInst->getOpcode() == NVPTX::GOTO && - LastInst->getOpcode() == NVPTX::GOTO) { - TBB = SecondLastInst->getOperand(0).getMBB(); + if (SecondLastInst.getOpcode() == NVPTX::GOTO && + LastInst.getOpcode() == NVPTX::GOTO) { + TBB = SecondLastInst.getOperand(0).getMBB(); I = LastInst; if (AllowModify) I->eraseFromParent(); @@ -226,9 +229,11 @@ unsigned NVPTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return 2; } -unsigned NVPTXInstrInfo::InsertBranch( - MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - ArrayRef<MachineOperand> Cond, DebugLoc DL) const { +unsigned NVPTXInstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + const DebugLoc &DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 1 || Cond.size() == 0) && diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h index 3e407223f010..050bf12fe859 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -49,9 +49,9 @@ public: * const TargetRegisterClass *RC) const; */ - void copyPhysReg( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; virtual bool isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DestReg) const; bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const; @@ -59,13 +59,14 @@ public: virtual bool CanTailMerge(const MachineInstr *MI) const; // Branch analysis. 
- bool AnalyzeBranch( - MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; + bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const override; unsigned RemoveBranch(MachineBasicBlock &MBB) const override; - unsigned InsertBranch( - MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + const DebugLoc &DL) const override; unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const { return MI.getOperand(2).getImm(); } diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index 6fdd60f3ed2d..c158cc6cdab2 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -14,7 +14,9 @@ include "NVPTXInstrFormats.td" // A NOP instruction -def NOP : NVPTXInst<(outs), (ins), "", []>; +let hasSideEffects = 0 in { + def NOP : NVPTXInst<(outs), (ins), "", []>; +} // List of vector specific properties def isVecLD : VecInstTypeEnum<1>; @@ -162,130 +164,146 @@ def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; // Some Common Instruction Class Templates //===----------------------------------------------------------------------===// +// Template for instructions which take three int64, int32, or int16 args. +// The instructions are named "<OpcStr><Width>" (e.g. "add.s64"). multiclass I3<string OpcStr, SDNode OpNode> { - def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, - Int64Regs:$b))]>; - def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; - def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, - Int32Regs:$b))]>; - def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; - def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, - Int16Regs:$b))]>; - def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; + def i64rr : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; + def i64ri : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; + def i32rr : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def i32ri : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, 
imm:$b))]>; + def i16rr : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; + def i16ri : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; } +// Template for instructions which take 3 int32 args. The instructions are +// named "<OpcStr>.s32" (e.g. "addc.cc.s32"). multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> { - def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, - Int32Regs:$b), - !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, - Int32Regs:$b))]>; - def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def i32rr : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def i32ri : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; } +// Template for instructions which take three fp64 or fp32 args. The +// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64"). +// +// Also defines ftz (flush subnormal inputs and results to sign-preserving +// zero) variants for fp32 functions. multiclass F3<string OpcStr, SDNode OpNode> { - def f64rr : NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, Float64Regs:$b), - !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, - (OpNode Float64Regs:$a, Float64Regs:$b))]>, - Requires<[allowFMA]>; - def f64ri : NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, f64imm:$b), - !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, - (OpNode Float64Regs:$a, fpimm:$b))]>, - Requires<[allowFMA]>; - def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, - (OpNode Float32Regs:$a, Float32Regs:$b))]>, - Requires<[allowFMA, doF32FTZ]>; - def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, - (OpNode Float32Regs:$a, fpimm:$b))]>, - Requires<[allowFMA, doF32FTZ]>; - def f32rr : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, - (OpNode Float32Regs:$a, Float32Regs:$b))]>, - Requires<[allowFMA]>; - def f32ri : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, - (OpNode Float32Regs:$a, fpimm:$b))]>, - Requires<[allowFMA]>; + def f64rr : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>, + Requires<[allowFMA]>; + def f64ri : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, f64imm:$b), + !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>, + Requires<[allowFMA]>; + def f32rr_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, 
Float32Regs:$b), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[allowFMA, doF32FTZ]>; + def f32ri_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[allowFMA, doF32FTZ]>; + def f32rr : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[allowFMA]>; + def f32ri : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[allowFMA]>; } +// Same as F3, but defines ".rn" variants (round to nearest even). multiclass F3_rn<string OpcStr, SDNode OpNode> { - def f64rr : NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, Float64Regs:$b), - !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, - (OpNode Float64Regs:$a, Float64Regs:$b))]>, - Requires<[noFMA]>; - def f64ri : NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, f64imm:$b), - !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, - (OpNode Float64Regs:$a, fpimm:$b))]>, - Requires<[noFMA]>; - def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, - (OpNode Float32Regs:$a, Float32Regs:$b))]>, - Requires<[noFMA, doF32FTZ]>; - def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, - (OpNode Float32Regs:$a, fpimm:$b))]>, - Requires<[noFMA, doF32FTZ]>; - def f32rr : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, - (OpNode Float32Regs:$a, Float32Regs:$b))]>, - Requires<[noFMA]>; - def f32ri : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, - (OpNode Float32Regs:$a, fpimm:$b))]>, - Requires<[noFMA]>; + def f64rr : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>, + Requires<[noFMA]>; + def f64ri : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, f64imm:$b), + !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>, + Requires<[noFMA]>; + def f32rr_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[noFMA, doF32FTZ]>; + def f32ri_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[noFMA, doF32FTZ]>; + def f32rr : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[noFMA]>; + def f32ri : + 
NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[noFMA]>; } +// Template for operations which take two f32 or f64 operands. Provides three +// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush +// subnormal inputs and results to zero). multiclass F2<string OpcStr, SDNode OpNode> { - def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a), - !strconcat(OpcStr, ".f64 \t$dst, $a;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>; + def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a), + !strconcat(OpcStr, ".f64 \t$dst, $a;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>; def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), - !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>, - Requires<[doF32FTZ]>; - def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), - !strconcat(OpcStr, ".f32 \t$dst, $a;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>; + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>, + Requires<[doF32FTZ]>; + def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), + !strconcat(OpcStr, ".f32 \t$dst, $a;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>; } //===----------------------------------------------------------------------===// @@ -293,160 +311,251 @@ multiclass F2<string OpcStr, SDNode OpNode> { //===----------------------------------------------------------------------===// //----------------------------------- -// General Type Conversion +// Type Conversion //----------------------------------- let hasSideEffects = 0 in { -// Generate a cvt to the given type from all possible types. -// Each instance takes a CvtMode immediate that defines the conversion mode to -// use. It can be CvtNONE to omit a conversion mode. 
-multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> { - def _s16 : NVPTXInst<(outs RC:$dst), - (ins Int16Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".s16\t$dst, $src;"), - []>; - def _u16 : NVPTXInst<(outs RC:$dst), - (ins Int16Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".u16\t$dst, $src;"), - []>; - def _f16 : NVPTXInst<(outs RC:$dst), - (ins Int16Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".f16\t$dst, $src;"), - []>; - def _s32 : NVPTXInst<(outs RC:$dst), - (ins Int32Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".s32\t$dst, $src;"), - []>; - def _u32 : NVPTXInst<(outs RC:$dst), - (ins Int32Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".u32\t$dst, $src;"), - []>; - def _s64 : NVPTXInst<(outs RC:$dst), - (ins Int64Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".s64\t$dst, $src;"), - []>; - def _u64 : NVPTXInst<(outs RC:$dst), - (ins Int64Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".u64\t$dst, $src;"), - []>; - def _f32 : NVPTXInst<(outs RC:$dst), - (ins Float32Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".f32\t$dst, $src;"), - []>; - def _f64 : NVPTXInst<(outs RC:$dst), - (ins Float64Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".f64\t$dst, $src;"), - []>; -} - -// Generate a cvt to all possible types. -defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>; -defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>; -defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>; -defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>; -defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>; -defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>; -defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>; -defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>; -defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>; - -// This set of cvt is different from the above. The type of the source -// and target are the same. -// -def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), - "cvt.s16.s8 \t$dst, $src;", []>; -def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), - "cvt.s32.s8 \t$dst, $src;", []>; -def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), - "cvt.s32.s16 \t$dst, $src;", []>; -def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), - "cvt.s64.s8 \t$dst, $src;", []>; -def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), - "cvt.s64.s16 \t$dst, $src;", []>; -def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), - "cvt.s64.s32 \t$dst, $src;", []>; + // Generate a cvt to the given type from all possible types. Each instance + // takes a CvtMode immediate that defines the conversion mode to use. It can + // be CvtNONE to omit a conversion mode. 
+ multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> { + def _s8 : + NVPTXInst<(outs RC:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".s8\t$dst, $src;"), []>; + def _u8 : + NVPTXInst<(outs RC:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".u8\t$dst, $src;"), []>; + def _s16 : + NVPTXInst<(outs RC:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".s16\t$dst, $src;"), []>; + def _u16 : + NVPTXInst<(outs RC:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".u16\t$dst, $src;"), []>; + def _f16 : + NVPTXInst<(outs RC:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".f16\t$dst, $src;"), []>; + def _s32 : + NVPTXInst<(outs RC:$dst), + (ins Int32Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".s32\t$dst, $src;"), []>; + def _u32 : + NVPTXInst<(outs RC:$dst), + (ins Int32Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".u32\t$dst, $src;"), []>; + def _s64 : + NVPTXInst<(outs RC:$dst), + (ins Int64Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".s64\t$dst, $src;"), []>; + def _u64 : + NVPTXInst<(outs RC:$dst), + (ins Int64Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".u64\t$dst, $src;"), []>; + def _f32 : + NVPTXInst<(outs RC:$dst), + (ins Float32Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".f32\t$dst, $src;"), []>; + def _f64 : + NVPTXInst<(outs RC:$dst), + (ins Float64Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".f64\t$dst, $src;"), []>; + } + + // Generate cvts from all types to all types. + defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>; + defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>; + defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>; + defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>; + defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>; + defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>; + defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>; + defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>; + defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>; + defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>; + defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>; + + // These cvts are different from those above: The source and dest registers + // are of the same type. + def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), + "cvt.s16.s8 \t$dst, $src;", []>; + def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), + "cvt.s32.s8 \t$dst, $src;", []>; + def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), + "cvt.s32.s16 \t$dst, $src;", []>; + def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), + "cvt.s64.s8 \t$dst, $src;", []>; + def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), + "cvt.s64.s16 \t$dst, $src;", []>; + def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), + "cvt.s64.s32 \t$dst, $src;", []>; } //----------------------------------- // Integer Arithmetic //----------------------------------- +// Template for xor masquerading as int1 arithmetic. 
multiclass ADD_SUB_i1<SDNode OpNode> { def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), - "xor.pred \t$dst, $a, $b;", - [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; + "xor.pred \t$dst, $a, $b;", + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), - "xor.pred \t$dst, $a, $b;", - [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>; + "xor.pred \t$dst, $a, $b;", + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>; } +// int1 addition and subtraction are both just xor. defm ADD_i1 : ADD_SUB_i1<add>; defm SUB_i1 : ADD_SUB_i1<sub>; - +// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we +// also use these for unsigned arithmetic. defm ADD : I3<"add.s", add>; defm SUB : I3<"sub.s", sub>; +// int32 addition and subtraction with carry-out. +// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?). defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>; defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>; +// int32 addition and subtraction with carry-in and carry-out. defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>; defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>; -//mul.wide PTX instruction +defm MULT : I3<"mul.lo.s", mul>; + +defm MULTHS : I3<"mul.hi.s", mulhs>; +defm MULTHU : I3<"mul.hi.u", mulhu>; + +defm SDIV : I3<"div.s", sdiv>; +defm UDIV : I3<"div.u", udiv>; + +// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM +// will lower it. +defm SREM : I3<"rem.s", srem>; +defm UREM : I3<"rem.u", urem>; + + +// +// Wide multiplication +// +def MULWIDES64 : + NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + "mul.wide.s32 \t$dst, $a, $b;", []>; +def MULWIDES64Imm : + NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + "mul.wide.s32 \t$dst, $a, $b;", []>; +def MULWIDES64Imm64 : + NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), + "mul.wide.s32 \t$dst, $a, $b;", []>; + +def MULWIDEU64 : + NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + "mul.wide.u32 \t$dst, $a, $b;", []>; +def MULWIDEU64Imm : + NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + "mul.wide.u32 \t$dst, $a, $b;", []>; +def MULWIDEU64Imm64 : + NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), + "mul.wide.u32 \t$dst, $a, $b;", []>; + +def MULWIDES32 : + NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + "mul.wide.s16 \t$dst, $a, $b;", []>; +def MULWIDES32Imm : + NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + "mul.wide.s16 \t$dst, $a, $b;", []>; +def MULWIDES32Imm32 : + NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + "mul.wide.s16 \t$dst, $a, $b;", []>; + +def MULWIDEU32 : + NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; +def MULWIDEU32Imm : + NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; +def MULWIDEU32Imm32 : + NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; + +def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>; +def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; +def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; + +// Matchers for signed, unsigned mul.wide ISD nodes. 
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)), + (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)), + (MULWIDES32Imm Int16Regs:$a, imm:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)), + (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)), + (MULWIDEU32Imm Int16Regs:$a, imm:$b)>, + Requires<[doMulWide]>; + +def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)), + (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)), + (MULWIDES64Imm Int32Regs:$a, imm:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)), + (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)), + (MULWIDEU64Imm Int32Regs:$a, imm:$b)>, + Requires<[doMulWide]>; + +// Predicates used for converting some patterns to mul.wide. def SInt32Const : PatLeaf<(imm), [{ const APInt &v = N->getAPIntValue(); - if (v.isSignedIntN(32)) - return true; - return false; + return v.isSignedIntN(32); }]>; def UInt32Const : PatLeaf<(imm), [{ const APInt &v = N->getAPIntValue(); - if (v.isIntN(32)) - return true; - return false; + return v.isIntN(32); }]>; def SInt16Const : PatLeaf<(imm), [{ const APInt &v = N->getAPIntValue(); - if (v.isSignedIntN(16)) - return true; - return false; + return v.isSignedIntN(16); }]>; def UInt16Const : PatLeaf<(imm), [{ const APInt &v = N->getAPIntValue(); - if (v.isIntN(16)) - return true; - return false; + return v.isIntN(16); }]>; def Int5Const : PatLeaf<(imm), [{ + // Check if 0 <= v < 32; only then will the result of (x << v) be an int32. const APInt &v = N->getAPIntValue(); - // Check if 0 <= v < 32 - // Only then the result from (x << v) will be i32 - if (v.sge(0) && v.slt(32)) - return true; - return false; + return v.sge(0) && v.slt(32); }]>; def Int4Const : PatLeaf<(imm), [{ + // Check if 0 <= v < 16; only then will the result of (x << v) be an int16. 
const APInt &v = N->getAPIntValue(); - // Check if 0 <= v < 16 - // Only then the result from (x << v) will be i16 - if (v.sge(0) && v.slt(16)) - return true; - return false; + return v.sge(0) && v.slt(16); }]>; def SHL2MUL32 : SDNodeXForm<imm, [{ @@ -461,215 +570,133 @@ def SHL2MUL16 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16); }]>; -def MULWIDES64 - : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - "mul.wide.s32 \t$dst, $a, $b;", []>; -def MULWIDES64Imm - : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - "mul.wide.s32 \t$dst, $a, $b;", []>; -def MULWIDES64Imm64 - : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), - "mul.wide.s32 \t$dst, $a, $b;", []>; - -def MULWIDEU64 - : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - "mul.wide.u32 \t$dst, $a, $b;", []>; -def MULWIDEU64Imm - : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - "mul.wide.u32 \t$dst, $a, $b;", []>; -def MULWIDEU64Imm64 - : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), - "mul.wide.u32 \t$dst, $a, $b;", []>; - -def MULWIDES32 - : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), - "mul.wide.s16 \t$dst, $a, $b;", []>; -def MULWIDES32Imm - : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), - "mul.wide.s16 \t$dst, $a, $b;", []>; -def MULWIDES32Imm32 - : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), - "mul.wide.s16 \t$dst, $a, $b;", []>; - -def MULWIDEU32 - : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), - "mul.wide.u16 \t$dst, $a, $b;", []>; -def MULWIDEU32Imm - : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), - "mul.wide.u16 \t$dst, $a, $b;", []>; -def MULWIDEU32Imm32 - : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), - "mul.wide.u16 \t$dst, $a, $b;", []>; - +// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide. def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)), (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, - Requires<[doMulWide]>; + Requires<[doMulWide]>; def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)), (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, - Requires<[doMulWide]>; + Requires<[doMulWide]>; def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)), (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>, - Requires<[doMulWide]>; + Requires<[doMulWide]>; def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)), (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>, - Requires<[doMulWide]>; + Requires<[doMulWide]>; +// Convert "sign/zero-extend then multiply" to mul.wide. 
def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)), (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, - Requires<[doMulWide]>; + Requires<[doMulWide]>; def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)), (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>, - Requires<[doMulWide]>; + Requires<[doMulWide]>; def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)), (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>; def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)), (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>, - Requires<[doMulWide]>; + Requires<[doMulWide]>; def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)), (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)), (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>, - Requires<[doMulWide]>; + Requires<[doMulWide]>; def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)), (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)), (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>, - Requires<[doMulWide]>; - - -def SDTMulWide - : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>; -def mul_wide_signed - : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; -def mul_wide_unsigned - : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; - -def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)), - (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; -def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)), - (MULWIDES32Imm Int16Regs:$a, imm:$b)>, - Requires<[doMulWide]>; -def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)), - (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, - Requires<[doMulWide]>; -def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)), - (MULWIDEU32Imm Int16Regs:$a, imm:$b)>, - Requires<[doMulWide]>; - -def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)), - (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, - Requires<[doMulWide]>; -def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)), - (MULWIDES64Imm Int32Regs:$a, imm:$b)>, - Requires<[doMulWide]>; -def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)), - (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, - Requires<[doMulWide]>; -def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)), - (MULWIDEU64Imm Int32Regs:$a, imm:$b)>, - Requires<[doMulWide]>; - -defm MULT : I3<"mul.lo.s", mul>; - -defm MULTHS : I3<"mul.hi.s", mulhs>; -defm MULTHU : I3<"mul.hi.u", mulhu>; - -defm SDIV : I3<"div.s", sdiv>; -defm UDIV : I3<"div.u", udiv>; - -defm SREM : I3<"rem.s", srem>; -// The ri version will not be selected as DAGCombiner::visitSREM will lower it. -defm UREM : I3<"rem.u", urem>; -// The ri version will not be selected as DAGCombiner::visitUREM will lower it. 
- -def SDTIMAD - : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, - SDTCisInt<2>, SDTCisSameAs<0, 2>, - SDTCisSameAs<0, 3>]>; -def imad - : SDNode<"NVPTXISD::IMAD", SDTIMAD>; - -def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, - (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>; -def MAD16rri : NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, - (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>; -def MAD16rir : NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, - (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>; -def MAD16rii : NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, i16imm:$b, i16imm:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, - (imad Int16Regs:$a, imm:$b, imm:$c))]>; - -def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, - (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>; -def MAD32rri : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, - (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>; -def MAD32rir : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, - (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>; -def MAD32rii : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, i32imm:$b, i32imm:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, - (imad Int32Regs:$a, imm:$b, imm:$c))]>; - -def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, - (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>; -def MAD64rri : NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, - (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>; -def MAD64rir : NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, - (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>; -def MAD64rii : NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, i64imm:$b, i64imm:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, - (imad Int64Regs:$a, imm:$b, imm:$c))]>; - -def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), - "neg.s16 \t$dst, $src;", - [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>; -def INEG32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), - "neg.s32 \t$dst, $src;", - [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>; -def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), - "neg.s64 \t$dst, $src;", - [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>; +// +// Integer multiply-add +// +def SDTIMAD : + SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>, + SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>; +def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>; + +def MAD16rrr : + NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>; +def MAD16rri : + NVPTXInst<(outs Int16Regs:$dst), + 
(ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>; +def MAD16rir : + NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>; +def MAD16rii : + NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, i16imm:$b, i16imm:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>; + +def MAD32rrr : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>; +def MAD32rri : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>; +def MAD32rir : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>; +def MAD32rii : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, i32imm:$b, i32imm:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>; + +def MAD64rrr : + NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>; +def MAD64rri : + NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>; +def MAD64rir : + NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>; +def MAD64rii : + NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, i64imm:$b, i64imm:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>; + +def INEG16 : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), + "neg.s16 \t$dst, $src;", + [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>; +def INEG32 : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), + "neg.s32 \t$dst, $src;", + [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>; +def INEG64 : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), + "neg.s64 \t$dst, $src;", + [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>; //----------------------------------- // Floating Point Arithmetic @@ -677,17 +704,13 @@ def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), // Constant 1.0f def FloatConst1 : PatLeaf<(fpimm), [{ - if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEsingle) - return false; - float f = (float)N->getValueAPF().convertToFloat(); - return (f==1.0f); + return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle && + N->getValueAPF().convertToFloat() == 1.0f; }]>; -// Constand (double)1.0 +// Constant 1.0 (double) def DoubleConst1 : PatLeaf<(fpimm), [{ - if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEdouble) - return false; - double d = (double)N->getValueAPF().convertToDouble(); - return (d==1.0); + return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble && + N->getValueAPF().convertToDouble() == 1.0; }]>; defm FADD : F3<"add", fadd>; @@ -698,157 
+721,157 @@ defm FADD_rn : F3_rn<"add", fadd>; defm FSUB_rn : F3_rn<"sub", fsub>; defm FMUL_rn : F3_rn<"mul", fmul>; -defm FABS : F2<"abs", fabs>; -defm FNEG : F2<"neg", fneg>; +defm FABS : F2<"abs", fabs>; +defm FNEG : F2<"neg", fneg>; defm FSQRT : F2<"sqrt.rn", fsqrt>; // // F64 division // -def FDIV641r : NVPTXInst<(outs Float64Regs:$dst), - (ins f64imm:$a, Float64Regs:$b), - "rcp.rn.f64 \t$dst, $b;", - [(set Float64Regs:$dst, - (fdiv DoubleConst1:$a, Float64Regs:$b))]>; -def FDIV64rr : NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, Float64Regs:$b), - "div.rn.f64 \t$dst, $a, $b;", - [(set Float64Regs:$dst, - (fdiv Float64Regs:$a, Float64Regs:$b))]>; -def FDIV64ri : NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, f64imm:$b), - "div.rn.f64 \t$dst, $a, $b;", - [(set Float64Regs:$dst, - (fdiv Float64Regs:$a, fpimm:$b))]>; +def FDIV641r : + NVPTXInst<(outs Float64Regs:$dst), + (ins f64imm:$a, Float64Regs:$b), + "rcp.rn.f64 \t$dst, $b;", + [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>; +def FDIV64rr : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + "div.rn.f64 \t$dst, $a, $b;", + [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>; +def FDIV64ri : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, f64imm:$b), + "div.rn.f64 \t$dst, $a, $b;", + [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>; // // F32 Approximate reciprocal // -def FDIV321r_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), - "rcp.approx.ftz.f32 \t$dst, $b;", - [(set Float32Regs:$dst, - (fdiv FloatConst1:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_APPROX, doF32FTZ]>; -def FDIV321r : NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), - "rcp.approx.f32 \t$dst, $b;", - [(set Float32Regs:$dst, - (fdiv FloatConst1:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_APPROX]>; +def FDIV321r_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.approx.ftz.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_APPROX, doF32FTZ]>; +def FDIV321r : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.approx.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_APPROX]>; // // F32 Approximate division // -def FDIV32approxrr_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - "div.approx.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, - (fdiv Float32Regs:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_APPROX, doF32FTZ]>; -def FDIV32approxri_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - "div.approx.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, - (fdiv Float32Regs:$a, fpimm:$b))]>, - Requires<[do_DIVF32_APPROX, doF32FTZ]>; -def FDIV32approxrr : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - "div.approx.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, - (fdiv Float32Regs:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_APPROX]>; -def FDIV32approxri : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - "div.approx.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, - (fdiv Float32Regs:$a, fpimm:$b))]>, - Requires<[do_DIVF32_APPROX]>; +def FDIV32approxrr_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.approx.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, 
Float32Regs:$b))]>, + Requires<[do_DIVF32_APPROX, doF32FTZ]>; +def FDIV32approxri_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.approx.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[do_DIVF32_APPROX, doF32FTZ]>; +def FDIV32approxrr : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.approx.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_APPROX]>; +def FDIV32approxri : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.approx.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[do_DIVF32_APPROX]>; // // F32 Semi-accurate reciprocal // // rcp.approx gives the same result as div.full(1.0f, a) and is faster. // -def FDIV321r_approx_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), - "rcp.approx.ftz.f32 \t$dst, $b;", - [(set Float32Regs:$dst, - (fdiv FloatConst1:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_FULL, doF32FTZ]>; -def FDIV321r_approx : NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), - "rcp.approx.f32 \t$dst, $b;", - [(set Float32Regs:$dst, - (fdiv FloatConst1:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_FULL]>; +def FDIV321r_approx_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.approx.ftz.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_FULL, doF32FTZ]>; +def FDIV321r_approx : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.approx.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_FULL]>; // // F32 Semi-accurate division // -def FDIV32rr_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - "div.full.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, - (fdiv Float32Regs:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_FULL, doF32FTZ]>; -def FDIV32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - "div.full.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, - (fdiv Float32Regs:$a, fpimm:$b))]>, - Requires<[do_DIVF32_FULL, doF32FTZ]>; -def FDIV32rr : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - "div.full.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, - (fdiv Float32Regs:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_FULL]>; -def FDIV32ri : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - "div.full.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, - (fdiv Float32Regs:$a, fpimm:$b))]>, - Requires<[do_DIVF32_FULL]>; +def FDIV32rr_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.full.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_FULL, doF32FTZ]>; +def FDIV32ri_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.full.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[do_DIVF32_FULL, doF32FTZ]>; +def FDIV32rr : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.full.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_FULL]>; +def FDIV32ri : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + 
"div.full.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[do_DIVF32_FULL]>; // // F32 Accurate reciprocal // -def FDIV321r_prec_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), - "rcp.rn.ftz.f32 \t$dst, $b;", - [(set Float32Regs:$dst, - (fdiv FloatConst1:$a, Float32Regs:$b))]>, - Requires<[reqPTX20, doF32FTZ]>; -def FDIV321r_prec : NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), - "rcp.rn.f32 \t$dst, $b;", - [(set Float32Regs:$dst, - (fdiv FloatConst1:$a, Float32Regs:$b))]>, - Requires<[reqPTX20]>; +def FDIV321r_prec_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.rn.ftz.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[reqPTX20, doF32FTZ]>; +def FDIV321r_prec : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.rn.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[reqPTX20]>; // // F32 Accurate division // -def FDIV32rr_prec_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - "div.rn.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, - (fdiv Float32Regs:$a, Float32Regs:$b))]>, - Requires<[doF32FTZ, reqPTX20]>; -def FDIV32ri_prec_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - "div.rn.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, - (fdiv Float32Regs:$a, fpimm:$b))]>, - Requires<[doF32FTZ, reqPTX20]>; -def FDIV32rr_prec : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - "div.rn.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, - (fdiv Float32Regs:$a, Float32Regs:$b))]>, - Requires<[reqPTX20]>; -def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - "div.rn.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, - (fdiv Float32Regs:$a, fpimm:$b))]>, - Requires<[reqPTX20]>; +def FDIV32rr_prec_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.rn.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[doF32FTZ, reqPTX20]>; +def FDIV32ri_prec_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.rn.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[doF32FTZ, reqPTX20]>; +def FDIV32rr_prec : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.rn.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[reqPTX20]>; +def FDIV32ri_prec : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.rn.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[reqPTX20]>; // // F32 rsqrt @@ -857,68 +880,39 @@ def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst), def RSQRTF32approx1r : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$b), "rsqrt.approx.f32 \t$dst, $b;", []>; +// Convert 1.0f/sqrt(x) to rsqrt.approx.f32. (There is an rsqrt.approx.f64, but +// it's emulated in software.) 
def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$b)), (RSQRTF32approx1r Float32Regs:$b)>, Requires<[do_DIVF32_FULL, do_SQRTF32_APPROX, doNoF32FTZ]>; -multiclass FPCONTRACT32<string OpcStr, Predicate Pred> { - def rrr : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, - (fma Float32Regs:$a, Float32Regs:$b, Float32Regs:$c))]>, - Requires<[Pred]>; - def rri : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, - (fma Float32Regs:$a, Float32Regs:$b, fpimm:$c))]>, - Requires<[Pred]>; - def rir : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b, Float32Regs:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, - (fma Float32Regs:$a, fpimm:$b, Float32Regs:$c))]>, - Requires<[Pred]>; - def rii : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b, f32imm:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, - (fma Float32Regs:$a, fpimm:$b, fpimm:$c))]>, - Requires<[Pred]>; +multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> { + def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), + !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), + [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>, + Requires<[Pred]>; + def rri : NVPTXInst<(outs RC:$dst), + (ins RC:$a, RC:$b, ImmCls:$c), + !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), + [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>, + Requires<[Pred]>; + def rir : NVPTXInst<(outs RC:$dst), + (ins RC:$a, ImmCls:$b, RC:$c), + !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), + [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>, + Requires<[Pred]>; + def rii : NVPTXInst<(outs RC:$dst), + (ins RC:$a, ImmCls:$b, ImmCls:$c), + !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), + [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>, + Requires<[Pred]>; } -multiclass FPCONTRACT64<string OpcStr, Predicate Pred> { - def rrr : NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, - (fma Float64Regs:$a, Float64Regs:$b, Float64Regs:$c))]>, - Requires<[Pred]>; - def rri : NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, Float64Regs:$b, f64imm:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, - (fma Float64Regs:$a, Float64Regs:$b, fpimm:$c))]>, - Requires<[Pred]>; - def rir : NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, f64imm:$b, Float64Regs:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, - (fma Float64Regs:$a, fpimm:$b, Float64Regs:$c))]>, - Requires<[Pred]>; - def rii : NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, f64imm:$b, f64imm:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, - (fma Float64Regs:$a, fpimm:$b, fpimm:$c))]>, - Requires<[Pred]>; -} - -defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>; -defm FMA32 : FPCONTRACT32<"fma.rn.f32", true>; -defm FMA64 : FPCONTRACT64<"fma.rn.f64", true>; +defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>; +defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, true>; +defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>; +// sin/cos def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), "sin.approx.f32 \t$dst, $src;", [(set Float32Regs:$dst, (fsin 
Float32Regs:$src))]>; @@ -926,8 +920,8 @@ def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), "cos.approx.f32 \t$dst, $src;", [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>; -// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y)) -// e.g. "poor man's fmod()" +// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y)), +// i.e. "poor man's fmod()" // frem - f32 FTZ def : Pat<(frem Float32Regs:$x, Float32Regs:$y), @@ -962,183 +956,152 @@ def : Pat<(frem Float64Regs:$x, fpimm:$y), fpimm:$y))>; //----------------------------------- -// Logical Arithmetic +// Bitwise operations //----------------------------------- -multiclass LOG_FORMAT<string OpcStr, SDNode OpNode> { - def b1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), - !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), - [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; - def b1ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), - !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), - [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>; - def b16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), - !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, - Int16Regs:$b))]>; - def b16ri: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), - !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>; - def b32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, - Int32Regs:$b))]>; - def b32ri: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; - def b64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), - !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, - Int64Regs:$b))]>; - def b64ri: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), - !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; +// Template for three-arg bitwise operations. Takes three args, Creates .b16, +// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr. 
+multiclass BITWISE<string OpcStr, SDNode OpNode> { + def b1rr : + NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), + !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; + def b1ri : + NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), + !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>; + def b16rr : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; + def b16ri : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>; + def b32rr : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def b32ri : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def b64rr : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), + !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; + def b64ri : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; } -defm OR : LOG_FORMAT<"or", or>; -defm AND : LOG_FORMAT<"and", and>; -defm XOR : LOG_FORMAT<"xor", xor>; +defm OR : BITWISE<"or", or>; +defm AND : BITWISE<"and", and>; +defm XOR : BITWISE<"xor", xor>; -def NOT1: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src), +def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src), "not.pred \t$dst, $src;", [(set Int1Regs:$dst, (not Int1Regs:$src))]>; -def NOT16: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), +def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), "not.b16 \t$dst, $src;", [(set Int16Regs:$dst, (not Int16Regs:$src))]>; -def NOT32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), +def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), "not.b32 \t$dst, $src;", [(set Int32Regs:$dst, (not Int32Regs:$src))]>; -def NOT64: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), - "not.b64 \t$dst, $src;", - [(set Int64Regs:$dst, (not Int64Regs:$src))]>; - -// For shifts, the second src operand must be 32-bit value -multiclass LSHIFT_FORMAT<string OpcStr, SDNode OpNode> { - def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, - Int32Regs:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, - Int32Regs:$b))]>; - def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, - (i32 imm:$b)))]>; - def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, - Int32Regs:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, - Int32Regs:$b))]>; - def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, - (i32 imm:$b)))]>; - def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (i32 
imm:$a), - (i32 imm:$b)))]>; - def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, - Int32Regs:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, - Int32Regs:$b))]>; - def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, - (i32 imm:$b)))]>; -} +def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), + "not.b64 \t$dst, $src;", + [(set Int64Regs:$dst, (not Int64Regs:$src))]>; -defm SHL : LSHIFT_FORMAT<"shl.b", shl>; - -// For shifts, the second src operand must be 32-bit value -// Need to add cvt for the 8-bits. -multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode> { - def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, - Int32Regs:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, - Int32Regs:$b))]>; - def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, - (i32 imm:$b)))]>; - def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, - Int32Regs:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, - Int32Regs:$b))]>; - def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, - (i32 imm:$b)))]>; - def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (i32 imm:$a), - (i32 imm:$b)))]>; - def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, - Int32Regs:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, - Int32Regs:$b))]>; - def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, - (i32 imm:$b)))]>; +// Template for left/right shifts. Takes three operands, +// [dest (reg), src (reg), shift (reg or imm)]. +// dest and src may be int64, int32, or int16, but shift is always int32. +// +// This template also defines a 32-bit shift (imm, imm) instruction. 
+multiclass SHIFT<string OpcStr, SDNode OpNode> { + def i64rr : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>; + def i64ri : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>; + def i32rr : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def i32ri : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>; + def i32ii : + NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>; + def i16rr : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>; + def i16ri : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>; } -defm SRA : RSHIFT_FORMAT<"shr.s", sra>; -defm SRL : RSHIFT_FORMAT<"shr.u", srl>; +defm SHL : SHIFT<"shl.b", shl>; +defm SRA : SHIFT<"shr.s", sra>; +defm SRL : SHIFT<"shr.u", srl>; // -// Rotate: use ptx shf instruction if available. +// Rotate: Use ptx shf instruction if available. // // 32 bit r2 = rotl r1, n // => // r2 = shf.l r1, r1, n -def ROTL32imm_hw : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$src, i32imm:$amt), - "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>, - Requires<[hasHWROT32]> ; - -def ROTL32reg_hw : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$src, Int32Regs:$amt), - "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, - Requires<[hasHWROT32]>; +def ROTL32imm_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]>; + +def ROTL32reg_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]>; // 32 bit r2 = rotr r1, n // => // r2 = shf.r r1, r1, n -def ROTR32imm_hw : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$src, i32imm:$amt), - "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>, - Requires<[hasHWROT32]>; - -def ROTR32reg_hw : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$src, Int32Regs:$amt), - "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, - Requires<[hasHWROT32]>; - -// -// Rotate: if ptx shf instruction is not available, then use shift+add -// -// 32bit -def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2), - !strconcat("{{\n\t", - !strconcat(".reg .b32 %lhs;\n\t", - !strconcat(".reg .b32 %rhs;\n\t", - !strconcat("shl.b32 \t%lhs, $src, $amt1;\n\t", - !strconcat("shr.b32 \t%rhs, $src, $amt2;\n\t", - 
!strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", - !strconcat("}}", ""))))))), - []>; +def ROTR32imm_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]>; + +def ROTR32reg_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]>; + +// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1. +def ROT32imm_sw : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2), + "{{\n\t" + ".reg .b32 %lhs;\n\t" + ".reg .b32 %rhs;\n\t" + "shl.b32 \t%lhs, $src, $amt1;\n\t" + "shr.b32 \t%rhs, $src, $amt2;\n\t" + "add.u32 \t$dst, %lhs, %rhs;\n\t" + "}}", + []>; def SUB_FRM_32 : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(32-N->getZExtValue(), SDLoc(N), MVT::i32); + return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32); }]>; def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)), @@ -1148,45 +1111,48 @@ def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)), (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>, Requires<[noHWROT32]>; -def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, - Int32Regs:$amt), - !strconcat("{{\n\t", - !strconcat(".reg .b32 %lhs;\n\t", - !strconcat(".reg .b32 %rhs;\n\t", - !strconcat(".reg .b32 %amt2;\n\t", - !strconcat("shl.b32 \t%lhs, $src, $amt;\n\t", - !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t", - !strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t", - !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", - !strconcat("}}", ""))))))))), - [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, - Requires<[noHWROT32]>; - -def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, - Int32Regs:$amt), - !strconcat("{{\n\t", - !strconcat(".reg .b32 %lhs;\n\t", - !strconcat(".reg .b32 %rhs;\n\t", - !strconcat(".reg .b32 %amt2;\n\t", - !strconcat("shr.b32 \t%lhs, $src, $amt;\n\t", - !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t", - !strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t", - !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", - !strconcat("}}", ""))))))))), - [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, - Requires<[noHWROT32]>; - -// 64bit -def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, - i32imm:$amt1, i32imm:$amt2), - !strconcat("{{\n\t", - !strconcat(".reg .b64 %lhs;\n\t", - !strconcat(".reg .b64 %rhs;\n\t", - !strconcat("shl.b64 \t%lhs, $src, $amt1;\n\t", - !strconcat("shr.b64 \t%rhs, $src, $amt2;\n\t", - !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t", - !strconcat("}}", ""))))))), - []>; +// 32-bit software rotate left by register. +def ROTL32reg_sw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), + "{{\n\t" + ".reg .b32 %lhs;\n\t" + ".reg .b32 %rhs;\n\t" + ".reg .b32 %amt2;\n\t" + "shl.b32 \t%lhs, $src, $amt;\n\t" + "sub.s32 \t%amt2, 32, $amt;\n\t" + "shr.b32 \t%rhs, $src, %amt2;\n\t" + "add.u32 \t$dst, %lhs, %rhs;\n\t" + "}}", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[noHWROT32]>; + +// 32-bit software rotate right by register. 
+def ROTR32reg_sw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), + "{{\n\t" + ".reg .b32 %lhs;\n\t" + ".reg .b32 %rhs;\n\t" + ".reg .b32 %amt2;\n\t" + "shr.b32 \t%lhs, $src, $amt;\n\t" + "sub.s32 \t%amt2, 32, $amt;\n\t" + "shl.b32 \t%rhs, $src, %amt2;\n\t" + "add.u32 \t$dst, %lhs, %rhs;\n\t" + "}}", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[noHWROT32]>; + +// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1. +def ROT64imm_sw : + NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2), + "{{\n\t" + ".reg .b64 %lhs;\n\t" + ".reg .b64 %rhs;\n\t" + "shl.b64 \t%lhs, $src, $amt1;\n\t" + "shr.b64 \t%rhs, $src, $amt2;\n\t" + "add.u64 \t$dst, %lhs, %rhs;\n\t" + "}}", + []>; def SUB_FRM_64 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32); @@ -1197,37 +1163,70 @@ def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)), def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)), (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>; -def ROTL64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, - Int32Regs:$amt), - !strconcat("{{\n\t", - !strconcat(".reg .b64 %lhs;\n\t", - !strconcat(".reg .b64 %rhs;\n\t", - !strconcat(".reg .u32 %amt2;\n\t", - !strconcat("shl.b64 \t%lhs, $src, $amt;\n\t", - !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t", - !strconcat("shr.b64 \t%rhs, $src, %amt2;\n\t", - !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t", - !strconcat("}}", ""))))))))), - [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>; - -def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, - Int32Regs:$amt), - !strconcat("{{\n\t", - !strconcat(".reg .b64 %lhs;\n\t", - !strconcat(".reg .b64 %rhs;\n\t", - !strconcat(".reg .u32 %amt2;\n\t", - !strconcat("shr.b64 \t%lhs, $src, $amt;\n\t", - !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t", - !strconcat("shl.b64 \t%rhs, $src, %amt2;\n\t", - !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t", - !strconcat("}}", ""))))))))), - [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>; +// 64-bit software rotate left by register. +def ROTL64reg_sw : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt), + "{{\n\t" + ".reg .b64 %lhs;\n\t" + ".reg .b64 %rhs;\n\t" + ".reg .u32 %amt2;\n\t" + "shl.b64 \t%lhs, $src, $amt;\n\t" + "sub.u32 \t%amt2, 64, $amt;\n\t" + "shr.b64 \t%rhs, $src, %amt2;\n\t" + "add.u64 \t$dst, %lhs, %rhs;\n\t" + "}}", + [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>; + +def ROTR64reg_sw : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt), + "{{\n\t" + ".reg .b64 %lhs;\n\t" + ".reg .b64 %rhs;\n\t" + ".reg .u32 %amt2;\n\t" + "shr.b64 \t%lhs, $src, $amt;\n\t" + "sub.u32 \t%amt2, 64, $amt;\n\t" + "shl.b64 \t%rhs, $src, %amt2;\n\t" + "add.u64 \t$dst, %lhs, %rhs;\n\t" + "}}", + [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>; + +// +// Funnnel shift in clamp mode +// + +// Create SDNodes so they can be used in the DAG code, e.g. 
+// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) +def SDTIntShiftDOp : + SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisInt<3>]>; +def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; +def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; + +def FUNSHFLCLAMP : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>; +def FUNSHFRCLAMP : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>; + +// // BFE - bit-field extract +// +// Template for BFE instructions. Takes four args, +// [dest (reg), src (reg), start (reg or imm), end (reg or imm)]. +// Start may be an imm only if end is also an imm. FIXME: Is this a +// restriction in PTX? +// +// dest and src may be int32 or int64, but start and end are always int32. multiclass BFE<string TyStr, RegisterClass RC> { - // BFE supports both 32-bit and 64-bit values, but the start and length - // operands are always 32-bit def rrr : NVPTXInst<(outs RC:$d), (ins RC:$a, Int32Regs:$b, Int32Regs:$c), @@ -1242,29 +1241,35 @@ multiclass BFE<string TyStr, RegisterClass RC> { !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; } -defm BFE_S32 : BFE<"s32", Int32Regs>; -defm BFE_U32 : BFE<"u32", Int32Regs>; -defm BFE_S64 : BFE<"s64", Int64Regs>; -defm BFE_U64 : BFE<"u64", Int64Regs>; +let hasSideEffects = 0 in { + defm BFE_S32 : BFE<"s32", Int32Regs>; + defm BFE_U32 : BFE<"u32", Int32Regs>; + defm BFE_S64 : BFE<"s64", Int64Regs>; + defm BFE_U64 : BFE<"u64", Int64Regs>; +} //----------------------------------- -// General Comparison +// Comparison instructions (setp, set) //----------------------------------- -// General setp instructions -multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> { - def rr : NVPTXInst<(outs Int1Regs:$dst), - (ins RC:$a, RC:$b, CmpMode:$cmp), - !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"), - []>; - def ri : NVPTXInst<(outs Int1Regs:$dst), - (ins RC:$a, ImmCls:$b, CmpMode:$cmp), - !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"), - []>; - def ir : NVPTXInst<(outs Int1Regs:$dst), - (ins ImmCls:$a, RC:$b, CmpMode:$cmp), - !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"), - []>; +// FIXME: This doesn't cover versions of set and setp that combine with a +// boolean predicate, e.g. setp.eq.and.b16. 
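Editor's note: the predicated forms mentioned in the FIXME combine the comparison with a third predicate operand (roughly p = (a == b) && c for setp.eq.and). For the funnel-shift and bit-field-extract instructions defined further up, a rough host-side C++ model of their semantics, simplified from the PTX ISA description — an illustrative sketch with made-up helper names, not code from this backend:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Model of shf.l.clamp.b32 d, lo, hi, amt (FUN_SHFL_CLAMP): shift the 64-bit
// value {hi, lo} left by min(amt, 32) and return the upper 32 bits.
static uint32_t funnel_shl_clamp(uint32_t lo, uint32_t hi, uint32_t amt) {
  uint64_t wide = (uint64_t(hi) << 32) | lo;
  unsigned n = std::min(amt, 32u);
  return uint32_t((wide << n) >> 32);
}

// Model of shf.r.clamp.b32 d, lo, hi, amt (FUN_SHFR_CLAMP): shift {hi, lo}
// right by min(amt, 32) and return the lower 32 bits.
static uint32_t funnel_shr_clamp(uint32_t lo, uint32_t hi, uint32_t amt) {
  uint64_t wide = (uint64_t(hi) << 32) | lo;
  unsigned n = std::min(amt, 32u);
  return uint32_t(wide >> n);
}

// Simplified model of bfe.u32 d, a, pos, len: extract len bits of a starting
// at bit pos, zero-extended. (The signed forms sign-extend instead, and the
// PTX ISA has extra rules for out-of-range positions that are omitted here.)
static uint32_t bfe_u32(uint32_t a, uint32_t pos, uint32_t len) {
  if (len == 0)
    return 0;
  uint32_t shifted = a >> pos;
  return len >= 32 ? shifted : shifted & ((1u << len) - 1);
}

int main() {
  assert(funnel_shl_clamp(0x00000000u, 0x12345678u, 8) == 0x34567800u);
  assert(funnel_shr_clamp(0x12345678u, 0xAAAAAAAAu, 8) == 0xAA123456u);
  assert(bfe_u32(0xABCD1234u, 8, 8) == 0x12u);
  return 0;
}

The setp and set multiclasses that follow only cover the plain two-operand comparison forms.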
+ +let hasSideEffects = 0 in { + multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> { + def rr : + NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp), + !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, + "\t$dst, $a, $b;"), []>; + def ri : + NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp), + !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, + "\t$dst, $a, $b;"), []>; + def ir : + NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp), + !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, + "\t$dst, $a, $b;"), []>; + } } defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>; @@ -1279,17 +1284,22 @@ defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>; defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>; defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>; -// General set instructions -multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> { - def rr : NVPTXInst<(outs Int32Regs:$dst), - (ins RC:$a, RC:$b, CmpMode:$cmp), - !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>; - def ri : NVPTXInst<(outs Int32Regs:$dst), - (ins RC:$a, ImmCls:$b, CmpMode:$cmp), - !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>; - def ir : NVPTXInst<(outs Int32Regs:$dst), - (ins ImmCls:$a, RC:$b, CmpMode:$cmp), - !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>; +// FIXME: This doesn't appear to be correct. The "set" mnemonic has the form +// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination +// reg, either u32, s32, or f32. Anyway these aren't used at the moment. + +let hasSideEffects = 0 in { + multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> { + def rr : NVPTXInst<(outs Int32Regs:$dst), + (ins RC:$a, RC:$b, CmpMode:$cmp), + !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>; + def ri : NVPTXInst<(outs Int32Regs:$dst), + (ins RC:$a, ImmCls:$b, CmpMode:$cmp), + !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>; + def ir : NVPTXInst<(outs Int32Regs:$dst), + (ins ImmCls:$a, RC:$b, CmpMode:$cmp), + !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>; + } } defm SET_b16 : SET<"b16", Int16Regs, i16imm>; @@ -1305,45 +1315,56 @@ defm SET_f32 : SET<"f32", Float32Regs, f32imm>; defm SET_f64 : SET<"f64", Float64Regs, f64imm>; //----------------------------------- -// General Selection +// Selection instructions (selp) //----------------------------------- -// General selp instructions -multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> { - def rr : NVPTXInst<(outs RC:$dst), - (ins RC:$a, RC:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>; - def ri : NVPTXInst<(outs RC:$dst), - (ins RC:$a, ImmCls:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>; - def ir : NVPTXInst<(outs RC:$dst), - (ins ImmCls:$a, RC:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>; - def ii : NVPTXInst<(outs RC:$dst), - (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>; -} +// FIXME: Missing slct -multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls, - SDNode ImmNode> { - def rr : NVPTXInst<(outs RC:$dst), - (ins RC:$a, RC:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), - [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>; - def ri : NVPTXInst<(outs RC:$dst), - (ins RC:$a, ImmCls:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), - [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>; - def ir : 
NVPTXInst<(outs RC:$dst), - (ins ImmCls:$a, RC:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), - [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>; - def ii : NVPTXInst<(outs RC:$dst), - (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), - [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>; +// selp instructions that don't have any pattern matches; we explicitly use +// them within this file. +let hasSideEffects = 0 in { + multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> { + def rr : NVPTXInst<(outs RC:$dst), + (ins RC:$a, RC:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>; + def ri : NVPTXInst<(outs RC:$dst), + (ins RC:$a, ImmCls:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>; + def ir : NVPTXInst<(outs RC:$dst), + (ins ImmCls:$a, RC:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>; + def ii : NVPTXInst<(outs RC:$dst), + (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>; + } + + multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls, + SDNode ImmNode> { + def rr : + NVPTXInst<(outs RC:$dst), + (ins RC:$a, RC:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), + [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>; + def ri : + NVPTXInst<(outs RC:$dst), + (ins RC:$a, ImmCls:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), + [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>; + def ir : + NVPTXInst<(outs RC:$dst), + (ins ImmCls:$a, RC:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), + [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>; + def ii : + NVPTXInst<(outs RC:$dst), + (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), + [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>; + } } +// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as +// good. defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>; defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>; defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>; @@ -1356,40 +1377,14 @@ defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>; defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>; defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>; -// -// Funnnel shift in clamp mode -// -// - SDNodes are created so they can be used in the DAG code, -// e.g. 
NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) -// -def SDTIntShiftDOp: SDTypeProfile<1, 3, - [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, - SDTCisInt<0>, SDTCisInt<3>]>; -def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; -def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; - -def FUNSHFLCLAMP : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), - "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;", - [(set Int32Regs:$dst, - (FUN_SHFL_CLAMP Int32Regs:$lo, - Int32Regs:$hi, Int32Regs:$amt))]>; - -def FUNSHFRCLAMP : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), - "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;", - [(set Int32Regs:$dst, - (FUN_SHFR_CLAMP Int32Regs:$lo, - Int32Regs:$hi, Int32Regs:$amt))]>; - //----------------------------------- // Data Movement (Load / Store, Move) //----------------------------------- def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], - [SDNPWantRoot]>; + [SDNPWantRoot]>; def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex], - [SDNPWantRoot]>; + [SDNPWantRoot]>; def MEMri : Operand<i32> { let PrintMethod = "printMemOperand"; @@ -1401,82 +1396,83 @@ def MEMri64 : Operand<i64> { } def imem : Operand<iPTR> { - let PrintMethod = "printOperand"; + let PrintMethod = "printOperand"; } def imemAny : Operand<iPTRAny> { - let PrintMethod = "printOperand"; + let PrintMethod = "printOperand"; } def LdStCode : Operand<i32> { - let PrintMethod = "printLdStCode"; + let PrintMethod = "printLdStCode"; } def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>; +// Load a memory address into a u32 or u64 register. def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a), - "mov.u32 \t$dst, $a;", - [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>; - + "mov.u32 \t$dst, $a;", + [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>; def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a), - "mov.u64 \t$dst, $a;", - [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>; + "mov.u64 \t$dst, $a;", + [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>; -// Get pointer to local stack -def MOV_DEPOT_ADDR - : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num), - "mov.u32 \t$d, __local_depot$num;", []>; -def MOV_DEPOT_ADDR_64 - : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num), - "mov.u64 \t$d, __local_depot$num;", []>; +// Get pointer to local stack. 
+let hasSideEffects = 0 in { + def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num), + "mov.u32 \t$d, __local_depot$num;", []>; + def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num), + "mov.u64 \t$d, __local_depot$num;", []>; +} // copyPhysreg is hard-coded in NVPTXInstrInfo.cpp -let IsSimpleMove=1 in { -def IMOV1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss), - "mov.pred \t$dst, $sss;", []>; -def IMOV16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), - "mov.u16 \t$dst, $sss;", []>; -def IMOV32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss), - "mov.u32 \t$dst, $sss;", []>; -def IMOV64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss), - "mov.u64 \t$dst, $sss;", []>; - -def FMOV32rr: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), - "mov.f32 \t$dst, $src;", []>; -def FMOV64rr: NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src), - "mov.f64 \t$dst, $src;", []>; +let IsSimpleMove=1, hasSideEffects=0 in { + def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss), + "mov.pred \t$dst, $sss;", []>; + def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), + "mov.u16 \t$dst, $sss;", []>; + def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss), + "mov.u32 \t$dst, $sss;", []>; + def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss), + "mov.u64 \t$dst, $sss;", []>; + + def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), + "mov.f32 \t$dst, $src;", []>; + def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src), + "mov.f64 \t$dst, $src;", []>; } -def IMOV1ri: NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src), - "mov.pred \t$dst, $src;", - [(set Int1Regs:$dst, imm:$src)]>; -def IMOV16ri: NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), - "mov.u16 \t$dst, $src;", - [(set Int16Regs:$dst, imm:$src)]>; -def IMOV32ri: NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), - "mov.u32 \t$dst, $src;", - [(set Int32Regs:$dst, imm:$src)]>; -def IMOV64i: NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), - "mov.u64 \t$dst, $src;", - [(set Int64Regs:$dst, imm:$src)]>; - -def FMOV32ri: NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src), - "mov.f32 \t$dst, $src;", - [(set Float32Regs:$dst, fpimm:$src)]>; -def FMOV64ri: NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src), - "mov.f64 \t$dst, $src;", - [(set Float64Regs:$dst, fpimm:$src)]>; + +def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src), + "mov.pred \t$dst, $src;", + [(set Int1Regs:$dst, imm:$src)]>; +def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), + "mov.u16 \t$dst, $src;", + [(set Int16Regs:$dst, imm:$src)]>; +def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), + "mov.u32 \t$dst, $src;", + [(set Int32Regs:$dst, imm:$src)]>; +def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), + "mov.u64 \t$dst, $src;", + [(set Int64Regs:$dst, imm:$src)]>; + +def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src), + "mov.f32 \t$dst, $src;", + [(set Float32Regs:$dst, fpimm:$src)]>; +def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src), + "mov.f64 \t$dst, $src;", + [(set Float64Regs:$dst, fpimm:$src)]>; def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>; //---- Copy Frame Index ---- -def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr), - "add.u32 \t$dst, ${addr:add};", - [(set Int32Regs:$dst, ADDRri:$addr)]>; +def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins 
MEMri:$addr), + "add.u32 \t$dst, ${addr:add};", + [(set Int32Regs:$dst, ADDRri:$addr)]>; def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr), - "add.u64 \t$dst, ${addr:add};", - [(set Int64Regs:$dst, ADDRri64:$addr)]>; + "add.u64 \t$dst, ${addr:add};", + [(set Int64Regs:$dst, ADDRri64:$addr)]>; //----------------------------------- // Comparison and Selection @@ -1554,7 +1550,7 @@ multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode> SET_s16rr, SET_s16ri, SET_s16ir, SET_s32rr, SET_s32ri, SET_s32ir, SET_s64rr, SET_s64ri, SET_s64ir> { - // TableGen doesn't like empty multiclasses + // TableGen doesn't like empty multiclasses. def : PatLeaf<(i32 0)>; } @@ -1566,21 +1562,21 @@ multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode> SET_u16rr, SET_u16ri, SET_u16ir, SET_u32rr, SET_u32ri, SET_u32ir, SET_u64rr, SET_u64ri, SET_u64ir> { - // TableGen doesn't like empty multiclasses + // TableGen doesn't like empty multiclasses. def : PatLeaf<(i32 0)>; } defm : ISET_FORMAT_SIGNED<setgt, CmpGT>; -defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>; defm : ISET_FORMAT_SIGNED<setlt, CmpLT>; -defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>; defm : ISET_FORMAT_SIGNED<setge, CmpGE>; -defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>; defm : ISET_FORMAT_SIGNED<setle, CmpLE>; -defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>; defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>; -defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>; defm : ISET_FORMAT_SIGNED<setne, CmpNE>; +defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>; +defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>; +defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>; +defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>; +defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>; defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>; // i1 compares @@ -1678,13 +1674,14 @@ defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>; defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>; defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>; -//def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad, -// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +// FIXME: What is this doing here? Can it be deleted? 
+// def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad, +// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; -def SDTDeclareParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, - SDTCisInt<2>]>; -def SDTDeclareScalarParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, - SDTCisInt<1>, SDTCisInt<2>]>; +def SDTDeclareParamProfile : + SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; +def SDTDeclareScalarParamProfile : + SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>; def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>; @@ -1704,185 +1701,200 @@ def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>; def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>; def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>; -def DeclareParam : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def DeclareScalarParam : SDNode<"NVPTXISD::DeclareScalarParam", - SDTDeclareScalarParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def DeclareRetParam : SDNode<"NVPTXISD::DeclareRetParam", - SDTDeclareParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def DeclareRet : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def LoadParam : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def LoadParamV2 : SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def LoadParamV4 : SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def PrintCall : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def PrintCallUni : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParam : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamV2 : SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamV4 : SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamU32 : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamS32 : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallArgBegin : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallArg : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def LastCallArg : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallArgEnd : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallVoid : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def Prototype : 
SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallVal : SDNode<"NVPTXISD::CallVal", SDTCallValProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, - []>; -def StoreRetval : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile, - [SDNPHasChain, SDNPSideEffect]>; -def StoreRetvalV2 : SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile, - [SDNPHasChain, SDNPSideEffect]>; -def StoreRetvalV4 : SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile, - [SDNPHasChain, SDNPSideEffect]>; -def PseudoUseParam : SDNode<"NVPTXISD::PseudoUseParam", - SDTPseudoUseParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def RETURNNode : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile, - [SDNPHasChain, SDNPSideEffect]>; - -class LoadParamMemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), - !strconcat(!strconcat("ld.param", opstr), - "\t$dst, [retval0+$b];"), - []>; +def DeclareParam : + SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareScalarParam : + SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareRetParam : + SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareRet : + SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def LoadParam : + SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, + [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; +def LoadParamV2 : + SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile, + [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; +def LoadParamV4 : + SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile, + [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; +def PrintCall : + SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def PrintConvergentCall : + SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def PrintCallUni : + SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def PrintConvergentCallUni : + SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParam : + SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamV2 : + SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamV4 : + SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamU32 : + SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamS32 : + SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallArgBegin : + SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallArg : + SDNode<"NVPTXISD::CallArg", SDTCallArgProfile, + [SDNPHasChain, 
SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def LastCallArg : + SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallArgEnd : + SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallVoid : + SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def Prototype : + SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallVal : + SDNode<"NVPTXISD::CallVal", SDTCallValProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def MoveParam : + SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>; +def StoreRetval : + SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile, + [SDNPHasChain, SDNPSideEffect]>; +def StoreRetvalV2 : + SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile, + [SDNPHasChain, SDNPSideEffect]>; +def StoreRetvalV4 : + SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile, + [SDNPHasChain, SDNPSideEffect]>; +def PseudoUseParam : + SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def RETURNNode : + SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile, + [SDNPHasChain, SDNPSideEffect]>; + +let mayLoad = 1 in { + class LoadParamMemInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), + !strconcat(!strconcat("ld.param", opstr), + "\t$dst, [retval0+$b];"), + []>; + + class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b), + !strconcat("ld.param.v2", opstr, + "\t{{$dst, $dst2}}, [retval0+$b];"), []>; + + class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3, + regclass:$dst4), + (ins i32imm:$b), + !strconcat("ld.param.v4", opstr, + "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"), + []>; +} class LoadParamRegInst<NVPTXRegClass regclass, string opstr> : NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), - !strconcat(!strconcat("mov", opstr), - "\t$dst, retval$b;"), + !strconcat("mov", opstr, "\t$dst, retval$b;"), [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>; -class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b), - !strconcat(!strconcat("ld.param.v2", opstr), - "\t{{$dst, $dst2}}, [retval0+$b];"), []>; - -class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3, - regclass:$dst4), - (ins i32imm:$b), - !strconcat(!strconcat("ld.param.v4", opstr), - "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"), []>; - -class StoreParamInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b), - !strconcat(!strconcat("st.param", opstr), - "\t[param$a+$b], $val;"), - []>; - -class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, - i32imm:$a, i32imm:$b), - !strconcat(!strconcat("st.param.v2", opstr), - "\t[param$a+$b], {{$val, $val2}};"), - []>; - -class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs), (ins regclass:$val, regclass:$val1, regclass:$val2, - regclass:$val3, i32imm:$a, i32imm:$b), - !strconcat(!strconcat("st.param.v4", opstr), - "\t[param$a+$b], {{$val, $val2, 
$val3, $val4}};"), - []>; - -class StoreRetvalInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs), (ins regclass:$val, i32imm:$a), - !strconcat(!strconcat("st.param", opstr), - "\t[func_retval0+$a], $val;"), - []>; - -class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a), - !strconcat(!strconcat("st.param.v2", opstr), - "\t[func_retval0+$a], {{$val, $val2}};"), - []>; - -class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs), - (ins regclass:$val, regclass:$val2, regclass:$val3, - regclass:$val4, i32imm:$a), - !strconcat(!strconcat("st.param.v4", opstr), - "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"), - []>; - -def PrintCallRetInst1 : NVPTXInst<(outs), (ins), -"call (retval0), ", - [(PrintCall (i32 1))]>; -def PrintCallRetInst2 : NVPTXInst<(outs), (ins), -"call (retval0, retval1), ", - [(PrintCall (i32 2))]>; -def PrintCallRetInst3 : NVPTXInst<(outs), (ins), -"call (retval0, retval1, retval2), ", - [(PrintCall (i32 3))]>; -def PrintCallRetInst4 : NVPTXInst<(outs), (ins), -"call (retval0, retval1, retval2, retval3), ", - [(PrintCall (i32 4))]>; -def PrintCallRetInst5 : NVPTXInst<(outs), (ins), -"call (retval0, retval1, retval2, retval3, retval4), ", - [(PrintCall (i32 5))]>; -def PrintCallRetInst6 : NVPTXInst<(outs), (ins), -"call (retval0, retval1, retval2, retval3, retval4, retval5), ", - [(PrintCall (i32 6))]>; -def PrintCallRetInst7 : NVPTXInst<(outs), (ins), -"call (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ", - [(PrintCall (i32 7))]>; -def PrintCallRetInst8 : NVPTXInst<(outs), (ins), -!strconcat("call (retval0, retval1, retval2, retval3, retval4", - ", retval5, retval6, retval7), "), - [(PrintCall (i32 8))]>; - -def PrintCallNoRetInst : NVPTXInst<(outs), (ins), "call ", - [(PrintCall (i32 0))]>; - -def PrintCallUniRetInst1 : NVPTXInst<(outs), (ins), -"call.uni (retval0), ", - [(PrintCallUni (i32 1))]>; -def PrintCallUniRetInst2 : NVPTXInst<(outs), (ins), -"call.uni (retval0, retval1), ", - [(PrintCallUni (i32 2))]>; -def PrintCallUniRetInst3 : NVPTXInst<(outs), (ins), -"call.uni (retval0, retval1, retval2), ", - [(PrintCallUni (i32 3))]>; -def PrintCallUniRetInst4 : NVPTXInst<(outs), (ins), -"call.uni (retval0, retval1, retval2, retval3), ", - [(PrintCallUni (i32 4))]>; -def PrintCallUniRetInst5 : NVPTXInst<(outs), (ins), -"call.uni (retval0, retval1, retval2, retval3, retval4), ", - [(PrintCallUni (i32 5))]>; -def PrintCallUniRetInst6 : NVPTXInst<(outs), (ins), -"call.uni (retval0, retval1, retval2, retval3, retval4, retval5), ", - [(PrintCallUni (i32 6))]>; -def PrintCallUniRetInst7 : NVPTXInst<(outs), (ins), -"call.uni (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ", - [(PrintCallUni (i32 7))]>; -def PrintCallUniRetInst8 : NVPTXInst<(outs), (ins), -!strconcat("call.uni (retval0, retval1, retval2, retval3, retval4", - ", retval5, retval6, retval7), "), - [(PrintCallUni (i32 8))]>; - -def PrintCallUniNoRetInst : NVPTXInst<(outs), (ins), "call.uni ", - [(PrintCallUni (i32 0))]>; +let mayStore = 1 in { + class StoreParamInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b), + !strconcat("st.param", opstr, "\t[param$a+$b], $val;"), + []>; + + class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, + i32imm:$a, i32imm:$b), + !strconcat("st.param.v2", opstr, + "\t[param$a+$b], {{$val, $val2}};"), + []>; + + 
class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3, + regclass:$val4, i32imm:$a, + i32imm:$b), + !strconcat("st.param.v4", opstr, + "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"), + []>; + + class StoreRetvalInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, i32imm:$a), + !strconcat("st.param", opstr, "\t[func_retval0+$a], $val;"), + []>; + + class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a), + !strconcat("st.param.v2", opstr, + "\t[func_retval0+$a], {{$val, $val2}};"), + []>; + + class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), + (ins regclass:$val, regclass:$val2, regclass:$val3, + regclass:$val4, i32imm:$a), + !strconcat("st.param.v4", opstr, + "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"), + []>; +} + +let isCall=1 in { + multiclass CALL<string OpcStr, SDNode OpNode> { + def PrintCallNoRetInst : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " "), [(OpNode (i32 0))]>; + def PrintCallRetInst1 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>; + def PrintCallRetInst2 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>; + def PrintCallRetInst3 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>; + def PrintCallRetInst4 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "), + [(OpNode (i32 4))]>; + def PrintCallRetInst5 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "), + [(OpNode (i32 5))]>; + def PrintCallRetInst6 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " + "retval5), "), + [(OpNode (i32 6))]>; + def PrintCallRetInst7 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " + "retval5, retval6), "), + [(OpNode (i32 7))]>; + def PrintCallRetInst8 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " + "retval5, retval6, retval7), "), + [(OpNode (i32 8))]>; + } +} + +defm Call : CALL<"call", PrintCall>; +defm CallUni : CALL<"call.uni", PrintCallUni>; + +// Convergent call instructions. These are identical to regular calls, except +// they have the isConvergent bit set. 
+let isConvergent=1 in { + defm ConvergentCall : CALL<"call", PrintConvergentCall>; + defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>; +} def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">; def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">; @@ -1911,39 +1923,15 @@ def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">; def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">; def StoreParamV2I8 : StoreParamV2Inst<Int16Regs, ".b8">; -// FIXME: StoreParamV4Inst crashes llvm-tblgen :( -//def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">; -def StoreParamV4I32 : NVPTXInst<(outs), (ins Int32Regs:$val, Int32Regs:$val2, - Int32Regs:$val3, Int32Regs:$val4, - i32imm:$a, i32imm:$b), - "st.param.v4.b32\t[param$a+$b], {{$val, $val2, $val3, $val4}};", - []>; - -def StoreParamV4I16 : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2, - Int16Regs:$val3, Int16Regs:$val4, - i32imm:$a, i32imm:$b), - "st.param.v4.b16\t[param$a+$b], {{$val, $val2, $val3, $val4}};", - []>; - -def StoreParamV4I8 : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2, - Int16Regs:$val3, Int16Regs:$val4, - i32imm:$a, i32imm:$b), - "st.param.v4.b8\t[param$a+$b], {{$val, $val2, $val3, $val4}};", - []>; - -def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">; -def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">; +def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">; +def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">; +def StoreParamV4I8 : StoreParamV4Inst<Int16Regs, ".b8">; + +def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">; +def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">; def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">; def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">; -// FIXME: StoreParamV4Inst crashes llvm-tblgen :( -//def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">; -def StoreParamV4F32 : NVPTXInst<(outs), - (ins Float32Regs:$val, Float32Regs:$val2, - Float32Regs:$val3, Float32Regs:$val4, - i32imm:$a, i32imm:$b), - "st.param.v4.f32\t[param$a+$b], {{$val, $val2, $val3, $val4}};", - []>; - +def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">; def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">; def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">; @@ -1969,89 +1957,88 @@ def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>; def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>; class CallArgInst<NVPTXRegClass regclass> : - NVPTXInst<(outs), (ins regclass:$a), "$a, ", - [(CallArg (i32 0), regclass:$a)]>; + NVPTXInst<(outs), (ins regclass:$a), "$a, ", + [(CallArg (i32 0), regclass:$a)]>; class LastCallArgInst<NVPTXRegClass regclass> : - NVPTXInst<(outs), (ins regclass:$a), "$a", - [(LastCallArg (i32 0), regclass:$a)]>; + NVPTXInst<(outs), (ins regclass:$a), "$a", + [(LastCallArg (i32 0), regclass:$a)]>; def CallArgI64 : CallArgInst<Int64Regs>; def CallArgI32 : CallArgInst<Int32Regs>; def CallArgI16 : CallArgInst<Int16Regs>; - def CallArgF64 : CallArgInst<Float64Regs>; def CallArgF32 : CallArgInst<Float32Regs>; def LastCallArgI64 : LastCallArgInst<Int64Regs>; def LastCallArgI32 : LastCallArgInst<Int32Regs>; def LastCallArgI16 : LastCallArgInst<Int16Regs>; - def LastCallArgF64 : LastCallArgInst<Float64Regs>; def LastCallArgF32 : LastCallArgInst<Float32Regs>; def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ", [(CallArg (i32 0), (i32 imm:$a))]>; def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a", - [(LastCallArg (i32 0), (i32 imm:$a))]>; + 
[(LastCallArg (i32 0), (i32 imm:$a))]>; def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ", [(CallArg (i32 1), (i32 imm:$a))]>; def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a", - [(LastCallArg (i32 1), (i32 imm:$a))]>; - -def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), - "$addr, ", - [(CallVoid (Wrapper tglobaladdr:$addr))]>; -def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), - "$addr, ", - [(CallVoid Int32Regs:$addr)]>; -def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), - "$addr, ", - [(CallVoid Int64Regs:$addr)]>; -def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), - ", prototype_$val;", - [(Prototype (i32 imm:$val))]>; - -def DeclareRetMemInst : NVPTXInst<(outs), - (ins i32imm:$align, i32imm:$size, i32imm:$num), - ".param .align $align .b8 retval$num[$size];", - [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>; -def DeclareRetScalarInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), - ".param .b$size retval$num;", - [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>; -def DeclareRetRegInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), - ".reg .b$size retval$num;", - [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>; - -def DeclareParamInst : NVPTXInst<(outs), - (ins i32imm:$align, i32imm:$a, i32imm:$size), - ".param .align $align .b8 param$a[$size];", - [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>; -def DeclareScalarParamInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), - ".param .b$size param$a;", - [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>; -def DeclareScalarRegInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), - ".reg .b$size param$a;", - [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>; + [(LastCallArg (i32 1), (i32 imm:$a))]>; + +def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ", + [(CallVoid (Wrapper tglobaladdr:$addr))]>; +def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ", + [(CallVoid Int32Regs:$addr)]>; +def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ", + [(CallVoid Int64Regs:$addr)]>; +def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;", + [(Prototype (i32 imm:$val))]>; + +def DeclareRetMemInst : + NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num), + ".param .align $align .b8 retval$num[$size];", + [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>; +def DeclareRetScalarInst : + NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), + ".param .b$size retval$num;", + [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>; +def DeclareRetRegInst : + NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), + ".reg .b$size retval$num;", + [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>; + +def DeclareParamInst : + NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size), + ".param .align $align .b8 param$a[$size];", + [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>; +def DeclareScalarParamInst : + NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), + ".param .b$size param$a;", + [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>; +def DeclareScalarRegInst : + NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), + ".reg .b$size param$a;", + [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>; class MoveParamInst<NVPTXRegClass regclass, string asmstr> : - NVPTXInst<(outs regclass:$dst), (ins regclass:$src), - 
!strconcat(!strconcat("mov", asmstr), "\t$dst, $src;"), - [(set regclass:$dst, (MoveParam regclass:$src))]>; + NVPTXInst<(outs regclass:$dst), (ins regclass:$src), + !strconcat("mov", asmstr, "\t$dst, $src;"), + [(set regclass:$dst, (MoveParam regclass:$src))]>; def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">; def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">; -def MoveParamI16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), - "cvt.u16.u32\t$dst, $src;", - [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>; +def MoveParamI16 : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), + "cvt.u16.u32\t$dst, $src;", + [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>; def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">; def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">; class PseudoUseParamInst<NVPTXRegClass regclass> : - NVPTXInst<(outs), (ins regclass:$src), - "// Pseudo use of $src", - [(PseudoUseParam regclass:$src)]>; + NVPTXInst<(outs), (ins regclass:$src), + "// Pseudo use of $src", + [(PseudoUseParam regclass:$src)]>; def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>; def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>; @@ -2064,254 +2051,278 @@ def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>; // Load / Store Handling // multiclass LD<NVPTXRegClass regclass> { - def _avar : NVPTXInst<(outs regclass:$dst), + def _avar : NVPTXInst< + (outs regclass:$dst), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr), -!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t$dst, [$addr];"), []>; - def _areg : NVPTXInst<(outs regclass:$dst), + i32imm:$fromWidth, imem:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr];", []>; + def _areg : NVPTXInst< + (outs regclass:$dst), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr), -!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t$dst, [$addr];"), []>; - def _areg_64 : NVPTXInst<(outs regclass:$dst), + i32imm:$fromWidth, Int32Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr];", []>; + def _areg_64 : NVPTXInst< + (outs regclass:$dst), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr), - !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth", - " \t$dst, [$addr];"), []>; - def _ari : NVPTXInst<(outs regclass:$dst), + i32imm:$fromWidth, Int64Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr];", []>; + def _ari : NVPTXInst< + (outs regclass:$dst), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), -!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t$dst, [$addr+$offset];"), []>; - def _ari_64 : NVPTXInst<(outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth", - " \t$dst, [$addr+$offset];"), []>; - def _asi : NVPTXInst<(outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr, i32imm:$offset), -!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - 
"$fromWidth \t$dst, [$addr+$offset];"), []>; + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr+$offset];", []>; + def _ari_64 : NVPTXInst< + (outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr+$offset];", []>; + def _asi : NVPTXInst< + (outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr+$offset];", []>; } let mayLoad=1, hasSideEffects=0 in { -defm LD_i8 : LD<Int16Regs>; -defm LD_i16 : LD<Int16Regs>; -defm LD_i32 : LD<Int32Regs>; -defm LD_i64 : LD<Int64Regs>; -defm LD_f32 : LD<Float32Regs>; -defm LD_f64 : LD<Float64Regs>; + defm LD_i8 : LD<Int16Regs>; + defm LD_i16 : LD<Int16Regs>; + defm LD_i32 : LD<Int32Regs>; + defm LD_i64 : LD<Int64Regs>; + defm LD_f32 : LD<Float32Regs>; + defm LD_f64 : LD<Float64Regs>; } multiclass ST<NVPTXRegClass regclass> { - def _avar : NVPTXInst<(outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, imem:$addr), -!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", - " \t[$addr], $src;"), []>; - def _areg : NVPTXInst<(outs), + def _avar : NVPTXInst< + (outs), (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), -!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", - " \t[$addr], $src;"), []>; - def _areg_64 : NVPTXInst<(outs), + LdStCode:$Sign, i32imm:$toWidth, imem:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr], $src;", []>; + def _areg : NVPTXInst< + (outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr], $src;", []>; + def _areg_64 : NVPTXInst< + (outs), (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), - !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ", - "\t[$addr], $src;"), []>; - def _ari : NVPTXInst<(outs), + LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr], $src;", []>; + def _ari : NVPTXInst< + (outs), (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), -!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", - " \t[$addr+$offset], $src;"), []>; - def _ari_64 : NVPTXInst<(outs), + LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr+$offset], $src;", []>; + def _ari_64 : NVPTXInst< + (outs), (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset), - !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ", - "\t[$addr+$offset], $src;"), []>; - def _asi : NVPTXInst<(outs), + LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, 
i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr+$offset], $src;", []>; + def _asi : NVPTXInst< + (outs), (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), -!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", - " \t[$addr+$offset], $src;"), []>; + LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr+$offset], $src;", []>; } let mayStore=1, hasSideEffects=0 in { -defm ST_i8 : ST<Int16Regs>; -defm ST_i16 : ST<Int16Regs>; -defm ST_i32 : ST<Int32Regs>; -defm ST_i64 : ST<Int64Regs>; -defm ST_f32 : ST<Float32Regs>; -defm ST_f64 : ST<Float64Regs>; + defm ST_i8 : ST<Int16Regs>; + defm ST_i16 : ST<Int16Regs>; + defm ST_i32 : ST<Int32Regs>; + defm ST_i64 : ST<Int64Regs>; + defm ST_f32 : ST<Float32Regs>; + defm ST_f64 : ST<Float64Regs>; } -// The following is used only in and after vector elementizations. -// Vector elementization happens at the machine instruction level, so the -// following instruction -// never appears in the DAG. +// The following is used only in and after vector elementizations. Vector +// elementization happens at the machine instruction level, so the following +// instructions never appear in the DAG. multiclass LD_VEC<NVPTXRegClass regclass> { - def _v2_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + def _v2_avar : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr), - !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>; - def _v2_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + i32imm:$fromWidth, imem:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr];", []>; + def _v2_areg : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr), - !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>; - def _v2_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + i32imm:$fromWidth, Int32Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr];", []>; + def _v2_areg_64 : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr), - !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>; - def _v2_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + i32imm:$fromWidth, Int64Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr];", []>; + def _v2_ari : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>; - def _v2_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + 
"ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; + def _v2_ari_64 : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>; - def _v2_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; + def _v2_asi : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr, i32imm:$offset), - !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>; - def _v4_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, - regclass:$dst3, regclass:$dst4), + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; + def _v4_avar : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr), - !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>; - def _v4_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, - regclass:$dst4), + i32imm:$fromWidth, imem:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; + def _v4_areg : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr), - !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>; - def _v4_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, - regclass:$dst3, regclass:$dst4), + i32imm:$fromWidth, Int32Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; + def _v4_areg_64 : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr), - !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>; - def _v4_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, - regclass:$dst4), + i32imm:$fromWidth, Int64Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; + def _v4_ari : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"), - []>; - def _v4_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, - 
regclass:$dst3, regclass:$dst4), + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; + def _v4_ari_64 : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"), - []>; - def _v4_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, - regclass:$dst4), + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; + def _v4_asi : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr, i32imm:$offset), - !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"), - []>; + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; } let mayLoad=1, hasSideEffects=0 in { -defm LDV_i8 : LD_VEC<Int16Regs>; -defm LDV_i16 : LD_VEC<Int16Regs>; -defm LDV_i32 : LD_VEC<Int32Regs>; -defm LDV_i64 : LD_VEC<Int64Regs>; -defm LDV_f32 : LD_VEC<Float32Regs>; -defm LDV_f64 : LD_VEC<Float64Regs>; + defm LDV_i8 : LD_VEC<Int16Regs>; + defm LDV_i16 : LD_VEC<Int16Regs>; + defm LDV_i32 : LD_VEC<Int32Regs>; + defm LDV_i64 : LD_VEC<Int64Regs>; + defm LDV_f32 : LD_VEC<Float32Regs>; + defm LDV_f64 : LD_VEC<Float64Regs>; } multiclass ST_VEC<NVPTXRegClass regclass> { - def _v2_avar : NVPTXInst<(outs), + def _v2_avar : NVPTXInst< + (outs), (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t[$addr], {{$src1, $src2}};"), []>; - def _v2_areg : NVPTXInst<(outs), + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2}};", []>; + def _v2_areg : NVPTXInst< + (outs), (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t[$addr], {{$src1, $src2}};"), []>; - def _v2_areg_64 : NVPTXInst<(outs), + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2}};", []>; + def _v2_areg_64 : NVPTXInst< + (outs), (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t[$addr], {{$src1, $src2}};"), []>; - def _v2_ari : NVPTXInst<(outs), + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2}};", []>; + def _v2_ari : 
NVPTXInst< + (outs), (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, - i32imm:$offset), - !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>; - def _v2_ari_64 : NVPTXInst<(outs), + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, + i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2}};", []>; + def _v2_ari_64 : NVPTXInst< + (outs), (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, - i32imm:$offset), - !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>; - def _v2_asi : NVPTXInst<(outs), + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, + i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2}};", []>; + def _v2_asi : NVPTXInst< + (outs), (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, - i32imm:$offset), - !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>; - def _v4_avar : NVPTXInst<(outs), + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, + i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2}};", []>; + def _v4_avar : NVPTXInst< + (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr), - !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>; - def _v4_areg : NVPTXInst<(outs), + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_areg : NVPTXInst< + (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr), - !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>; - def _v4_areg_64 : NVPTXInst<(outs), + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_areg_64 : NVPTXInst< + (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr), - !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>; - def _v4_ari : NVPTXInst<(outs), + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_ari : 
NVPTXInst< + (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"), - []>; - def _v4_ari_64 : NVPTXInst<(outs), + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_ari_64 : NVPTXInst< + (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"), - []>; - def _v4_asi : NVPTXInst<(outs), + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_asi : NVPTXInst< + (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr, i32imm:$offset), - !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", - "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"), - []>; + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}" + "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; } + let mayStore=1, hasSideEffects=0 in { -defm STV_i8 : ST_VEC<Int16Regs>; -defm STV_i16 : ST_VEC<Int16Regs>; -defm STV_i32 : ST_VEC<Int32Regs>; -defm STV_i64 : ST_VEC<Int64Regs>; -defm STV_f32 : ST_VEC<Float32Regs>; -defm STV_f64 : ST_VEC<Float64Regs>; + defm STV_i8 : ST_VEC<Int16Regs>; + defm STV_i16 : ST_VEC<Int16Regs>; + defm STV_i32 : ST_VEC<Int32Regs>; + defm STV_i64 : ST_VEC<Int64Regs>; + defm STV_f32 : ST_VEC<Float32Regs>; + defm STV_f64 : ST_VEC<Float64Regs>; } @@ -2525,64 +2536,52 @@ def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b), (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; -// pack a set of smaller int registers to a larger int register -def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d), - (ins Int16Regs:$s1, Int16Regs:$s2, - Int16Regs:$s3, Int16Regs:$s4), - "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};", - []>; -def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d), - (ins Int16Regs:$s1, Int16Regs:$s2), - "mov.b32\t$d, {{$s1, $s2}};", - []>; -def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d), - (ins Int32Regs:$s1, Int32Regs:$s2), - "mov.b64\t$d, {{$s1, $s2}};", - []>; -def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d), - (ins Float32Regs:$s1, Float32Regs:$s2), - "mov.b64\t$d, {{$s1, $s2}};", - []>; - -// unpack a larger int register to a set of smaller int registers -def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2, - Int16Regs:$d3, Int16Regs:$d4), - (ins Int64Regs:$s), - "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;", - []>; -def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2), - (ins Int32Regs:$s), - "mov.b32\t{{$d1, $d2}}, $s;", - []>; -def 
I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2), - (ins Int64Regs:$s), - "mov.b64\t{{$d1, $d2}}, $s;", - []>; -def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2), - (ins Float64Regs:$s), - "mov.b64\t{{$d1, $d2}}, $s;", - []>; +let hasSideEffects = 0 in { + // pack a set of smaller int registers to a larger int register + def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d), + (ins Int16Regs:$s1, Int16Regs:$s2, + Int16Regs:$s3, Int16Regs:$s4), + "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};", []>; + def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d), + (ins Int16Regs:$s1, Int16Regs:$s2), + "mov.b32\t$d, {{$s1, $s2}};", []>; + def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d), + (ins Int32Regs:$s1, Int32Regs:$s2), + "mov.b64\t$d, {{$s1, $s2}};", []>; + def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d), + (ins Float32Regs:$s1, Float32Regs:$s2), + "mov.b64\t$d, {{$s1, $s2}};", []>; + + // unpack a larger int register to a set of smaller int registers + def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2, + Int16Regs:$d3, Int16Regs:$d4), + (ins Int64Regs:$s), + "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;", []>; + def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2), + (ins Int32Regs:$s), + "mov.b32\t{{$d1, $d2}}, $s;", []>; + def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2), + (ins Int64Regs:$s), + "mov.b64\t{{$d1, $d2}}, $s;", []>; + def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2), + (ins Float64Regs:$s), + "mov.b64\t{{$d1, $d2}}, $s;", []>; +} // Count leading zeros -def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), - "clz.b32\t$d, $a;", - []>; -def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "clz.b64\t$d, $a;", - []>; +let hasSideEffects = 0 in { + def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), + "clz.b32\t$d, $a;", []>; + def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "clz.b64\t$d, $a;", []>; +} // 32-bit has a direct PTX instruction -def : Pat<(ctlz Int32Regs:$a), - (CLZr32 Int32Regs:$a)>; -def : Pat<(ctlz_zero_undef Int32Regs:$a), - (CLZr32 Int32Regs:$a)>; +def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>; // For 64-bit, the result in PTX is actually 32-bit so we zero-extend // to 64-bit to match the LLVM semantics -def : Pat<(ctlz Int64Regs:$a), - (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; -def : Pat<(ctlz_zero_undef Int64Regs:$a), - (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; +def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; // For 16-bit, we zero-extend to 32-bit, then trunc the result back // to 16-bits (ctlz of a 16-bit value is guaranteed to require less @@ -2592,34 +2591,27 @@ def : Pat<(ctlz Int16Regs:$a), (SUBi16ri (CVT_u16_u32 (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>; -def : Pat<(ctlz_zero_undef Int16Regs:$a), - (SUBi16ri (CVT_u16_u32 (CLZr32 - (CVT_u32_u16 Int16Regs:$a, CvtNONE)), - CvtNONE), 16)>; // Population count -def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), - "popc.b32\t$d, $a;", - []>; -def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "popc.b64\t$d, $a;", - []>; +let hasSideEffects = 0 in { + def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), + "popc.b32\t$d, $a;", []>; + def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "popc.b64\t$d, $a;", []>; +} // 32-bit has a direct PTX instruction -def : Pat<(ctpop Int32Regs:$a), - (POPCr32 Int32Regs:$a)>; +def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>; // For 64-bit, the 
result in PTX is actually 32-bit so we zero-extend // to 64-bit to match the LLVM semantics -def : Pat<(ctpop Int64Regs:$a), - (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>; +def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>; // For 16-bit, we zero-extend to 32-bit, then trunc the result back // to 16-bits (ctpop of a 16-bit value is guaranteed to require less // than 16 bits to store) def : Pat<(ctpop Int16Regs:$a), - (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), - CvtNONE)>; + (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>; // fround f64 -> f32 def : Pat<(f32 (fround Float64Regs:$a)), @@ -2633,8 +2625,8 @@ def : Pat<(f64 (fextend Float32Regs:$a)), def : Pat<(f64 (fextend Float32Regs:$a)), (CVT_f64_f32 Float32Regs:$a, CvtNONE)>; -def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; +def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; //----------------------------------- // Control-flow @@ -2646,88 +2638,77 @@ let isTerminator=1 in { let isBranch=1 in def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target), - "@$a bra \t$target;", - [(brcond Int1Regs:$a, bb:$target)]>; + "@$a bra \t$target;", + [(brcond Int1Regs:$a, bb:$target)]>; let isBranch=1 in def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target), - "@!$a bra \t$target;", - []>; + "@!$a bra \t$target;", []>; let isBranch=1, isBarrier=1 in def GOTO : NVPTXInst<(outs), (ins brtarget:$target), - "bra.uni \t$target;", - [(br bb:$target)]>; + "bra.uni \t$target;", [(br bb:$target)]>; } def : Pat<(brcond Int32Regs:$a, bb:$target), (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>; // SelectionDAGBuilder::visitSWitchCase() will invert the condition of a -// conditional branch if -// the target block is the next block so that the code can fall through to the -// target block. -// The invertion is done by 'xor condition, 1', which will be translated to -// (setne condition, -1). -// Since ptx supports '@!pred bra target', we should use it. +// conditional branch if the target block is the next block so that the code +// can fall through to the target block. The invertion is done by 'xor +// condition, 1', which will be translated to (setne condition, -1). Since ptx +// supports '@!pred bra target', we should use it. 
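For illustration, a minimal C++ sketch (not part of this patch) of the IR shape the inversion comment above describes; the helper name and the basic-block parameters are hypothetical. SelectionDAGBuilder's inversion amounts to 'xor i1 %cond, true', which instruction selection sees as (setne %cond, -1), and the CBranchOther pattern just below selects it directly, so PTX gets '@!%p bra target;' without materializing the xor.

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Sketch: emit the inverted branch; on !cond we jump to FalseBB, otherwise
// control falls through to TrueBB (the lexically next block).
static void emitInvertedBranch(IRBuilder<> &B, Value *Cond,
                               BasicBlock *FalseBB, BasicBlock *TrueBB) {
  // 'xor i1 %cond, true' is the inversion; ISel matches it as
  // (setne %cond, -1), which the pattern below lowers to CBranchOther.
  Value *NotCond = B.CreateXor(Cond, B.getTrue(), "not.cond");
  B.CreateCondBr(NotCond, FalseBB, TrueBB);
}
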
def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target), - (CBranchOther Int1Regs:$a, bb:$target)>; + (CBranchOther Int1Regs:$a, bb:$target)>; // Call -def SDT_NVPTXCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; -def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, - SDTCisVT<1, i32> ]>; +def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; -def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd, +def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, - SDNPSideEffect]>; + SDNPSideEffect]>; def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def calltarget : Operand<i32>; let isCall=1 in { - def CALL : NVPTXInst<(outs), (ins calltarget:$dst), - "call \t$dst, (1);", []>; + def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>; } -def : Pat<(call tglobaladdr:$dst), - (CALL tglobaladdr:$dst)>; -def : Pat<(call texternalsym:$dst), - (CALL texternalsym:$dst)>; +def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>; +def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>; // Pseudo instructions. class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern> : NVPTXInst<outs, ins, asmstr, pattern>; -// @TODO: We use some tricks here to emit curly braces. Can we clean this up -// a bit without TableGen modifications? -def Callseq_Start : NVPTXInst<(outs), (ins i32imm:$amt), - "// Callseq Start $amt\n\t{{\n\t.reg .b32 temp_param_reg;\n\t// <end>}}", - [(callseq_start timm:$amt)]>; -def Callseq_End : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), - "\n\t//{{\n\t}}// Callseq End $amt1", - [(callseq_end timm:$amt1, timm:$amt2)]>; +def Callseq_Start : + NVPTXInst<(outs), (ins i32imm:$amt), + "\\{ // callseq $amt\n" + "\t.reg .b32 temp_param_reg;", + [(callseq_start timm:$amt)]>; +def Callseq_End : + NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "\\} // callseq $amt1", + [(callseq_end timm:$amt1, timm:$amt2)]>; // trap instruction - -def trapinst : NVPTXInst<(outs), (ins), - "trap;", - [(trap)]>; +def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>; // Call prototype wrapper def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def CallPrototype - : SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallPrototype : + SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; def ProtoIdent : Operand<i32> { let PrintMethod = "printProtoIdent"; } -def CALL_PROTOTYPE - : NVPTXInst<(outs), (ins ProtoIdent:$ident), - "$ident", [(CallPrototype (i32 texternalsym:$ident))]>; - +def CALL_PROTOTYPE : + NVPTXInst<(outs), (ins ProtoIdent:$ident), + "$ident", [(CallPrototype (i32 texternalsym:$ident))]>; include "NVPTXIntrinsics.td" diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 14e51aa309ea..ed16afa24752 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -30,11 +30,9 @@ def immDouble1 : PatLeaf<(fpimm), [{ //----------------------------------- -// Synchronization Functions +// Synchronization and shuffle functions //----------------------------------- -def 
INT_CUDA_SYNCTHREADS : NVPTXInst<(outs), (ins), - "bar.sync \t0;", - [(int_cuda_syncthreads)]>; +let isConvergent = 1 in { def INT_BARRIER0 : NVPTXInst<(outs), (ins), "bar.sync \t0;", [(int_nvvm_barrier0)]>; @@ -64,6 +62,51 @@ def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred), !strconcat("}}", ""))))))), [(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>; +def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;", + [(int_nvvm_bar_sync imm:$i)]>; + +// shfl.{up,down,bfly,idx}.b32 +multiclass SHFL<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { + // The last two parameters to shfl can be regs or imms. ptxas is smart + // enough to inline constant registers, so strictly speaking we don't need to + // handle immediates here. But it's easy enough, and it makes our ptx more + // readable. + def reg : NVPTXInst< + (outs regclass:$dst), + (ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask), + !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), + [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>; + + def imm1 : NVPTXInst< + (outs regclass:$dst), + (ins regclass:$src, i32imm:$offset, Int32Regs:$mask), + !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), + [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>; + + def imm2 : NVPTXInst< + (outs regclass:$dst), + (ins regclass:$src, Int32Regs:$offset, i32imm:$mask), + !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), + [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>; + + def imm3 : NVPTXInst< + (outs regclass:$dst), + (ins regclass:$src, i32imm:$offset, i32imm:$mask), + !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), + [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>; +} + +defm INT_SHFL_DOWN_I32 : SHFL<Int32Regs, "down", int_nvvm_shfl_down_i32>; +defm INT_SHFL_DOWN_F32 : SHFL<Float32Regs, "down", int_nvvm_shfl_down_f32>; +defm INT_SHFL_UP_I32 : SHFL<Int32Regs, "up", int_nvvm_shfl_up_i32>; +defm INT_SHFL_UP_F32 : SHFL<Float32Regs, "up", int_nvvm_shfl_up_f32>; +defm INT_SHFL_BFLY_I32 : SHFL<Int32Regs, "bfly", int_nvvm_shfl_bfly_i32>; +defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>; +defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>; +defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>; + +} // isConvergent = 1 + //----------------------------------- // Explicit Memory Fence Functions @@ -1335,51 +1378,17 @@ defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64", ".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>; -//----------------------------------- -// Read Special Registers -//----------------------------------- -class F_SREG<string OpStr, NVPTXRegClass regclassOut, Intrinsic IntOp> : - NVPTXInst<(outs regclassOut:$dst), (ins), - OpStr, - [(set regclassOut:$dst, (IntOp))]>; - -def INT_PTX_SREG_TID_X : F_SREG<"mov.u32 \t$dst, %tid.x;", Int32Regs, - int_nvvm_read_ptx_sreg_tid_x>; -def INT_PTX_SREG_TID_Y : F_SREG<"mov.u32 \t$dst, %tid.y;", Int32Regs, - int_nvvm_read_ptx_sreg_tid_y>; -def INT_PTX_SREG_TID_Z : F_SREG<"mov.u32 \t$dst, %tid.z;", Int32Regs, - int_nvvm_read_ptx_sreg_tid_z>; - -def INT_PTX_SREG_NTID_X : F_SREG<"mov.u32 \t$dst, %ntid.x;", Int32Regs, - int_nvvm_read_ptx_sreg_ntid_x>; -def INT_PTX_SREG_NTID_Y : F_SREG<"mov.u32 \t$dst, %ntid.y;", Int32Regs, - int_nvvm_read_ptx_sreg_ntid_y>; -def INT_PTX_SREG_NTID_Z : 
F_SREG<"mov.u32 \t$dst, %ntid.z;", Int32Regs, - int_nvvm_read_ptx_sreg_ntid_z>; - -def INT_PTX_SREG_CTAID_X : F_SREG<"mov.u32 \t$dst, %ctaid.x;", Int32Regs, - int_nvvm_read_ptx_sreg_ctaid_x>; -def INT_PTX_SREG_CTAID_Y : F_SREG<"mov.u32 \t$dst, %ctaid.y;", Int32Regs, - int_nvvm_read_ptx_sreg_ctaid_y>; -def INT_PTX_SREG_CTAID_Z : F_SREG<"mov.u32 \t$dst, %ctaid.z;", Int32Regs, - int_nvvm_read_ptx_sreg_ctaid_z>; - -def INT_PTX_SREG_NCTAID_X : F_SREG<"mov.u32 \t$dst, %nctaid.x;", Int32Regs, - int_nvvm_read_ptx_sreg_nctaid_x>; -def INT_PTX_SREG_NCTAID_Y : F_SREG<"mov.u32 \t$dst, %nctaid.y;", Int32Regs, - int_nvvm_read_ptx_sreg_nctaid_y>; -def INT_PTX_SREG_NCTAID_Z : F_SREG<"mov.u32 \t$dst, %nctaid.z;", Int32Regs, - int_nvvm_read_ptx_sreg_nctaid_z>; - -def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs, - int_nvvm_read_ptx_sreg_warpsize>; //----------------------------------- // Support for ldu on sm_20 or later //----------------------------------- +// Don't annotate ldu instructions as mayLoad, as they load from memory that is +// read-only in a kernel. + // Scalar + multiclass LDU_G<string TyStr, NVPTXRegClass regclass> { def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), !strconcat("ldu.global.", TyStr), @@ -1475,6 +1484,10 @@ defm INT_PTX_LDU_G_v4f32_ELE // Support for ldg on sm_35 or later //----------------------------------- +// Don't annotate ld.global.nc as mayLoad, because these loads go through the +// non-coherent texture cache, and therefore the values read must be read-only +// during the lifetime of the kernel. + multiclass LDG_G<string TyStr, NVPTXRegClass regclass> { def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), !strconcat("ld.global.nc.", TyStr), @@ -1836,54 +1849,61 @@ def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt), (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>, Requires<[noHWROT32]> ; -def GET_LO_INT64 - : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), - !strconcat("{{\n\t", - !strconcat(".reg .b32 %dummy;\n\t", - !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t", - !strconcat("}}", "")))), - []> ; - -def GET_HI_INT64 - : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), - !strconcat("{{\n\t", - !strconcat(".reg .b32 %dummy;\n\t", - !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t", - !strconcat("}}", "")))), - []> ; - -def PACK_TWO_INT32 - : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi), - "mov.b64 \t$dst, {{$lo, $hi}};", []> ; +let hasSideEffects = 0 in { + def GET_LO_INT64 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %dummy;\n\t", + !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t", + !strconcat("}}", "")))), + []> ; + + def GET_HI_INT64 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %dummy;\n\t", + !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t", + !strconcat("}}", "")))), + []> ; +} + +let hasSideEffects = 0 in { + def PACK_TWO_INT32 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi), + "mov.b64 \t$dst, {{$lo, $hi}};", []> ; +} def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src), (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src), (GET_LO_INT64 Int64Regs:$src))> ; -// funnel shift, requires >= sm_32 -def SHF_L_WRAP_B32_IMM - : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), - "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, - Requires<[hasHWROT32]>; +// Funnel shift, requires >= sm_32. 
Does not trap if amt is out of range, so +// no side effects. +let hasSideEffects = 0 in { + def SHF_L_WRAP_B32_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; -def SHF_L_WRAP_B32_REG - : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), - "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, - Requires<[hasHWROT32]>; + def SHF_L_WRAP_B32_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; -def SHF_R_WRAP_B32_IMM - : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), - "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, - Requires<[hasHWROT32]>; + def SHF_R_WRAP_B32_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), + "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; -def SHF_R_WRAP_B32_REG - : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), - "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, - Requires<[hasHWROT32]>; + def SHF_R_WRAP_B32_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; +} // HW version of rotate 64 def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)), @@ -6950,98 +6970,95 @@ def : Pat<(int_nvvm_sust_p_3d_v4i32_trap Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; +//----------------------------------- +// Read Special Registers +//----------------------------------- - -//===-- Old PTX Back-end Intrinsics ---------------------------------------===// - -// These intrinsics are handled to retain compatibility with the old backend. 
- -// PTX Special Purpose Register Accessor Intrinsics - -class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop> +class PTX_READ_SREG_R64<string regname, Intrinsic intop> : NVPTXInst<(outs Int64Regs:$d), (ins), !strconcat(!strconcat("mov.u64\t$d, %", regname), ";"), [(set Int64Regs:$d, (intop))]>; -class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop> +class PTX_READ_SREG_R32<string regname, Intrinsic intop> : NVPTXInst<(outs Int32Regs:$d), (ins), !strconcat(!strconcat("mov.u32\t$d, %", regname), ";"), [(set Int32Regs:$d, (intop))]>; // TODO Add read vector-version of special registers -def PTX_READ_TID_X : PTX_READ_SPECIAL_REGISTER_R32<"tid.x", - int_ptx_read_tid_x>; -def PTX_READ_TID_Y : PTX_READ_SPECIAL_REGISTER_R32<"tid.y", - int_ptx_read_tid_y>; -def PTX_READ_TID_Z : PTX_READ_SPECIAL_REGISTER_R32<"tid.z", - int_ptx_read_tid_z>; -def PTX_READ_TID_W : PTX_READ_SPECIAL_REGISTER_R32<"tid.w", - int_ptx_read_tid_w>; - -def PTX_READ_NTID_X : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x", - int_ptx_read_ntid_x>; -def PTX_READ_NTID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y", - int_ptx_read_ntid_y>; -def PTX_READ_NTID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z", - int_ptx_read_ntid_z>; -def PTX_READ_NTID_W : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w", - int_ptx_read_ntid_w>; - -def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid", - int_ptx_read_laneid>; -def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid", - int_ptx_read_warpid>; -def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid", - int_ptx_read_nwarpid>; - -def PTX_READ_CTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x", - int_ptx_read_ctaid_x>; -def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y", - int_ptx_read_ctaid_y>; -def PTX_READ_CTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z", - int_ptx_read_ctaid_z>; -def PTX_READ_CTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w", - int_ptx_read_ctaid_w>; - -def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x", - int_ptx_read_nctaid_x>; -def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y", - int_ptx_read_nctaid_y>; -def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z", - int_ptx_read_nctaid_z>; -def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w", - int_ptx_read_nctaid_w>; - -def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid", - int_ptx_read_smid>; -def PTX_READ_NSMID : PTX_READ_SPECIAL_REGISTER_R32<"nsmid", - int_ptx_read_nsmid>; -def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid", - int_ptx_read_gridid>; - -def PTX_READ_LANEMASK_EQ - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>; -def PTX_READ_LANEMASK_LE - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>; -def PTX_READ_LANEMASK_LT - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>; -def PTX_READ_LANEMASK_GE - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>; -def PTX_READ_LANEMASK_GT - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>; - -def PTX_READ_CLOCK - : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>; -def PTX_READ_CLOCK64 - : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>; - -def PTX_READ_PM0 : PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>; -def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>; -def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>; -def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", 
int_ptx_read_pm3>; - -// PTX Parallel Synchronization and Communication Intrinsics - -def PTX_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;", - [(int_ptx_bar_sync imm:$i)]>; +def INT_PTX_SREG_TID_X : + PTX_READ_SREG_R32<"tid.x", int_nvvm_read_ptx_sreg_tid_x>; +def INT_PTX_SREG_TID_Y : + PTX_READ_SREG_R32<"tid.y", int_nvvm_read_ptx_sreg_tid_y>; +def INT_PTX_SREG_TID_Z : + PTX_READ_SREG_R32<"tid.z", int_nvvm_read_ptx_sreg_tid_z>; +def INT_PTX_SREG_TID_W : + PTX_READ_SREG_R32<"tid.w", int_nvvm_read_ptx_sreg_tid_w>; + +def INT_PTX_SREG_NTID_X : + PTX_READ_SREG_R32<"ntid.x", int_nvvm_read_ptx_sreg_ntid_x>; +def INT_PTX_SREG_NTID_Y : + PTX_READ_SREG_R32<"ntid.y", int_nvvm_read_ptx_sreg_ntid_y>; +def INT_PTX_SREG_NTID_Z : + PTX_READ_SREG_R32<"ntid.z", int_nvvm_read_ptx_sreg_ntid_z>; +def INT_PTX_SREG_NTID_W : + PTX_READ_SREG_R32<"ntid.w", int_nvvm_read_ptx_sreg_ntid_w>; + +def INT_PTX_SREG_LANEID : + PTX_READ_SREG_R32<"laneid", int_nvvm_read_ptx_sreg_laneid>; +def INT_PTX_SREG_WARPID : + PTX_READ_SREG_R32<"warpid", int_nvvm_read_ptx_sreg_warpid>; +def INT_PTX_SREG_NWARPID : + PTX_READ_SREG_R32<"nwarpid", int_nvvm_read_ptx_sreg_nwarpid>; + +def INT_PTX_SREG_CTAID_X : + PTX_READ_SREG_R32<"ctaid.x", int_nvvm_read_ptx_sreg_ctaid_x>; +def INT_PTX_SREG_CTAID_Y : + PTX_READ_SREG_R32<"ctaid.y", int_nvvm_read_ptx_sreg_ctaid_y>; +def INT_PTX_SREG_CTAID_Z : + PTX_READ_SREG_R32<"ctaid.z", int_nvvm_read_ptx_sreg_ctaid_z>; +def INT_PTX_SREG_CTAID_W : + PTX_READ_SREG_R32<"ctaid.w", int_nvvm_read_ptx_sreg_ctaid_w>; + +def INT_PTX_SREG_NCTAID_X : + PTX_READ_SREG_R32<"nctaid.x", int_nvvm_read_ptx_sreg_nctaid_x>; +def INT_PTX_SREG_NCTAID_Y : + PTX_READ_SREG_R32<"nctaid.y", int_nvvm_read_ptx_sreg_nctaid_y>; +def INT_PTX_SREG_NCTAID_Z : + PTX_READ_SREG_R32<"nctaid.z", int_nvvm_read_ptx_sreg_nctaid_z>; +def INT_PTX_SREG_NCTAID_W : + PTX_READ_SREG_R32<"nctaid.w", int_nvvm_read_ptx_sreg_nctaid_w>; + +def INT_PTX_SREG_SMID : + PTX_READ_SREG_R32<"smid", int_nvvm_read_ptx_sreg_smid>; +def INT_PTX_SREG_NSMID : + PTX_READ_SREG_R32<"nsmid", int_nvvm_read_ptx_sreg_nsmid>; +def INT_PTX_SREG_GRIDID : + PTX_READ_SREG_R32<"gridid", int_nvvm_read_ptx_sreg_gridid>; + +def INT_PTX_SREG_LANEMASK_EQ : + PTX_READ_SREG_R32<"lanemask_eq", int_nvvm_read_ptx_sreg_lanemask_eq>; +def INT_PTX_SREG_LANEMASK_LE : + PTX_READ_SREG_R32<"lanemask_le", int_nvvm_read_ptx_sreg_lanemask_le>; +def INT_PTX_SREG_LANEMASK_LT : + PTX_READ_SREG_R32<"lanemask_lt", int_nvvm_read_ptx_sreg_lanemask_lt>; +def INT_PTX_SREG_LANEMASK_GE : + PTX_READ_SREG_R32<"lanemask_ge", int_nvvm_read_ptx_sreg_lanemask_ge>; +def INT_PTX_SREG_LANEMASK_GT : + PTX_READ_SREG_R32<"lanemask_gt", int_nvvm_read_ptx_sreg_lanemask_gt>; + +def INT_PTX_SREG_CLOCK : + PTX_READ_SREG_R32<"clock", int_nvvm_read_ptx_sreg_clock>; +def INT_PTX_SREG_CLOCK64 : + PTX_READ_SREG_R64<"clock64", int_nvvm_read_ptx_sreg_clock64>; + +def INT_PTX_SREG_PM0 : PTX_READ_SREG_R32<"pm0", int_nvvm_read_ptx_sreg_pm0>; +def INT_PTX_SREG_PM1 : PTX_READ_SREG_R32<"pm1", int_nvvm_read_ptx_sreg_pm1>; +def INT_PTX_SREG_PM2 : PTX_READ_SREG_R32<"pm2", int_nvvm_read_ptx_sreg_pm2>; +def INT_PTX_SREG_PM3 : PTX_READ_SREG_R32<"pm3", int_nvvm_read_ptx_sreg_pm3>; + +// TODO: It would be nice to use PTX_READ_SREG here, but it doesn't +// handle the constant. 
+def INT_PTX_SREG_WARPSIZE : + NVPTXInst<(outs Int32Regs:$dst), (ins), "mov.u32 \t$dst, WARP_SZ;", + [(set Int32Regs:$dst, (int_nvvm_read_ptx_sreg_warpsize))]>; diff --git a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp index 624052e9b981..fa1a3ef3fe24 100644 --- a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp @@ -62,6 +62,9 @@ INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca", // Main function for this pass. // ============================================================================= bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) { + if (skipBasicBlock(BB)) + return false; + bool Changed = false; for (auto &I : BB) { if (auto allocaInst = dyn_cast<AllocaInst>(&I)) { diff --git a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp index 6656077348a1..d162a283f745 100644 --- a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp +++ b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp @@ -128,7 +128,7 @@ INITIALIZE_PASS(NVPTXLowerKernelArgs, "nvptx-lower-kernel-args", "Lower kernel arguments (NVPTX)", false, false) // ============================================================================= -// If the function had a byval struct ptr arg, say foo(%struct.x *byval %d), +// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d), // then add the following instructions to the first basic block: // // %temp = alloca %struct.x, align 8 diff --git a/lib/Target/NVPTX/NVPTXMCExpr.cpp b/lib/Target/NVPTX/NVPTXMCExpr.cpp index 3c98b9febf85..84d5239ec096 100644 --- a/lib/Target/NVPTX/NVPTXMCExpr.cpp +++ b/lib/Target/NVPTX/NVPTXMCExpr.cpp @@ -15,8 +15,8 @@ using namespace llvm; #define DEBUG_TYPE "nvptx-mcexpr" -const NVPTXFloatMCExpr* -NVPTXFloatMCExpr::create(VariantKind Kind, APFloat Flt, MCContext &Ctx) { +const NVPTXFloatMCExpr * +NVPTXFloatMCExpr::create(VariantKind Kind, const APFloat &Flt, MCContext &Ctx) { return new (Ctx) NVPTXFloatMCExpr(Kind, Flt); } diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h index 81a606d7535c..7f833c42fa8f 100644 --- a/lib/Target/NVPTX/NVPTXMCExpr.h +++ b/lib/Target/NVPTX/NVPTXMCExpr.h @@ -14,6 +14,7 @@ #include "llvm/ADT/APFloat.h" #include "llvm/MC/MCExpr.h" +#include <utility> namespace llvm { @@ -30,21 +31,21 @@ private: const APFloat Flt; explicit NVPTXFloatMCExpr(VariantKind Kind, APFloat Flt) - : Kind(Kind), Flt(Flt) {} + : Kind(Kind), Flt(std::move(Flt)) {} public: /// @name Construction /// @{ - static const NVPTXFloatMCExpr *create(VariantKind Kind, APFloat Flt, + static const NVPTXFloatMCExpr *create(VariantKind Kind, const APFloat &Flt, MCContext &Ctx); - static const NVPTXFloatMCExpr *createConstantFPSingle(APFloat Flt, + static const NVPTXFloatMCExpr *createConstantFPSingle(const APFloat &Flt, MCContext &Ctx) { return create(VK_NVPTX_SINGLE_PREC_FLOAT, Flt, Ctx); } - static const NVPTXFloatMCExpr *createConstantFPDouble(APFloat Flt, + static const NVPTXFloatMCExpr *createConstantFPDouble(const APFloat &Flt, MCContext &Ctx) { return create(VK_NVPTX_DOUBLE_PREC_FLOAT, Flt, Ctx); } diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp index a61c291d233f..7d0cd553e03f 100644 --- a/lib/Target/NVPTX/NVPTXPeephole.cpp +++ b/lib/Target/NVPTX/NVPTXPeephole.cpp @@ -125,6 +125,9 @@ static void CombineCVTAToLocal(MachineInstr &Root) { } bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + bool Changed = false; // Loop over 
all of the basic blocks. for (auto &MBB : MF) { diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp index 17019d7b364d..029e0097c5dc 100644 --- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp +++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -55,11 +55,10 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) { calculateFrameObjectOffsets(MF); - for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) { - for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { - MachineInstr *MI = I; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - if (!MI->getOperand(i).isFI()) + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (!MI.getOperand(i).isFI()) continue; TRI.eliminateFrameIndex(MI, 0, i, nullptr); Modified = true; diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h index 45a7309479ee..cad4f5668fdf 100644 --- a/lib/Target/NVPTX/NVPTXSection.h +++ b/lib/Target/NVPTX/NVPTXSection.h @@ -16,7 +16,6 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/MC/MCSection.h" -#include <vector> namespace llvm { /// Represents a section in PTX PTX does not have sections. We create this class diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h index c7287719be5f..41670390c41b 100644 --- a/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/lib/Target/NVPTX/NVPTXSubtarget.h @@ -19,8 +19,8 @@ #include "NVPTXISelLowering.h" #include "NVPTXInstrInfo.h" #include "NVPTXRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -42,7 +42,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { const NVPTXTargetMachine &TM; NVPTXInstrInfo InstrInfo; NVPTXTargetLowering TLInfo; - TargetSelectionDAGInfo TSInfo; + SelectionDAGTargetInfo TSInfo; // NVPTX does not have any call stack frame, but need a NVPTX specific // FrameLowering class because TargetFrameLowering is abstract. 
@@ -65,7 +65,7 @@ public: const NVPTXTargetLowering *getTargetLowering() const override { return &TLInfo; } - const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { + const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { return &TSInfo; } diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index aa931b134da9..b9f5919964c7 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LegacyPassManager.h" @@ -44,15 +45,23 @@ #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/GVN.h" using namespace llvm; +static cl::opt<bool> UseInferAddressSpaces( + "nvptx-use-infer-addrspace", cl::init(false), cl::Hidden, + cl::desc("Optimize address spaces using NVPTXInferAddressSpaces instead of " + "NVPTXFavorNonGenericAddrSpaces")); + namespace llvm { +void initializeNVVMIntrRangePass(PassRegistry&); void initializeNVVMReflectPass(PassRegistry&); void initializeGenericToNVVMPass(PassRegistry&); void initializeNVPTXAllocaHoistingPass(PassRegistry &); void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&); void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &); +void initializeNVPTXInferAddressSpacesPass(PassRegistry &); void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); void initializeNVPTXLowerKernelArgsPass(PassRegistry &); void initializeNVPTXLowerAllocaPass(PassRegistry &); @@ -67,10 +76,12 @@ extern "C" void LLVMInitializeNVPTXTarget() { // but it's very NVPTX-specific. PassRegistry &PR = *PassRegistry::getPassRegistry(); initializeNVVMReflectPass(PR); + initializeNVVMIntrRangePass(PR); initializeGenericToNVVMPass(PR); initializeNVPTXAllocaHoistingPass(PR); initializeNVPTXAssignValidGlobalNamesPass(PR); initializeNVPTXFavorNonGenericAddrSpacesPass(PR); + initializeNVPTXInferAddressSpacesPass(PR); initializeNVPTXLowerKernelArgsPass(PR); initializeNVPTXLowerAllocaPass(PR); initializeNVPTXLowerAggrCopiesPass(PR); @@ -90,11 +101,15 @@ static std::string computeDataLayout(bool is64Bit) { NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, + Optional<Reloc::Model> RM, + CodeModel::Model CM, CodeGenOpt::Level OL, bool is64bit) - : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options, RM, - CM, OL), - is64bit(is64bit), TLOF(make_unique<NVPTXTargetObjectFile>()), + // The pic relocation model is used regardless of what the client has + // specified, as it is the only relocation model currently supported. 
+ : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options, + Reloc::PIC_, CM, OL), + is64bit(is64bit), + TLOF(make_unique<NVPTXTargetObjectFile>()), Subtarget(TT, CPU, FS, *this) { if (TT.getOS() == Triple::NVCL) drvInterface = NVPTX::NVCL; @@ -110,7 +125,8 @@ void NVPTXTargetMachine32::anchor() {} NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, + Optional<Reloc::Model> RM, + CodeModel::Model CM, CodeGenOpt::Level OL) : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} @@ -119,7 +135,8 @@ void NVPTXTargetMachine64::anchor() {} NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, + Optional<Reloc::Model> RM, + CodeModel::Model CM, CodeGenOpt::Level OL) : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} @@ -143,14 +160,25 @@ public: void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; private: - // if the opt level is aggressive, add GVN; otherwise, add EarlyCSE. + // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This + // function is only called in opt mode. void addEarlyCSEOrGVNPass(); + + // Add passes that propagate special memory spaces. + void addAddressSpaceInferencePasses(); + + // Add passes that perform straight-line scalar optimizations. + void addStraightLineScalarOptimizationPasses(); }; } // end anonymous namespace TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { - NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM); - return PassConfig; + return new NVPTXPassConfig(this, PM); +} + +void NVPTXTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) { + PM.add(createNVVMReflectPass()); + PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion())); } TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() { @@ -166,34 +194,23 @@ void NVPTXPassConfig::addEarlyCSEOrGVNPass() { addPass(createEarlyCSEPass()); } -void NVPTXPassConfig::addIRPasses() { - // The following passes are known to not play well with virtual regs hanging - // around after register allocation (which in our case, is *all* registers). - // We explicitly disable them here. We do, however, need some functionality - // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the - // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). - disablePass(&PrologEpilogCodeInserterID); - disablePass(&MachineCopyPropagationID); - disablePass(&TailDuplicateID); - - addPass(createNVVMReflectPass()); - addPass(createNVPTXImageOptimizerPass()); - addPass(createNVPTXAssignValidGlobalNamesPass()); - addPass(createGenericToNVVMPass()); - - // === Propagate special address spaces === - addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine())); +void NVPTXPassConfig::addAddressSpaceInferencePasses() { // NVPTXLowerKernelArgs emits alloca for byval parameters which can often // be eliminated by SROA. addPass(createSROAPass()); addPass(createNVPTXLowerAllocaPass()); - addPass(createNVPTXFavorNonGenericAddrSpacesPass()); - // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leave - // them unused. We could remove dead code in an ad-hoc manner, but that - // requires manual work and might be error-prone. 
- addPass(createDeadCodeEliminationPass()); + if (UseInferAddressSpaces) { + addPass(createNVPTXInferAddressSpacesPass()); + } else { + addPass(createNVPTXFavorNonGenericAddrSpacesPass()); + // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leave + // them unused. We could remove dead code in an ad-hoc manner, but that + // requires manual work and might be error-prone. + addPass(createDeadCodeEliminationPass()); + } +} - // === Straight-line scalar optimizations === +void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() { addPass(createSeparateConstOffsetFromGEPPass()); addPass(createSpeculativeExecutionPass()); // ReassociateGEPs exposes more opportunites for SLSR. See @@ -208,6 +225,41 @@ void NVPTXPassConfig::addIRPasses() { // NaryReassociate on GEPs creates redundant common expressions, so run // EarlyCSE after it. addPass(createEarlyCSEPass()); +} + +void NVPTXPassConfig::addIRPasses() { + // The following passes are known to not play well with virtual regs hanging + // around after register allocation (which in our case, is *all* registers). + // We explicitly disable them here. We do, however, need some functionality + // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the + // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). + disablePass(&PrologEpilogCodeInserterID); + disablePass(&MachineCopyPropagationID); + disablePass(&TailDuplicateID); + disablePass(&StackMapLivenessID); + disablePass(&LiveDebugValuesID); + disablePass(&PostRASchedulerID); + disablePass(&FuncletLayoutID); + disablePass(&PatchableFunctionID); + + // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running + // it here does nothing. But since we need it for correctness when lowering + // to NVPTX, run it here too, in case whoever built our pass pipeline didn't + // call addEarlyAsPossiblePasses. + addPass(createNVVMReflectPass()); + + if (getOptLevel() != CodeGenOpt::None) + addPass(createNVPTXImageOptimizerPass()); + addPass(createNVPTXAssignValidGlobalNamesPass()); + addPass(createGenericToNVVMPass()); + + // NVPTXLowerKernelArgs is required for correctness and should be run right + // before the address space inference passes. + addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine())); + if (getOptLevel() != CodeGenOpt::None) { + addAddressSpaceInferencePasses(); + addStraightLineScalarOptimizationPasses(); + } // === LSR and other generic IR passes === TargetPassConfig::addIRPasses(); @@ -223,7 +275,8 @@ void NVPTXPassConfig::addIRPasses() { // %1 = shl %a, 2 // // but EarlyCSE can do neither of them. - addEarlyCSEOrGVNPass(); + if (getOptLevel() != CodeGenOpt::None) + addEarlyCSEOrGVNPass(); } bool NVPTXPassConfig::addInstSelector() { @@ -241,10 +294,12 @@ bool NVPTXPassConfig::addInstSelector() { void NVPTXPassConfig::addPostRegAlloc() { addPass(createNVPTXPrologEpilogPass(), false); - // NVPTXPrologEpilogPass calculates frame object offset and replace frame - // index with VRFrame register. NVPTXPeephole need to be run after that and - // will replace VRFrame with VRFrameLocal when possible. - addPass(createNVPTXPeephole()); + if (getOptLevel() != CodeGenOpt::None) { + // NVPTXPrologEpilogPass calculates frame object offset and replace frame + // index with VRFrame register. NVPTXPeephole need to be run after that and + // will replace VRFrame with VRFrameLocal when possible. 
+ addPass(createNVPTXPeephole()); + } } FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) { diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h index da7f62bf9d9b..78a053831772 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -16,9 +16,9 @@ #include "ManagedStringPool.h" #include "NVPTXSubtarget.h" +#include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSelectionDAGInfo.h" namespace llvm { @@ -36,8 +36,8 @@ class NVPTXTargetMachine : public LLVMTargetMachine { public: NVPTXTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OP, - bool is64bit); + Optional<Reloc::Model> RM, CodeModel::Model CM, + CodeGenOpt::Level OP, bool is64bit); ~NVPTXTargetMachine() override; const NVPTXSubtarget *getSubtargetImpl(const Function &) const override { @@ -61,6 +61,7 @@ public: return TLOF.get(); } + void addEarlyAsPossiblePasses(PassManagerBase &PM) override; TargetIRAnalysis getTargetIRAnalysis() override; }; // NVPTXTargetMachine. @@ -70,7 +71,7 @@ class NVPTXTargetMachine32 : public NVPTXTargetMachine { public: NVPTXTargetMachine32(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, + Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; @@ -79,7 +80,7 @@ class NVPTXTargetMachine64 : public NVPTXTargetMachine { public: NVPTXTargetMachine64(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, + Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h index 683b9a3f49f7..045fbb75a2a0 100644 --- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -12,7 +12,6 @@ #include "NVPTXSection.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include <string> namespace llvm { class GlobalVariable; @@ -87,7 +86,8 @@ public: } MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, - const Constant *C) const override { + const Constant *C, + unsigned &Align) const override { return ReadOnlySection; } diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 6e679dd0257c..580d345cc663 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -32,7 +32,7 @@ static bool readsThreadIndex(const IntrinsicInst *II) { } static bool readsLaneId(const IntrinsicInst *II) { - return II->getIntrinsicID() == Intrinsic::ptx_read_laneid; + return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid; } // Whether the given intrinsic is an atomic instruction in PTX. diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 0946a3293eec..08ffdf191151 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -52,6 +52,10 @@ public: bool isSourceOfDivergence(const Value *V); + // Increase the inlining cost threshold by a factor of 5, reflecting that + // calls are particularly expensive in NVPTX. 
+ unsigned getInliningThresholdMultiplier() { return 5; } + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp index 578b466568ae..835e4b442039 100644 --- a/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -99,7 +99,7 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) { } } -bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop, +bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop, unsigned &retval) { MutexGuard Guard(Lock); const Module *m = gv->getParent(); @@ -113,7 +113,7 @@ bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop, return true; } -bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, std::string prop, +bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop, std::vector<unsigned> &retval) { MutexGuard Guard(Lock); const Module *m = gv->getParent(); diff --git a/lib/Target/NVPTX/NVPTXUtilities.h b/lib/Target/NVPTX/NVPTXUtilities.h index a5262cb7412f..ec5bfc17afc7 100644 --- a/lib/Target/NVPTX/NVPTXUtilities.h +++ b/lib/Target/NVPTX/NVPTXUtilities.h @@ -30,8 +30,9 @@ namespace llvm { void clearAnnotationCache(const llvm::Module *); -bool findOneNVVMAnnotation(const llvm::GlobalValue *, std::string, unsigned &); -bool findAllNVVMAnnotation(const llvm::GlobalValue *, std::string, +bool findOneNVVMAnnotation(const llvm::GlobalValue *, const std::string &, + unsigned &); +bool findAllNVVMAnnotation(const llvm::GlobalValue *, const std::string &, std::vector<unsigned> &); bool isTexture(const llvm::Value &); diff --git a/lib/Target/NVPTX/NVVMIntrRange.cpp b/lib/Target/NVPTX/NVVMIntrRange.cpp new file mode 100644 index 000000000000..b9c02c431141 --- /dev/null +++ b/lib/Target/NVPTX/NVVMIntrRange.cpp @@ -0,0 +1,148 @@ +//===- NVVMIntrRange.cpp - Set !range metadata for NVVM intrinsics --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass adds appropriate !range metadata for calls to NVVM +// intrinsics that return a limited range of values. +// +//===----------------------------------------------------------------------===// + +#include "NVPTX.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Instructions.h" + +using namespace llvm; + +#define DEBUG_TYPE "nvvm-intr-range" + +namespace llvm { void initializeNVVMIntrRangePass(PassRegistry &); } + +// Add !range metadata based on limits of given SM variant. +static cl::opt<unsigned> NVVMIntrRangeSM("nvvm-intr-range-sm", cl::init(20), + cl::Hidden, cl::desc("SM variant")); + +namespace { +class NVVMIntrRange : public FunctionPass { + private: + struct { + unsigned x, y, z; + } MaxBlockSize, MaxGridSize; + + public: + static char ID; + NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {} + NVVMIntrRange(unsigned int SmVersion) : FunctionPass(ID) { + MaxBlockSize.x = 1024; + MaxBlockSize.y = 1024; + MaxBlockSize.z = 64; + + MaxGridSize.x = SmVersion >= 30 ? 
diff --git a/lib/Target/NVPTX/NVVMIntrRange.cpp b/lib/Target/NVPTX/NVVMIntrRange.cpp
new file mode 100644
index 000000000000..b9c02c431141
--- /dev/null
+++ b/lib/Target/NVPTX/NVVMIntrRange.cpp
@@ -0,0 +1,148 @@
+//===- NVVMIntrRange.cpp - Set !range metadata for NVVM intrinsics --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass adds appropriate !range metadata for calls to NVVM
+// intrinsics that return a limited range of values.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvvm-intr-range"
+
+namespace llvm { void initializeNVVMIntrRangePass(PassRegistry &); }
+
+// Add !range metadata based on limits of given SM variant.
+static cl::opt<unsigned> NVVMIntrRangeSM("nvvm-intr-range-sm", cl::init(20),
+                                         cl::Hidden, cl::desc("SM variant"));
+
+namespace {
+class NVVMIntrRange : public FunctionPass {
+ private:
+  struct {
+    unsigned x, y, z;
+  } MaxBlockSize, MaxGridSize;
+
+ public:
+  static char ID;
+  NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {}
+  NVVMIntrRange(unsigned int SmVersion) : FunctionPass(ID) {
+    MaxBlockSize.x = 1024;
+    MaxBlockSize.y = 1024;
+    MaxBlockSize.z = 64;
+
+    MaxGridSize.x = SmVersion >= 30 ? 0x7fffffff : 0xffff;
+    MaxGridSize.y = 0xffff;
+    MaxGridSize.z = 0xffff;
+
+    initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &) override;
+};
+}
+
+FunctionPass *llvm::createNVVMIntrRangePass(unsigned int SmVersion) {
+  return new NVVMIntrRange(SmVersion);
+}
+
+char NVVMIntrRange::ID = 0;
+INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range",
+                "Add !range metadata to NVVM intrinsics.", false, false)
+
+// Adds the passed-in [Low,High) range information as metadata to the
+// passed-in call instruction.
+static bool addRangeMetadata(uint64_t Low, uint64_t High, CallInst *C) {
+  LLVMContext &Context = C->getParent()->getContext();
+  IntegerType *Int32Ty = Type::getInt32Ty(Context);
+  Metadata *LowAndHigh[] = {
+      ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Low)),
+      ConstantAsMetadata::get(ConstantInt::get(Int32Ty, High))};
+  C->setMetadata(LLVMContext::MD_range, MDNode::get(Context, LowAndHigh));
+  return true;
+}
+
+bool NVVMIntrRange::runOnFunction(Function &F) {
+  // Go through the calls in this function.
+  bool Changed = false;
+  for (Instruction &I : instructions(F)) {
+    CallInst *Call = dyn_cast<CallInst>(&I);
+    if (!Call)
+      continue;
+
+    if (Function *Callee = Call->getCalledFunction()) {
+      switch (Callee->getIntrinsicID()) {
+      // Index within block
+      case Intrinsic::nvvm_read_ptx_sreg_tid_x:
+        Changed |= addRangeMetadata(0, MaxBlockSize.x, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_tid_y:
+        Changed |= addRangeMetadata(0, MaxBlockSize.y, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_tid_z:
+        Changed |= addRangeMetadata(0, MaxBlockSize.z, Call);
+        break;
+
+      // Block size
+      case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
+        Changed |= addRangeMetadata(1, MaxBlockSize.x+1, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
+        Changed |= addRangeMetadata(1, MaxBlockSize.y+1, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
+        Changed |= addRangeMetadata(1, MaxBlockSize.z+1, Call);
+        break;
+
+      // Index within grid
+      case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
+        Changed |= addRangeMetadata(0, MaxGridSize.x, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
+        Changed |= addRangeMetadata(0, MaxGridSize.y, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
+        Changed |= addRangeMetadata(0, MaxGridSize.z, Call);
+        break;
+
+      // Grid size
+      case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
+        Changed |= addRangeMetadata(1, MaxGridSize.x+1, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
+        Changed |= addRangeMetadata(1, MaxGridSize.y+1, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
+        Changed |= addRangeMetadata(1, MaxGridSize.z+1, Call);
+        break;
+
+      // warp size is constant 32.
+      case Intrinsic::nvvm_read_ptx_sreg_warpsize:
+        Changed |= addRangeMetadata(32, 32+1, Call);
+        break;
+
+      // Lane ID is [0..warpsize)
+      case Intrinsic::nvvm_read_ptx_sreg_laneid:
+        Changed |= addRangeMetadata(0, 32, Call);
+        break;
+
+      default:
+        break;
+      }
+    }
+  }
+
+  return Changed;
+}
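NVVMIntrRange is a plain FunctionPass, so it has to be scheduled explicitly. One plausible way to wire it (and NVVMReflect, below) into the addEarlyAsPossiblePasses hook declared earlier in this patch is sketched here; this is an assumption about how the hook might be filled in, not the committed body of NVPTXTargetMachine.cpp.

    #include "NVPTX.h"
    #include "NVPTXTargetMachine.h"
    #include "llvm/IR/LegacyPassManager.h"
    using namespace llvm;

    // Sketch only: run the reflect and !range passes as early as possible so
    // later IR optimizations can use the constants and range facts they leave
    // behind.
    void NVPTXTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
      PM.add(createNVVMReflectPass());
      PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
    }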
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index 20ab5db584d2..e0c35e7039e5 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -7,20 +7,26 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass replaces occurrences of __nvvm_reflect("string") with an
-// integer based on -nvvm-reflect-list string=<int> option given to this pass.
-// If an undefined string value is seen in a call to __nvvm_reflect("string"),
-// a default value of 0 will be used.
+// This pass replaces occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect
+// with an integer.
+//
+// We choose the value we use by looking, in this order, at:
+//
+// * the -nvvm-reflect-list flag, which has the format "foo=1,bar=42",
+// * the StringMap passed to the pass's constructor, and
+// * metadata in the module itself.
+//
+// If we see an unknown string, we replace its call with 0.
 //
 //===----------------------------------------------------------------------===//
 #include "NVPTX.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
@@ -31,11 +37,8 @@
 #include "llvm/Support/raw_os_ostream.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
-#include <map>
 #include <sstream>
 #include <string>
-#include <vector>
-
 #define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
 using namespace llvm;
@@ -45,31 +48,21 @@ using namespace llvm;
 namespace llvm { void initializeNVVMReflectPass(PassRegistry &); }
 namespace {
-class NVVMReflect : public ModulePass {
+class NVVMReflect : public FunctionPass {
 private:
   StringMap<int> VarMap;
-  typedef DenseMap<std::string, int>::iterator VarMapIter;
 public:
   static char ID;
-  NVVMReflect() : ModulePass(ID) {
-    initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
-    VarMap.clear();
-  }
+  NVVMReflect() : NVVMReflect(StringMap<int>()) {}
   NVVMReflect(const StringMap<int> &Mapping)
-      : ModulePass(ID) {
+      : FunctionPass(ID), VarMap(Mapping) {
     initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
-    for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end();
-         I != E; ++I) {
-      VarMap[(*I).getKey()] = (*I).getValue();
-    }
+    setVarMap();
   }
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesAll();
-  }
-  bool runOnModule(Module &) override;
+  bool runOnFunction(Function &) override;
 private:
   bool handleFunction(Function *ReflectFunction);
@@ -77,11 +70,8 @@ private:
 };
 }
-ModulePass *llvm::createNVVMReflectPass() {
-  return new NVVMReflect();
-}
-
-ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping) {
+FunctionPass *llvm::createNVVMReflectPass() { return new NVVMReflect(); }
+FunctionPass *llvm::createNVVMReflectPass(const StringMap<int> &Mapping) {
   return new NVVMReflect(Mapping);
 }
@@ -123,30 +113,35 @@ void NVVMReflect::setVarMap() {
   }
 }
-bool NVVMReflect::handleFunction(Function *ReflectFunction) {
-  // Validate _reflect function
-  assert(ReflectFunction->isDeclaration() &&
-         "_reflect function should not have a body");
-  assert(ReflectFunction->getReturnType()->isIntegerTy() &&
-         "_reflect's return type should be integer");
+bool NVVMReflect::runOnFunction(Function &F) {
+  if (!NVVMReflectEnabled)
+    return false;
+
+  if (F.getName() == NVVM_REFLECT_FUNCTION) {
+    assert(F.isDeclaration() && "_reflect function should not have a body");
+    assert(F.getReturnType()->isIntegerTy() &&
+           "_reflect's return type should be integer");
+    return false;
+  }
-  std::vector<Instruction *> ToRemove;
+  SmallVector<Instruction *, 4> ToRemove;
-  // Go through the uses of ReflectFunction in this Function.
-  // Each of them should a CallInst with a ConstantArray argument.
-  // First validate that. If the c-string corresponding to the
-  // ConstantArray can be found successfully, see if it can be
-  // found in VarMap. If so, replace the uses of CallInst with the
-  // value found in VarMap. If not, replace the use with value 0.
+  // Go through the calls in this function. Each call to __nvvm_reflect or
+  // llvm.nvvm.reflect should be a CallInst with a ConstantArray argument.
+  // First validate that. If the c-string corresponding to the ConstantArray can
+  // be found successfully, see if it can be found in VarMap. If so, replace the
+  // uses of CallInst with the value found in VarMap. If not, replace the use
+  // with value 0.
-  // IR for __nvvm_reflect calls differs between CUDA versions:
+  // The IR for __nvvm_reflect calls differs between CUDA versions.
+  //
   // CUDA 6.5 and earlier uses this sequence:
   //    %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8
   //        (i8 addrspace(4)* getelementptr inbounds
   //           ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0))
   //    %reflect = tail call i32 @__nvvm_reflect(i8* %ptr)
   //
-  // Value returned by Sym->getOperand(0) is a Constant with a
+  // The value returned by Sym->getOperand(0) is a Constant with a
   // ConstantDataSequential operand which can be converted to string and used
   // for lookup.
   //
@@ -157,31 +152,37 @@ bool NVVMReflect::handleFunction(Function *ReflectFunction) {
   //
   // In this case, we get a Constant with a GlobalVariable operand and we need
   // to dig deeper to find its initializer with the string we'll use for lookup.
-
-  for (User *U : ReflectFunction->users()) {
-    assert(isa<CallInst>(U) && "Only a call instruction can use _reflect");
-    CallInst *Reflect = cast<CallInst>(U);
-
-    assert((Reflect->getNumOperands() == 2) &&
-           "Only one operand expect for _reflect function");
-    // In cuda, we will have an extra constant-to-generic conversion of
-    // the string.
-    const Value *Str = Reflect->getArgOperand(0);
-    if (isa<CallInst>(Str)) {
-      // CUDA path
-      const CallInst *ConvCall = cast<CallInst>(Str);
+  for (Instruction &I : instructions(F)) {
+    CallInst *Call = dyn_cast<CallInst>(&I);
+    if (!Call)
+      continue;
+    Function *Callee = Call->getCalledFunction();
+    if (!Callee || (Callee->getName() != NVVM_REFLECT_FUNCTION &&
+                    Callee->getIntrinsicID() != Intrinsic::nvvm_reflect))
+      continue;
+
+    // FIXME: Improve error handling here and elsewhere in this pass.
+    assert(Call->getNumOperands() == 2 &&
+           "Wrong number of operands to __nvvm_reflect function");
+
+    // In cuda 6.5 and earlier, we will have an extra constant-to-generic
+    // conversion of the string.
+    const Value *Str = Call->getArgOperand(0);
+    if (const CallInst *ConvCall = dyn_cast<CallInst>(Str)) {
+      // FIXME: Add assertions about ConvCall.
       Str = ConvCall->getArgOperand(0);
     }
     assert(isa<ConstantExpr>(Str) &&
-           "Format of _reflect function not recognized");
+           "Format of __nvvm__reflect function not recognized");
     const ConstantExpr *GEP = cast<ConstantExpr>(Str);
     const Value *Sym = GEP->getOperand(0);
-    assert(isa<Constant>(Sym) && "Format of _reflect function not recognized");
+    assert(isa<Constant>(Sym) &&
+           "Format of __nvvm_reflect function not recognized");
     const Value *Operand = cast<Constant>(Sym)->getOperand(0);
     if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Operand)) {
-      // For CUDA-7.0 style __nvvm_reflect calls we need to find operand's
+      // For CUDA-7.0 style __nvvm_reflect calls, we need to find the operand's
       // initializer.
       assert(GV->hasInitializer() &&
              "Format of _reflect function not recognized");
@@ -194,57 +195,26 @@ bool NVVMReflect::handleFunction(Function *ReflectFunction) {
     assert(cast<ConstantDataSequential>(Operand)->isCString() &&
            "Format of _reflect function not recognized");
-    std::string ReflectArg =
-        cast<ConstantDataSequential>(Operand)->getAsString();
-
+    StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString();
     ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
     DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
     int ReflectVal = 0; // The default value is 0
-    if (VarMap.find(ReflectArg) != VarMap.end()) {
-      ReflectVal = VarMap[ReflectArg];
-    }
-    Reflect->replaceAllUsesWith(
-        ConstantInt::get(Reflect->getType(), ReflectVal));
-    ToRemove.push_back(Reflect);
-  }
-  if (ToRemove.size() == 0)
-    return false;
-
-  for (unsigned i = 0, e = ToRemove.size(); i != e; ++i)
-    ToRemove[i]->eraseFromParent();
-  return true;
-}
-
-bool NVVMReflect::runOnModule(Module &M) {
-  if (!NVVMReflectEnabled)
-    return false;
-
-  setVarMap();
-
-
-  bool Res = false;
-  std::string Name;
-  Type *Tys[1];
-  Type *I8Ty = Type::getInt8Ty(M.getContext());
-  Function *ReflectFunction;
-
-  // Check for standard overloaded versions of llvm.nvvm.reflect
-
-  for (unsigned i = 0; i != 5; ++i) {
-    Tys[0] = PointerType::get(I8Ty, i);
-    Name = Intrinsic::getName(Intrinsic::nvvm_reflect, Tys);
-    ReflectFunction = M.getFunction(Name);
-    if(ReflectFunction != 0) {
-      Res |= handleFunction(ReflectFunction);
+    auto Iter = VarMap.find(ReflectArg);
+    if (Iter != VarMap.end())
+      ReflectVal = Iter->second;
+    else if (ReflectArg == "__CUDA_FTZ") {
+      // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag.
+      if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
+              F.getParent()->getModuleFlag("nvvm-reflect-ftz")))
+        ReflectVal = Flag->getSExtValue();
     }
+    Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
+    ToRemove.push_back(Call);
   }
-  ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION);
-  // If reflect function is not used, then there will be
-  // no entry in the module.
-  if (ReflectFunction != 0)
-    Res |= handleFunction(ReflectFunction);
+  for (Instruction *I : ToRemove)
+    I->eraseFromParent();
-  return Res;
+  return ToRemove.size() > 0;
 }
diff --git a/lib/Target/NVPTX/TargetInfo/Makefile b/lib/Target/NVPTX/TargetInfo/Makefile
deleted file mode 100644
index 8622315b47b9..000000000000
--- a/lib/Target/NVPTX/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/NVPTX/TargetInfo/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMNVPTXInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
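The "__CUDA_FTZ" fallback in the NVVMReflect change above reads a module flag rather than a pass parameter, so a front end that wants flush-denormals-to-zero behaviour only has to record its choice once per module. A minimal sketch of that producer side follows; the helper name is hypothetical, while "nvvm-reflect-ftz" is the flag key the pass queries.

    #include "llvm/IR/Module.h"
    using namespace llvm;

    // Hypothetical front-end helper: record the FTZ choice as module metadata
    // so NVVMReflect can resolve __nvvm_reflect("__CUDA_FTZ") without an
    // explicit StringMap entry.
    static void recordCudaFtz(Module &M, bool FlushDenormalsToZero) {
      M.addModuleFlag(Module::Override, "nvvm-reflect-ftz",
                      FlushDenormalsToZero ? 1 : 0);
    }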