path: root/lib/Target/NVPTX
Diffstat (limited to 'lib/Target/NVPTX')
-rw-r--r--  lib/Target/NVPTX/CMakeLists.txt                      |    2
-rw-r--r--  lib/Target/NVPTX/InstPrinter/Makefile                |   15
-rw-r--r--  lib/Target/NVPTX/MCTargetDesc/Makefile               |   16
-rw-r--r--  lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp     |    7
-rw-r--r--  lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp  |   16
-rw-r--r--  lib/Target/NVPTX/Makefile                            |   23
-rw-r--r--  lib/Target/NVPTX/NVPTX.h                             |    8
-rw-r--r--  lib/Target/NVPTX/NVPTX.td                            |   14
-rw-r--r--  lib/Target/NVPTX/NVPTXAsmPrinter.cpp                 |   52
-rw-r--r--  lib/Target/NVPTX/NVPTXAsmPrinter.h                   |    6
-rw-r--r--  lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp  |    7
-rw-r--r--  lib/Target/NVPTX/NVPTXFrameLowering.cpp              |    7
-rw-r--r--  lib/Target/NVPTX/NVPTXFrameLowering.h                |    2
-rw-r--r--  lib/Target/NVPTX/NVPTXGenericToNVVM.cpp              |   16
-rw-r--r--  lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp               |  528
-rw-r--r--  lib/Target/NVPTX/NVPTXISelDAGToDAG.h                 |   42
-rw-r--r--  lib/Target/NVPTX/NVPTXISelLowering.cpp               |  126
-rw-r--r--  lib/Target/NVPTX/NVPTXISelLowering.h                 |   20
-rw-r--r--  lib/Target/NVPTX/NVPTXImageOptimizer.cpp             |    3
-rw-r--r--  lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp         |  586
-rw-r--r--  lib/Target/NVPTX/NVPTXInstrInfo.cpp                  |   61
-rw-r--r--  lib/Target/NVPTX/NVPTXInstrInfo.h                    |   19
-rw-r--r--  lib/Target/NVPTX/NVPTXInstrInfo.td                   | 3025
-rw-r--r--  lib/Target/NVPTX/NVPTXIntrinsics.td                  |  349
-rw-r--r--  lib/Target/NVPTX/NVPTXLowerAlloca.cpp                |    3
-rw-r--r--  lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp            |    2
-rw-r--r--  lib/Target/NVPTX/NVPTXMCExpr.cpp                     |    4
-rw-r--r--  lib/Target/NVPTX/NVPTXMCExpr.h                       |    9
-rw-r--r--  lib/Target/NVPTX/NVPTXPeephole.cpp                   |    3
-rw-r--r--  lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp           |    9
-rw-r--r--  lib/Target/NVPTX/NVPTXSection.h                      |    1
-rw-r--r--  lib/Target/NVPTX/NVPTXSubtarget.h                    |    6
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetMachine.cpp              |  129
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetMachine.h                |   11
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetObjectFile.h             |    4
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp        |    2
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetTransformInfo.h          |    4
-rw-r--r--  lib/Target/NVPTX/NVPTXUtilities.cpp                  |    4
-rw-r--r--  lib/Target/NVPTX/NVPTXUtilities.h                    |    5
-rw-r--r--  lib/Target/NVPTX/NVVMIntrRange.cpp                   |  148
-rw-r--r--  lib/Target/NVPTX/NVVMReflect.cpp                     |  178
-rw-r--r--  lib/Target/NVPTX/TargetInfo/Makefile                 |   15
42 files changed, 3175 insertions(+), 2312 deletions(-)
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
index 05fe06dbc07c..b67c40500861 100644
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt
@@ -18,6 +18,7 @@ set(NVPTXCodeGen_sources
NVPTXISelDAGToDAG.cpp
NVPTXISelLowering.cpp
NVPTXImageOptimizer.cpp
+ NVPTXInferAddressSpaces.cpp
NVPTXInstrInfo.cpp
NVPTXLowerAggrCopies.cpp
NVPTXLowerKernelArgs.cpp
@@ -31,6 +32,7 @@ set(NVPTXCodeGen_sources
NVPTXTargetMachine.cpp
NVPTXTargetTransformInfo.cpp
NVPTXUtilities.cpp
+ NVVMIntrRange.cpp
NVVMReflect.cpp
)
diff --git a/lib/Target/NVPTX/InstPrinter/Makefile b/lib/Target/NVPTX/InstPrinter/Makefile
deleted file mode 100644
index 7b7865436bf3..000000000000
--- a/lib/Target/NVPTX/InstPrinter/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/NVPTX/AsmPrinter/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMNVPTXAsmPrinter
-
-# Hack: we need to include 'main' ptx target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/NVPTX/MCTargetDesc/Makefile b/lib/Target/NVPTX/MCTargetDesc/Makefile
deleted file mode 100644
index 31d06cb5948d..000000000000
--- a/lib/Target/NVPTX/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/NVPTX/TargetDesc/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMNVPTXDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index ef36c13b49f1..78bdf4e698d8 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -34,13 +34,16 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
HasSingleParameterDotFile = false;
- InlineAsmStart = " inline asm";
- InlineAsmEnd = " inline asm";
+ InlineAsmStart = " begin inline asm";
+ InlineAsmEnd = " end inline asm";
SupportsDebugInformation = CompileForDebugging;
// PTX does not allow .align on functions.
HasFunctionAlignment = false;
HasDotTypeDotSizeDirective = false;
+ // PTX does not allow .hidden or .protected
+ HiddenDeclarationVisibilityAttr = HiddenVisibilityAttr = MCSA_Invalid;
+ ProtectedVisibilityAttr = MCSA_Invalid;
Data8bitsDirective = " .b8 ";
Data16bitsDirective = " .b16 ";
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index ad7302037cad..e356a965a04b 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -14,7 +14,6 @@
#include "NVPTXMCTargetDesc.h"
#include "InstPrinter/NVPTXInstPrinter.h"
#include "NVPTXMCAsmInfo.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -49,18 +48,6 @@ createNVPTXMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
return createNVPTXMCSubtargetInfoImpl(TT, CPU, FS);
}
-static MCCodeGenInfo *createNVPTXMCCodeGenInfo(const Triple &TT,
- Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
-
- // The default relocation model is used regardless of what the client has
- // specified, as it is the only relocation model currently supported.
- X->initMCCodeGenInfo(Reloc::Default, CM, OL);
- return X;
-}
-
static MCInstPrinter *createNVPTXMCInstPrinter(const Triple &T,
unsigned SyntaxVariant,
const MCAsmInfo &MAI,
@@ -77,9 +64,6 @@ extern "C" void LLVMInitializeNVPTXTargetMC() {
// Register the MC asm info.
RegisterMCAsmInfo<NVPTXMCAsmInfo> X(*T);
- // Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(*T, createNVPTXMCCodeGenInfo);
-
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createNVPTXMCInstrInfo);
diff --git a/lib/Target/NVPTX/Makefile b/lib/Target/NVPTX/Makefile
deleted file mode 100644
index 8db20ebed2c2..000000000000
--- a/lib/Target/NVPTX/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-##===- lib/Target/NVPTX/Makefile ---------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMNVPTXCodeGen
-TARGET = NVPTX
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = NVPTXGenAsmWriter.inc \
- NVPTXGenDAGISel.inc \
- NVPTXGenInstrInfo.inc \
- NVPTXGenRegisterInfo.inc \
- NVPTXGenSubtargetInfo.inc
-
-DIRS = InstPrinter TargetInfo MCTargetDesc
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index e5fae85bacf2..e91385ac13f2 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -46,8 +46,10 @@ FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
ModulePass *createNVPTXAssignValidGlobalNamesPass();
ModulePass *createGenericToNVVMPass();
FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass();
-ModulePass *createNVVMReflectPass();
-ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
+FunctionPass *createNVPTXInferAddressSpacesPass();
+FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
+FunctionPass *createNVVMReflectPass();
+FunctionPass *createNVVMReflectPass(const StringMap<int> &Mapping);
MachineFunctionPass *createNVPTXPrologEpilogPass();
MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
FunctionPass *createNVPTXImageOptimizerPass();
@@ -55,8 +57,6 @@ FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM);
BasicBlockPass *createNVPTXLowerAllocaPass();
MachineFunctionPass *createNVPTXPeephole();
-bool isImageOrSamplerVal(const Value *, const Module *);
-
extern Target TheNVPTXTarget32;
extern Target TheNVPTXTarget64;
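Note (not part of the patch): the header above now exposes FunctionPass factories for the new NVVMIntrRange and NVPTXInferAddressSpaces passes, and NVVMReflect changes from a ModulePass to a FunctionPass. A minimal sketch of stringing these factories together with the legacy pass manager follows; the helper name and ordering are illustrative assumptions, and the real wiring lives in NVPTXTargetMachine.cpp:

    // Illustrative sketch only; assumes the factory functions declared above.
    #include "NVPTX.h"
    #include "llvm/IR/LegacyPassManager.h"
    using namespace llvm;

    static void addExampleNVVMIRPasses(legacy::PassManagerBase &PM,
                                       unsigned SmVersion) {
      PM.add(createNVVMReflectPass());             // fold __nvvm_reflect calls (now a FunctionPass)
      PM.add(createNVVMIntrRangePass(SmVersion));  // new pass added by this change
      PM.add(createNVPTXInferAddressSpacesPass()); // new pass added by this change
    }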
diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td
index 96abfa859119..032991a20cc9 100644
--- a/lib/Target/NVPTX/NVPTX.td
+++ b/lib/Target/NVPTX/NVPTX.td
@@ -44,6 +44,12 @@ def SM52 : SubtargetFeature<"sm_52", "SmVersion", "52",
"Target SM 5.2">;
def SM53 : SubtargetFeature<"sm_53", "SmVersion", "53",
"Target SM 5.3">;
+def SM60 : SubtargetFeature<"sm_60", "SmVersion", "60",
+ "Target SM 6.0">;
+def SM61 : SubtargetFeature<"sm_61", "SmVersion", "61",
+ "Target SM 6.1">;
+def SM62 : SubtargetFeature<"sm_62", "SmVersion", "62",
+ "Target SM 6.2">;
// PTX Versions
def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
@@ -54,6 +60,10 @@ def PTX41 : SubtargetFeature<"ptx41", "PTXVersion", "41",
"Use PTX version 4.1">;
def PTX42 : SubtargetFeature<"ptx42", "PTXVersion", "42",
"Use PTX version 4.2">;
+def PTX43 : SubtargetFeature<"ptx43", "PTXVersion", "43",
+ "Use PTX version 4.3">;
+def PTX50 : SubtargetFeature<"ptx50", "PTXVersion", "50",
+ "Use PTX version 5.0">;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
@@ -71,7 +81,9 @@ def : Proc<"sm_37", [SM37, PTX41]>;
def : Proc<"sm_50", [SM50, PTX40]>;
def : Proc<"sm_52", [SM52, PTX41]>;
def : Proc<"sm_53", [SM53, PTX42]>;
-
+def : Proc<"sm_60", [SM60, PTX50]>;
+def : Proc<"sm_61", [SM61, PTX50]>;
+def : Proc<"sm_62", [SM62, PTX50]>;
def NVPTXInstrInfo : InstrInfo {
}
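Note (not part of the patch): the new sm_60/sm_61/sm_62 processors reuse the existing SmVersion subtarget field, paired with the new PTX50 feature. A hedged example of the kind of version check backend code already builds on that field; the helper below is illustrative only, mirroring the existing pattern (e.g. getSmVersion() >= 20 in NVPTXAsmPrinter.cpp):

    // Illustrative sketch only: gate a feature on the SM version set by the
    // SM60/SM61/SM62 subtarget features defined above.
    #include "NVPTXSubtarget.h"

    static bool isSM60OrLater(const llvm::NVPTXSubtarget &ST) {
      return ST.getSmVersion() >= 60;
    }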
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index e8c36089a779..660016bfcd05 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -117,7 +117,7 @@ void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
if (ignoreLoc(MI))
return;
- DebugLoc curLoc = MI.getDebugLoc();
+ const DebugLoc &curLoc = MI.getDebugLoc();
if (!prevDebugLoc && !curLoc)
return;
@@ -277,7 +277,7 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
break;
case MachineOperand::MO_FPImmediate: {
const ConstantFP *Cnt = MO.getFPImm();
- APFloat Val = Cnt->getValueAPF();
+ const APFloat &Val = Cnt->getValueAPF();
switch (Cnt->getType()->getTypeID()) {
default: report_fatal_error("Unsupported FP type"); break;
@@ -432,7 +432,8 @@ bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll(
continue;
}
if (const BasicBlock *PBB = PMBB->getBasicBlock()) {
- if (MDNode *LoopID = PBB->getTerminator()->getMetadata("llvm.loop")) {
+ if (MDNode *LoopID =
+ PBB->getTerminator()->getMetadata(LLVMContext::MD_loop)) {
if (GetUnrollMetadata(LoopID, "llvm.loop.unroll.disable"))
return true;
}
@@ -798,10 +799,18 @@ void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) {
if (filenameMap.find(Filename) != filenameMap.end())
continue;
filenameMap[Filename] = i;
+ OutStreamer->EmitDwarfFileDirective(i, "", Filename);
++i;
}
}
+static bool isEmptyXXStructor(GlobalVariable *GV) {
+ if (!GV) return true;
+ const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+ if (!InitList) return true; // Not an array; we don't know how to parse.
+ return InitList->getNumOperands() == 0;
+}
+
bool NVPTXAsmPrinter::doInitialization(Module &M) {
// Construct a default subtarget off of the TargetMachine defaults. The
// rest of NVPTX isn't friendly to change subtargets per function and
@@ -812,6 +821,21 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
const NVPTXSubtarget STI(TT, CPU, FS, NTM);
+ if (M.alias_size()) {
+ report_fatal_error("Module has aliases, which NVPTX does not support.");
+ return true; // error
+ }
+ if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_ctors"))) {
+ report_fatal_error(
+ "Module has a nontrivial global ctor, which NVPTX does not support.");
+ return true; // error
+ }
+ if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_dtors"))) {
+ report_fatal_error(
+ "Module has a nontrivial global dtor, which NVPTX does not support.");
+ return true; // error
+ }
+
SmallString<128> Str1;
raw_svector_ostream OS1(Str1);
@@ -1017,7 +1041,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
// Skip meta data
if (GVar->hasSection()) {
- if (GVar->getSection() == StringRef("llvm.metadata"))
+ if (GVar->getSection() == "llvm.metadata")
return;
}
@@ -1030,7 +1054,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
// GlobalVariables are always constant pointers themselves.
PointerType *PTy = GVar->getType();
- Type *ETy = PTy->getElementType();
+ Type *ETy = GVar->getValueType();
if (GVar->hasExternalLinkage()) {
if (GVar->hasInitializer())
@@ -1341,11 +1365,10 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
const DataLayout &DL = getDataLayout();
// GlobalVariables are always constant pointers themselves.
- PointerType *PTy = GVar->getType();
- Type *ETy = PTy->getElementType();
+ Type *ETy = GVar->getValueType();
O << ".";
- emitPTXAddressSpace(PTy->getAddressSpace(), O);
+ emitPTXAddressSpace(GVar->getType()->getAddressSpace(), O);
if (GVar->getAlignment() == 0)
O << " .align " << (int)DL.getPrefTypeAlignment(ETy);
else
@@ -1429,6 +1452,11 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
bool isABI = (nvptxSubtarget->getSmVersion() >= 20);
MVT thePointerTy = TLI->getPointerTy(DL);
+ if (F->arg_empty()) {
+ O << "()\n";
+ return;
+ }
+
O << "(\n";
for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, paramIndex++) {
@@ -1715,9 +1743,8 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
return;
}
if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
- PointerType *PTy = dyn_cast<PointerType>(GVar->getType());
bool IsNonGenericPointer = false;
- if (PTy && PTy->getAddressSpace() != 0) {
+ if (GVar->getType()->getAddressSpace() != 0) {
IsNonGenericPointer = true;
}
if (EmitGeneric && !isa<Function>(CPV) && !IsNonGenericPointer) {
@@ -1883,8 +1910,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
case Type::ArrayTyID:
case Type::VectorTyID:
case Type::StructTyID: {
- if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) ||
- isa<ConstantStruct>(CPV) || isa<ConstantDataSequential>(CPV)) {
+ if (isa<ConstantAggregate>(CPV) || isa<ConstantDataSequential>(CPV)) {
int ElementSize = DL.getTypeAllocSize(CPV->getType());
bufferAggregateConstant(CPV, aggBuffer);
if (Bytes > ElementSize)
@@ -2315,7 +2341,7 @@ void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) {
this->OutStreamer->EmitRawText(temp.str());
}
-LineReader *NVPTXAsmPrinter::getReader(std::string filename) {
+LineReader *NVPTXAsmPrinter::getReader(const std::string &filename) {
if (!reader) {
reader = new LineReader(filename);
}
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 76bf179896a8..85660fbdb26e 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -18,14 +18,14 @@
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetMachine.h"
#include <fstream>
@@ -293,7 +293,7 @@ private:
bool isLoopHeaderOfNoUnroll(const MachineBasicBlock &MBB) const;
LineReader *reader;
- LineReader *getReader(std::string);
+ LineReader *getReader(const std::string &);
// Used to control the need to emit .generic() in the initializer of
// module scope variables.
diff --git a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
index 95813c8430d1..7c5a54162d77 100644
--- a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
+++ b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
@@ -7,6 +7,9 @@
//
//===----------------------------------------------------------------------===//
//
+// FIXME: This pass is deprecated in favor of NVPTXInferAddressSpaces, which
+// uses a new algorithm that handles pointer induction variables.
+//
// When a load/store accesses the generic address space, checks whether the
// address is casted from a non-generic address space. If so, remove this
// addrspacecast because accessing non-generic address spaces is typically
@@ -164,8 +167,8 @@ Value *NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(
GEP->getSourceElementType(), Cast->getOperand(0), Indices,
"", GEPI);
NewGEP->setIsInBounds(GEP->isInBounds());
+ NewGEP->takeName(GEP);
NewASC = new AddrSpaceCastInst(NewGEP, GEP->getType(), "", GEPI);
- NewASC->takeName(GEP);
// Without RAUWing GEP, the compiler would visit GEP again and emit
// redundant instructions. This is exercised in test @rauw in
// access-non-generic.ll.
@@ -263,7 +266,7 @@ bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
}
bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
- if (DisableFavorNonGeneric)
+ if (DisableFavorNonGeneric || skipFunction(F))
return false;
bool Changed = false;
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 9b34aef3fdec..bbcb497ead9d 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -16,7 +16,6 @@
#include "NVPTXRegisterInfo.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -35,7 +34,7 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
if (MF.getFrameInfo()->hasStackObjects()) {
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
- MachineInstr *MI = MBB.begin();
+ MachineInstr *MI = &MBB.front();
MachineRegisterInfo &MR = MF.getRegInfo();
// This instruction really occurs before first instruction
@@ -70,10 +69,10 @@ void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF,
// This function eliminates ADJCALLSTACKDOWN,
// ADJCALLSTACKUP pseudo instructions
-void NVPTXFrameLowering::eliminateCallFramePseudoInstr(
+MachineBasicBlock::iterator NVPTXFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
// Simply discard ADJCALLSTACKDOWN,
// ADJCALLSTACKUP instructions.
- MBB.erase(I);
+ return MBB.erase(I);
}
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h
index 14f8bb7b98fe..320ca9a2f095 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -26,7 +26,7 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- void
+ MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const override;
};
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 62ca5e9f9f62..66a964082c5f 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -86,7 +86,7 @@ bool GenericToNVVM::runOnModule(Module &M) {
!llvm::isTexture(*GV) && !llvm::isSurface(*GV) &&
!llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) {
GlobalVariable *NewGV = new GlobalVariable(
- M, GV->getType()->getElementType(), GV->isConstant(),
+ M, GV->getValueType(), GV->isConstant(),
GV->getLinkage(),
GV->hasInitializer() ? GV->getInitializer() : nullptr,
"", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL);
@@ -172,7 +172,7 @@ Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
// See if the address space conversion requires the operand to be bitcast
// to i8 addrspace(n)* first.
- EVT ExtendedGVType = EVT::getEVT(GVType->getElementType(), true);
+ EVT ExtendedGVType = EVT::getEVT(GV->getValueType(), true);
if (!ExtendedGVType.isInteger() && !ExtendedGVType.isFloatingPoint()) {
// A bitcast to i8 addrspace(n)* on the operand is needed.
LLVMContext &Context = M->getContext();
@@ -182,21 +182,18 @@ Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
// Insert the address space conversion.
Type *ResultType =
PointerType::get(Type::getInt8Ty(Context), llvm::ADDRESS_SPACE_GENERIC);
- SmallVector<Type *, 2> ParamTypes;
- ParamTypes.push_back(ResultType);
- ParamTypes.push_back(DestTy);
Function *CVTAFunction = Intrinsic::getDeclaration(
- M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+ M, Intrinsic::nvvm_ptr_global_to_gen, {ResultType, DestTy});
CVTA = Builder.CreateCall(CVTAFunction, CVTA, "cvta");
// Another bitcast from i8 * to <the element type of GVType> * is
// required.
DestTy =
- PointerType::get(GVType->getElementType(), llvm::ADDRESS_SPACE_GENERIC);
+ PointerType::get(GV->getValueType(), llvm::ADDRESS_SPACE_GENERIC);
CVTA = Builder.CreateBitCast(CVTA, DestTy, "cvta");
} else {
// A simple CVTA is enough.
SmallVector<Type *, 2> ParamTypes;
- ParamTypes.push_back(PointerType::get(GVType->getElementType(),
+ ParamTypes.push_back(PointerType::get(GV->getValueType(),
llvm::ADDRESS_SPACE_GENERIC));
ParamTypes.push_back(GVType);
Function *CVTAFunction = Intrinsic::getDeclaration(
@@ -230,8 +227,7 @@ Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C,
if (I != GVMap.end()) {
NewValue = getOrInsertCVTA(M, F, I->second, Builder);
}
- } else if (isa<ConstantVector>(C) || isa<ConstantArray>(C) ||
- isa<ConstantStruct>(C)) {
+ } else if (isa<ConstantAggregate>(C)) {
// If any element in the constant vector or aggregate C is or uses a global
// variable in GVMap, the constant C needs to be reconstructed, using a set
// of instructions.
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 2d0098b392f4..61c6758ef118 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -105,57 +105,66 @@ bool NVPTXDAGToDAGISel::allowFMA() const {
/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
-SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
+void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (N->isMachineOpcode()) {
N->setNodeId(-1);
- return nullptr; // Already selected.
+ return; // Already selected.
}
- SDNode *ResNode = nullptr;
switch (N->getOpcode()) {
case ISD::LOAD:
- ResNode = SelectLoad(N);
+ if (tryLoad(N))
+ return;
break;
case ISD::STORE:
- ResNode = SelectStore(N);
+ if (tryStore(N))
+ return;
break;
case NVPTXISD::LoadV2:
case NVPTXISD::LoadV4:
- ResNode = SelectLoadVector(N);
+ if (tryLoadVector(N))
+ return;
break;
case NVPTXISD::LDGV2:
case NVPTXISD::LDGV4:
case NVPTXISD::LDUV2:
case NVPTXISD::LDUV4:
- ResNode = SelectLDGLDU(N);
+ if (tryLDGLDU(N))
+ return;
break;
case NVPTXISD::StoreV2:
case NVPTXISD::StoreV4:
- ResNode = SelectStoreVector(N);
+ if (tryStoreVector(N))
+ return;
break;
case NVPTXISD::LoadParam:
case NVPTXISD::LoadParamV2:
case NVPTXISD::LoadParamV4:
- ResNode = SelectLoadParam(N);
+ if (tryLoadParam(N))
+ return;
break;
case NVPTXISD::StoreRetval:
case NVPTXISD::StoreRetvalV2:
case NVPTXISD::StoreRetvalV4:
- ResNode = SelectStoreRetval(N);
+ if (tryStoreRetval(N))
+ return;
break;
case NVPTXISD::StoreParam:
case NVPTXISD::StoreParamV2:
case NVPTXISD::StoreParamV4:
case NVPTXISD::StoreParamS32:
case NVPTXISD::StoreParamU32:
- ResNode = SelectStoreParam(N);
+ if (tryStoreParam(N))
+ return;
break;
case ISD::INTRINSIC_WO_CHAIN:
- ResNode = SelectIntrinsicNoChain(N);
+ if (tryIntrinsicNoChain(N))
+ return;
break;
case ISD::INTRINSIC_W_CHAIN:
- ResNode = SelectIntrinsicChain(N);
+ if (tryIntrinsicChain(N))
+ return;
break;
case NVPTXISD::Tex1DFloatS32:
case NVPTXISD::Tex1DFloatFloat:
@@ -325,7 +334,8 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
case NVPTXISD::Tld4UnifiedG2DU64Float:
case NVPTXISD::Tld4UnifiedB2DU64Float:
case NVPTXISD::Tld4UnifiedA2DU64Float:
- ResNode = SelectTextureIntrinsic(N);
+ if (tryTextureIntrinsic(N))
+ return;
break;
case NVPTXISD::Suld1DI8Clamp:
case NVPTXISD::Suld1DI16Clamp:
@@ -492,37 +502,37 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
case NVPTXISD::Suld3DV4I8Zero:
case NVPTXISD::Suld3DV4I16Zero:
case NVPTXISD::Suld3DV4I32Zero:
- ResNode = SelectSurfaceIntrinsic(N);
+ if (trySurfaceIntrinsic(N))
+ return;
break;
case ISD::AND:
case ISD::SRA:
case ISD::SRL:
// Try to select BFE
- ResNode = SelectBFE(N);
+ if (tryBFE(N))
+ return;
break;
case ISD::ADDRSPACECAST:
- ResNode = SelectAddrSpaceCast(N);
- break;
+ SelectAddrSpaceCast(N);
+ return;
default:
break;
}
- if (ResNode)
- return ResNode;
- return SelectCode(N);
+ SelectCode(N);
}
-SDNode *NVPTXDAGToDAGISel::SelectIntrinsicChain(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IID) {
default:
- return NULL;
+ return false;
case Intrinsic::nvvm_ldg_global_f:
case Intrinsic::nvvm_ldg_global_i:
case Intrinsic::nvvm_ldg_global_p:
case Intrinsic::nvvm_ldu_global_f:
case Intrinsic::nvvm_ldu_global_i:
case Intrinsic::nvvm_ldu_global_p:
- return SelectLDGLDU(N);
+ return tryLDGLDU(N);
}
}
@@ -579,25 +589,26 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
switch (IID) {
default:
- return nullptr;
+ return false;
case Intrinsic::nvvm_texsurf_handle_internal:
- return SelectTexSurfHandle(N);
+ SelectTexSurfHandle(N);
+ return true;
}
}
-SDNode *NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
+void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
// Op 0 is the intrinsic ID
SDValue Wrapper = N->getOperand(1);
SDValue GlobalVal = Wrapper.getOperand(0);
- return CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N), MVT::i64,
- GlobalVal);
+ ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
+ MVT::i64, GlobalVal));
}
-SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
+void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
SDValue Src = N->getOperand(0);
AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
@@ -624,7 +635,9 @@ SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
Opc = TM.is64Bit() ? NVPTX::cvta_local_yes_64 : NVPTX::cvta_local_yes;
break;
}
- return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
+ Src));
+ return;
} else {
// Generic to specific
if (SrcAddrSpace != 0)
@@ -653,11 +666,13 @@ SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
: NVPTX::nvvm_ptr_gen_to_param;
break;
}
- return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
+ Src));
+ return;
}
}
-SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
SDLoc dl(N);
LoadSDNode *LD = cast<LoadSDNode>(N);
EVT LoadedVT = LD->getMemoryVT();
@@ -665,16 +680,16 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
// do not support pre/post inc/dec
if (LD->isIndexed())
- return nullptr;
+ return false;
if (!LoadedVT.isSimple())
- return nullptr;
+ return false;
// Address Space Setting
unsigned int codeAddrSpace = getCodeAddrSpace(LD);
if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) {
- return SelectLDGLDU(N);
+ return tryLDGLDU(N);
}
// Volatile Setting
@@ -695,7 +710,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
else if (num == 4)
vecType = NVPTX::PTXLdStInstCode::V4;
else
- return nullptr;
+ return false;
}
// Type Setting: fromType + fromTypeWidth
@@ -744,7 +759,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
Opcode = NVPTX::LD_f64_avar;
break;
default:
- return nullptr;
+ return false;
}
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
@@ -772,7 +787,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
Opcode = NVPTX::LD_f64_asi;
break;
default:
- return nullptr;
+ return false;
}
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
@@ -801,7 +816,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
Opcode = NVPTX::LD_f64_ari_64;
break;
default:
- return nullptr;
+ return false;
}
} else {
switch (TargetVT) {
@@ -824,7 +839,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
Opcode = NVPTX::LD_f64_ari;
break;
default:
- return nullptr;
+ return false;
}
}
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
@@ -853,7 +868,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
Opcode = NVPTX::LD_f64_areg_64;
break;
default:
- return nullptr;
+ return false;
}
} else {
switch (TargetVT) {
@@ -876,7 +891,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
Opcode = NVPTX::LD_f64_areg;
break;
default:
- return nullptr;
+ return false;
}
}
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
@@ -885,16 +900,18 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
}
- if (NVPTXLD) {
- MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
- MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);
- }
+ if (!NVPTXLD)
+ return false;
+
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);
- return NVPTXLD;
+ ReplaceNode(N, NVPTXLD);
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -906,13 +923,13 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
EVT LoadedVT = MemSD->getMemoryVT();
if (!LoadedVT.isSimple())
- return nullptr;
+ return false;
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
- return SelectLDGLDU(N);
+ return tryLDGLDU(N);
}
// Volatile Setting
@@ -956,7 +973,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
VecType = NVPTX::PTXLdStInstCode::V4;
break;
default:
- return nullptr;
+ return false;
}
EVT EltVT = N->getValueType(0);
@@ -964,11 +981,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
if (SelectDirectAddr(Op1, Addr)) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::LoadV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v2_avar;
break;
@@ -992,7 +1009,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
case NVPTXISD::LoadV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v4_avar;
break;
@@ -1017,11 +1034,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
: SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::LoadV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v2_asi;
break;
@@ -1045,7 +1062,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
case NVPTXISD::LoadV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v4_asi;
break;
@@ -1071,11 +1088,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::LoadV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v2_ari_64;
break;
@@ -1099,7 +1116,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
case NVPTXISD::LoadV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v4_ari_64;
break;
@@ -1118,11 +1135,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
} else {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::LoadV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v2_ari;
break;
@@ -1146,7 +1163,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
case NVPTXISD::LoadV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v4_ari;
break;
@@ -1173,11 +1190,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::LoadV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v2_areg_64;
break;
@@ -1201,7 +1218,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
case NVPTXISD::LoadV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v4_areg_64;
break;
@@ -1220,11 +1237,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
} else {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::LoadV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v2_areg;
break;
@@ -1248,7 +1265,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
case NVPTXISD::LoadV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v4_areg;
break;
@@ -1276,17 +1293,18 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
- return LD;
+ ReplaceNode(N, LD);
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue Op1;
MemSDNode *Mem;
bool IsLDG = true;
- // If this is an LDG intrinsic, the address is the third operand. Its its an
+ // If this is an LDG intrinsic, the address is the third operand. If its an
// LDG/LDU SD node (from custom vector handling), then its the second operand
if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
Op1 = N->getOperand(2);
@@ -1294,7 +1312,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IID) {
default:
- return NULL;
+ return false;
case Intrinsic::nvvm_ldg_global_f:
case Intrinsic::nvvm_ldg_global_i:
case Intrinsic::nvvm_ldg_global_p:
@@ -1317,19 +1335,32 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
SDValue Base, Offset, Addr;
EVT EltVT = Mem->getMemoryVT();
+ unsigned NumElts = 1;
if (EltVT.isVector()) {
+ NumElts = EltVT.getVectorNumElements();
EltVT = EltVT.getVectorElementType();
}
+ // Build the "promoted" result VTList for the load. If we are really loading
+ // i8s, then the return type will be promoted to i16 since we do not expose
+ // 8-bit registers in NVPTX.
+ EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
+ SmallVector<EVT, 5> InstVTs;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ InstVTs.push_back(NodeVT);
+ }
+ InstVTs.push_back(MVT::Other);
+ SDVTList InstVTList = CurDAG->getVTList(InstVTs);
+
if (SelectDirectAddr(Op1, Addr)) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar;
break;
@@ -1352,7 +1383,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} else {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar;
break;
@@ -1377,7 +1408,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar;
break;
@@ -1401,7 +1432,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar;
break;
@@ -1425,7 +1456,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar;
break;
@@ -1443,7 +1474,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar;
break;
@@ -1461,19 +1492,19 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
}
SDValue Ops[] = { Addr, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
} else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
: SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64;
break;
@@ -1496,7 +1527,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} else {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64;
break;
@@ -1522,7 +1553,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64;
break;
@@ -1546,7 +1577,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64;
break;
@@ -1571,7 +1602,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64;
break;
@@ -1589,7 +1620,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64;
break;
@@ -1608,13 +1639,13 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} else {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari;
break;
@@ -1637,7 +1668,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} else {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari;
break;
@@ -1663,7 +1694,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32;
break;
@@ -1687,7 +1718,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32;
break;
@@ -1712,7 +1743,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32;
break;
@@ -1730,7 +1761,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32;
break;
@@ -1750,18 +1781,18 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
SDValue Ops[] = { Base, Offset, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
} else {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64;
break;
@@ -1784,7 +1815,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} else {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64;
break;
@@ -1810,7 +1841,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64;
break;
@@ -1834,7 +1865,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64;
break;
@@ -1859,7 +1890,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64;
break;
@@ -1877,7 +1908,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64;
break;
@@ -1896,13 +1927,13 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} else {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg;
break;
@@ -1925,7 +1956,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} else {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg;
break;
@@ -1951,7 +1982,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32;
break;
@@ -1975,7 +2006,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32;
break;
@@ -2000,7 +2031,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32;
break;
@@ -2018,7 +2049,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32;
break;
@@ -2037,17 +2068,54 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
}
SDValue Ops[] = { Op1, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
}
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
MemRefs0[0] = Mem->getMemOperand();
cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
- return LD;
+ // For automatic generation of LDG (through SelectLoad[Vector], not the
+ // intrinsics), we may have an extending load like:
+ //
+ // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
+ //
+ // In this case, the matching logic above will select a load for the original
+ // memory type (in this case, i8) and our types will not match (the node needs
+ // to return an i32 in this case). Our LDG/LDU nodes do not support the
+ // concept of sign-/zero-extension, so emulate it here by adding an explicit
+ // CVT instruction. Ptxas should clean up any redundancies here.
+
+ EVT OrigType = N->getValueType(0);
+ LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
+
+ if (OrigType != EltVT && LdNode) {
+ // We have an extending-load. The instruction we selected operates on the
+ // smaller type, but the SDNode we are replacing has the larger type. We
+ // need to emit a CVT to make the types match.
+ bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
+ unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
+ EltVT.getSimpleVT(), IsSigned);
+
+ // For each output value, apply the manual sign/zero-extension and make sure
+ // all users of the load go through that CVT.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue Res(LD, i);
+ SDValue OrigVal(N, i);
+
+ SDNode *CvtNode =
+ CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
+ CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
+ DL, MVT::i32));
+ ReplaceUses(OrigVal, SDValue(CvtNode, 0));
+ }
+ }
+
+ ReplaceNode(N, LD);
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
SDLoc dl(N);
StoreSDNode *ST = cast<StoreSDNode>(N);
EVT StoreVT = ST->getMemoryVT();
@@ -2055,10 +2123,10 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
// do not support pre/post inc/dec
if (ST->isIndexed())
- return nullptr;
+ return false;
if (!StoreVT.isSimple())
- return nullptr;
+ return false;
// Address Space Setting
unsigned int codeAddrSpace = getCodeAddrSpace(ST);
@@ -2081,7 +2149,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
else if (num == 4)
vecType = NVPTX::PTXLdStInstCode::V4;
else
- return nullptr;
+ return false;
}
// Type Setting: toType + toTypeWidth
@@ -2125,7 +2193,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
Opcode = NVPTX::ST_f64_avar;
break;
default:
- return nullptr;
+ return false;
}
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
@@ -2154,7 +2222,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
Opcode = NVPTX::ST_f64_asi;
break;
default:
- return nullptr;
+ return false;
}
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
@@ -2184,7 +2252,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
Opcode = NVPTX::ST_f64_ari_64;
break;
default:
- return nullptr;
+ return false;
}
} else {
switch (SourceVT) {
@@ -2207,7 +2275,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
Opcode = NVPTX::ST_f64_ari;
break;
default:
- return nullptr;
+ return false;
}
}
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
@@ -2237,7 +2305,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
Opcode = NVPTX::ST_f64_areg_64;
break;
default:
- return nullptr;
+ return false;
}
} else {
switch (SourceVT) {
@@ -2260,7 +2328,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
Opcode = NVPTX::ST_f64_areg;
break;
default:
- return nullptr;
+ return false;
}
}
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
@@ -2270,16 +2338,17 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
}
- if (NVPTXST) {
- MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
- MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
- }
+ if (!NVPTXST)
+ return false;
- return NVPTXST;
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
+ ReplaceNode(N, NVPTXST);
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue Addr, Offset, Base;
@@ -2337,7 +2406,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
N2 = N->getOperand(5);
break;
default:
- return nullptr;
+ return false;
}
StOps.push_back(getI32Imm(IsVolatile, DL));
@@ -2349,11 +2418,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
if (SelectDirectAddr(N2, Addr)) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v2_avar;
break;
@@ -2377,7 +2446,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
case NVPTXISD::StoreV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v4_avar;
break;
@@ -2398,11 +2467,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
: SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v2_asi;
break;
@@ -2426,7 +2495,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
case NVPTXISD::StoreV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v4_asi;
break;
@@ -2449,11 +2518,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v2_ari_64;
break;
@@ -2477,7 +2546,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
case NVPTXISD::StoreV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v4_ari_64;
break;
@@ -2496,11 +2565,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
} else {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v2_ari;
break;
@@ -2524,7 +2593,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
case NVPTXISD::StoreV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v4_ari;
break;
@@ -2547,11 +2616,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v2_areg_64;
break;
@@ -2575,7 +2644,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
case NVPTXISD::StoreV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v4_areg_64;
break;
@@ -2594,11 +2663,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
} else {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v2_areg;
break;
@@ -2622,7 +2691,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
case NVPTXISD::StoreV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v4_areg;
break;
@@ -2650,10 +2719,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(ST)->setMemRefs(MemRefs0, MemRefs0 + 1);
- return ST;
+ ReplaceNode(N, ST);
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
+bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
SDValue Chain = Node->getOperand(0);
SDValue Offset = Node->getOperand(2);
SDValue Flag = Node->getOperand(3);
@@ -2663,7 +2733,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
unsigned VecSize;
switch (Node->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::LoadParam:
VecSize = 1;
break;
@@ -2682,11 +2752,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
switch (VecSize) {
default:
- return nullptr;
+ return false;
case 1:
switch (MemVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opc = NVPTX::LoadParamMemI8;
break;
@@ -2713,7 +2783,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
case 2:
switch (MemVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opc = NVPTX::LoadParamMemV2I8;
break;
@@ -2740,7 +2810,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
case 4:
switch (MemVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opc = NVPTX::LoadParamMemV4I8;
break;
@@ -2777,10 +2847,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
Ops.push_back(Chain);
Ops.push_back(Flag);
- return CurDAG->getMachineNode(Opc, DL, VTs, Ops);
+ ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, VTs, Ops));
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Offset = N->getOperand(1);
@@ -2791,7 +2862,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
unsigned NumElts = 1;
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreRetval:
NumElts = 1;
break;
@@ -2816,11 +2887,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
unsigned Opcode = 0;
switch (NumElts) {
default:
- return nullptr;
+ return false;
case 1:
switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opcode = NVPTX::StoreRetvalI8;
break;
@@ -2847,7 +2918,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
case 2:
switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opcode = NVPTX::StoreRetvalV2I8;
break;
@@ -2874,7 +2945,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
case 4:
switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opcode = NVPTX::StoreRetvalV4I8;
break;
@@ -2900,10 +2971,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
- return Ret;
+ ReplaceNode(N, Ret);
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Param = N->getOperand(1);
@@ -2917,7 +2989,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
unsigned NumElts = 1;
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreParamU32:
case NVPTXISD::StoreParamS32:
case NVPTXISD::StoreParam:
@@ -2948,11 +3020,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
default:
switch (NumElts) {
default:
- return nullptr;
+ return false;
case 1:
switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opcode = NVPTX::StoreParamI8;
break;
@@ -2979,7 +3051,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
case 2:
switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opcode = NVPTX::StoreParamV2I8;
break;
@@ -3006,7 +3078,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
case 4:
switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opcode = NVPTX::StoreParamV4I8;
break;
@@ -3056,17 +3128,17 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
- return Ret;
+ ReplaceNode(N, Ret);
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
SDValue Chain = N->getOperand(0);
- SDNode *Ret = nullptr;
unsigned Opc = 0;
SmallVector<SDValue, 8> Ops;
switch (N->getOpcode()) {
- default: return nullptr;
+ default: return false;
case NVPTXISD::Tex1DFloatS32:
Opc = NVPTX::TEX_1D_F32_S32;
break;
@@ -3579,18 +3651,17 @@ SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) {
}
Ops.push_back(Chain);
- Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
- return Ret;
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) {
+bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue TexHandle = N->getOperand(1);
- SDNode *Ret = nullptr;
unsigned Opc = 0;
SmallVector<SDValue, 8> Ops;
switch (N->getOpcode()) {
- default: return nullptr;
+ default: return false;
case NVPTXISD::Suld1DI8Clamp:
Opc = NVPTX::SULD_1D_I8_CLAMP;
Ops.push_back(TexHandle);
@@ -4780,14 +4851,14 @@ SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) {
Ops.push_back(Chain);
break;
}
- Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
- return Ret;
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
+ return true;
}
-/// SelectBFE - Look for instruction sequences that can be made more efficient
-/// by using the 'bfe' (bit-field extract) PTX instruction
+/// tryBFE - Look for instruction sequences that can be made more efficient
+/// by using the 'bfe' (bit-field extract) PTX instruction
-SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
SDLoc DL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
@@ -4806,7 +4877,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
if (!Mask) {
// We need a constant mask on the RHS of the AND
- return NULL;
+ return false;
}
// Extract the mask bits
@@ -4815,7 +4886,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
// We *could* handle shifted masks here, but doing so would require an
// 'and' operation to fix up the low-order bits so we would trade
// shr+and for bfe+and, which has the same throughput
- return NULL;
+ return false;
}
// How many bits are in our mask?
@@ -4836,7 +4907,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
// Do not handle the case where bits have been shifted in. In theory
// we could handle this, but the cost is likely higher than just
// emitting the srl/and pair.
- return NULL;
+ return false;
}
Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
} else {
@@ -4844,20 +4915,20 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
// was found) is not constant. We could handle this case, but it would
// require run-time logic that would be more expensive than just
// emitting the srl/and pair.
- return NULL;
+ return false;
}
} else {
// Do not handle the case where the LHS of the and is not a shift. While
// it would be trivial to handle this case, it would just transform
// 'and' -> 'bfe', but 'and' has higher-throughput.
- return NULL;
+ return false;
}
} else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
if (LHS->getOpcode() == ISD::AND) {
ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
if (!ShiftCnst) {
// Shift amount must be constant
- return NULL;
+ return false;
}
uint64_t ShiftAmt = ShiftCnst->getZExtValue();
@@ -4873,7 +4944,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
if (!MaskCnst) {
// Mask must be constant
- return NULL;
+ return false;
}
uint64_t MaskVal = MaskCnst->getZExtValue();
@@ -4893,13 +4964,13 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
NumBits = NumZeros + NumOnes - ShiftAmt;
} else {
// This is not a mask we can handle
- return NULL;
+ return false;
}
if (ShiftAmt < NumZeros) {
// Handling this case would require extra logic that would make this
// transformation non-profitable
- return NULL;
+ return false;
}
Val = AndLHS;
@@ -4919,7 +4990,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
if (!ShlCnst) {
// Shift amount must be constant
- return NULL;
+ return false;
}
uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
@@ -4927,20 +4998,20 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
if (!ShrCnst) {
// Shift amount must be constant
- return NULL;
+ return false;
}
uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
// To avoid extra codegen and be profitable, we need Outer >= Inner
if (OuterShiftAmt < InnerShiftAmt) {
- return NULL;
+ return false;
}
// If the outer shift is more than the type size, we have no bitfield to
// extract (since we also check that the inner shift is <= the outer shift
// then this also implies that the inner shift is < the type size)
if (OuterShiftAmt >= Val.getValueType().getSizeInBits()) {
- return NULL;
+ return false;
}
Start =
@@ -4956,11 +5027,11 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
}
} else {
// No can do...
- return NULL;
+ return false;
}
} else {
// No can do...
- return NULL;
+ return false;
}
@@ -4981,14 +5052,15 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
}
} else {
// We cannot handle this type
- return NULL;
+ return false;
}
SDValue Ops[] = {
Val, Start, Len
};
- return CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
+ return true;
}
// SelectDirectAddr - Match a direct address for DAG.
@@ -5122,3 +5194,57 @@ bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
}
return true;
}
+
+/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
+/// conversion from \p SrcTy to \p DestTy.
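+///
+/// For example (an illustrative use, mirroring the table below): a signed
+/// widening conversion from i8 to i32 would be requested as
+///   GetConvertOpcode(MVT::i32, MVT::i8, /*IsSigned=*/true)
+/// and yields NVPTX::CVT_s32_s8 (roughly the PTX "cvt.s32.s8" form).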
+unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
+ bool IsSigned) {
+ switch (SrcTy.SimpleTy) {
+ default:
+ llvm_unreachable("Unhandled source type");
+ case MVT::i8:
+ switch (DestTy.SimpleTy) {
+ default:
+ llvm_unreachable("Unhandled dest type");
+ case MVT::i16:
+ return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
+ case MVT::i32:
+ return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
+ case MVT::i64:
+ return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
+ }
+ case MVT::i16:
+ switch (DestTy.SimpleTy) {
+ default:
+ llvm_unreachable("Unhandled dest type");
+ case MVT::i8:
+ return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
+ case MVT::i32:
+ return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
+ case MVT::i64:
+ return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
+ }
+ case MVT::i32:
+ switch (DestTy.SimpleTy) {
+ default:
+ llvm_unreachable("Unhandled dest type");
+ case MVT::i8:
+ return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
+ case MVT::i16:
+ return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
+ case MVT::i64:
+ return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
+ }
+ case MVT::i64:
+ switch (DestTy.SimpleTy) {
+ default:
+ llvm_unreachable("Unhandled dest type");
+ case MVT::i8:
+ return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
+ case MVT::i16:
+ return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
+ case MVT::i32:
+ return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
+ }
+ }
+}
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index fe20580c83a2..d53c92f1eff3 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -21,9 +21,8 @@
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Compiler.h"
-using namespace llvm;
-namespace {
+namespace llvm {
class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
const NVPTXTargetMachine &TM;
@@ -54,24 +53,24 @@ private:
// Include the pieces autogenerated from the target description.
#include "NVPTXGenDAGISel.inc"
- SDNode *Select(SDNode *N) override;
- SDNode *SelectIntrinsicNoChain(SDNode *N);
- SDNode *SelectIntrinsicChain(SDNode *N);
- SDNode *SelectTexSurfHandle(SDNode *N);
- SDNode *SelectLoad(SDNode *N);
- SDNode *SelectLoadVector(SDNode *N);
- SDNode *SelectLDGLDU(SDNode *N);
- SDNode *SelectStore(SDNode *N);
- SDNode *SelectStoreVector(SDNode *N);
- SDNode *SelectLoadParam(SDNode *N);
- SDNode *SelectStoreRetval(SDNode *N);
- SDNode *SelectStoreParam(SDNode *N);
- SDNode *SelectAddrSpaceCast(SDNode *N);
- SDNode *SelectTextureIntrinsic(SDNode *N);
- SDNode *SelectSurfaceIntrinsic(SDNode *N);
- SDNode *SelectBFE(SDNode *N);
-
- inline SDValue getI32Imm(unsigned Imm, SDLoc DL) {
+ void Select(SDNode *N) override;
+ bool tryIntrinsicNoChain(SDNode *N);
+ bool tryIntrinsicChain(SDNode *N);
+ void SelectTexSurfHandle(SDNode *N);
+ bool tryLoad(SDNode *N);
+ bool tryLoadVector(SDNode *N);
+ bool tryLDGLDU(SDNode *N);
+ bool tryStore(SDNode *N);
+ bool tryStoreVector(SDNode *N);
+ bool tryLoadParam(SDNode *N);
+ bool tryStoreRetval(SDNode *N);
+ bool tryStoreParam(SDNode *N);
+ void SelectAddrSpaceCast(SDNode *N);
+ bool tryTextureIntrinsic(SDNode *N);
+ bool trySurfaceIntrinsic(SDNode *N);
+ bool tryBFE(SDNode *N);
+
+ inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
}
@@ -94,7 +93,8 @@ private:
bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
+ static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, bool IsSigned);
};
-}
+} // end namespace llvm
#endif
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index be735f6c1bce..f28c89cd976a 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -257,15 +257,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::CTLZ, MVT::i16, Legal);
setOperationAction(ISD::CTLZ, MVT::i32, Legal);
setOperationAction(ISD::CTLZ, MVT::i64, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Legal);
setOperationAction(ISD::CTTZ, MVT::i16, Expand);
setOperationAction(ISD::CTTZ, MVT::i32, Expand);
setOperationAction(ISD::CTTZ, MVT::i64, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
setOperationAction(ISD::CTPOP, MVT::i16, Legal);
setOperationAction(ISD::CTPOP, MVT::i32, Legal);
setOperationAction(ISD::CTPOP, MVT::i64, Legal);
@@ -273,6 +267,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// PTX does not directly support SELP of i1, so promote to i32 first
setOperationAction(ISD::SELECT, MVT::i1, Custom);
+ // PTX cannot multiply two i64s in a single instruction.
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+
// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::AND);
@@ -310,8 +308,12 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "NVPTXISD::DeclareRetParam";
case NVPTXISD::PrintCall:
return "NVPTXISD::PrintCall";
+ case NVPTXISD::PrintConvergentCall:
+ return "NVPTXISD::PrintConvergentCall";
case NVPTXISD::PrintCallUni:
return "NVPTXISD::PrintCallUni";
+ case NVPTXISD::PrintConvergentCallUni:
+ return "NVPTXISD::PrintConvergentCallUni";
case NVPTXISD::LoadParam:
return "NVPTXISD::LoadParam";
case NVPTXISD::LoadParamV2:
@@ -1309,9 +1311,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InFlag };
unsigned opcode = NVPTXISD::StoreParam;
- if (Outs[OIdx].Flags.isZExt())
+ if (Outs[OIdx].Flags.isZExt() && VT.getSizeInBits() < 32)
opcode = NVPTXISD::StoreParamU32;
- else if (Outs[OIdx].Flags.isSExt())
+ else if (Outs[OIdx].Flags.isSExt() && VT.getSizeInBits() < 32)
opcode = NVPTXISD::StoreParamS32;
Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
VT, MachinePointerInfo());
@@ -1351,8 +1353,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
DAG.getConstant(curOffset, dl, PtrVT));
SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
- MachinePointerInfo(), false, false, false,
- PartAlign);
+ MachinePointerInfo(), PartAlign);
if (elemtype.getSizeInBits() < 16) {
theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
}
@@ -1435,8 +1436,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue PrintCallOps[] = {
Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
};
- Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall),
- dl, PrintCallVTs, PrintCallOps);
+ // We model convergent calls as separate opcodes.
+ unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall;
+ if (CLI.IsConvergent)
+ Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
+ : NVPTXISD::PrintConvergentCall;
+ Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
InFlag = Chain.getValue(1);
// Ops to print out the function name
@@ -1608,9 +1613,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
unsigned sz = VTs[i].getSizeInBits();
unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
- bool needTruncate = sz < 8;
- if (VTs[i].isInteger() && (sz < 8))
+ bool needTruncate = false;
+ if (VTs[i].isInteger() && sz < 8) {
sz = 8;
+ needTruncate = true;
+ }
SmallVector<EVT, 4> LoadRetVTs;
EVT TheLoadType = VTs[i];
@@ -1619,10 +1626,16 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// aggregates.
LoadRetVTs.push_back(MVT::i32);
TheLoadType = MVT::i32;
+ needTruncate = true;
} else if (sz < 16) {
// If loading i1/i8 result, generate
// load i8 (-> i16)
// trunc i16 to i1/i8
+
+ // FIXME: Do we need to set needTruncate to true here, too? We could
+ // not figure out what this branch is for in D17872, so we left it
+ // alone. The comment above about loading i1/i8 may be wrong, as the
+ // branch above seems to cover integers of size < 32.
LoadRetVTs.push_back(MVT::i16);
} else
LoadRetVTs.push_back(Ins[i].VT);
@@ -1678,7 +1691,7 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
DAG.getIntPtrConstant(j, dl)));
}
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops);
+ return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
@@ -1872,10 +1885,9 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
assert(Node->getValueType(0) == MVT::i1 &&
"Custom lowering for i1 load only");
- SDValue newLD =
- DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
- LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(), LD->getAlignment());
+ SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
+ LD->getPointerInfo(), LD->getAlignment(),
+ LD->getMemOperand()->getFlags());
SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
// The legalizer (the caller) is expecting two values from the legalized
// load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
@@ -2002,13 +2014,10 @@ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
SDValue Tmp2 = ST->getBasePtr();
SDValue Tmp3 = ST->getValue();
assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
- unsigned Alignment = ST->getAlignment();
- bool isVolatile = ST->isVolatile();
- bool isNonTemporal = ST->isNonTemporal();
Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
- SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2,
- ST->getPointerInfo(), MVT::i8, isNonTemporal,
- isVolatile, Alignment);
+ SDValue Result =
+ DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
+ ST->getAlignment(), ST->getMemOperand()->getFlags());
return Result;
}
@@ -2027,7 +2036,7 @@ NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
// Check to see if the kernel argument is image*_t or sampler_t
-bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
+static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
static const char *const specialTypes[] = { "struct._image2d_t",
"struct._image3d_t",
"struct._sampler_t" };
@@ -2042,16 +2051,17 @@ bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
return false;
auto *STy = dyn_cast<StructType>(PTy->getElementType());
- const std::string TypeName = STy && !STy->isLiteral() ? STy->getName() : "";
+ if (!STy || STy->isLiteral())
+ return false;
return std::find(std::begin(specialTypes), std::end(specialTypes),
- TypeName) != std::end(specialTypes);
+ STy->getName()) != std::end(specialTypes);
}
SDValue NVPTXTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
const DataLayout &DL = DAG.getDataLayout();
auto PtrVT = getPointerTy(DAG.getDataLayout());
@@ -2171,12 +2181,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
ISD::SEXTLOAD : ISD::ZEXTLOAD;
p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
- MachinePointerInfo(srcValue), partVT, false,
- false, false, partAlign);
+ MachinePointerInfo(srcValue), partVT, partAlign);
} else {
p = DAG.getLoad(partVT, dl, Root, srcAddr,
- MachinePointerInfo(srcValue), false, false, false,
- partAlign);
+ MachinePointerInfo(srcValue), partAlign);
}
if (p.getNode())
p.getNode()->setIROrder(idx + 1);
@@ -2202,9 +2210,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
Value *SrcValue = Constant::getNullValue(PointerType::get(
EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
SDValue P = DAG.getLoad(
- EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false,
- true,
- DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())));
+ EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
+ DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())),
+ MachineMemOperand::MOInvariant);
if (P.getNode())
P.getNode()->setIROrder(idx + 1);
@@ -2219,9 +2227,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
Value *SrcValue = Constant::getNullValue(PointerType::get(
VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
SDValue P = DAG.getLoad(
- VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false,
- true,
- DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
+ VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
+ DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
+ MachineMemOperand::MOInvariant);
if (P.getNode())
P.getNode()->setIROrder(idx + 1);
@@ -2241,10 +2249,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
} else {
// V4 loads
// We have at least 4 elements (<3 x Ty> expands to 4 elements) and
- // the
- // vector will be expanded to a power of 2 elements, so we know we can
- // always round up to the next multiple of 4 when creating the vector
- // loads.
+ // the vector will be expanded to a power of 2 elements, so we know we
+ // can always round up to the next multiple of 4 when creating the
+ // vector loads.
// e.g. 4 elem => 1 ld.v4
// 6 elem => 2 ld.v4
// 8 elem => 2 ld.v4
@@ -2262,9 +2269,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
DAG.getConstant(Ofst, dl, PtrVT));
SDValue P = DAG.getLoad(
- VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
- false, true,
- DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
+ VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue),
+ DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
+ MachineMemOperand::MOInvariant);
if (P.getNode())
P.getNode()->setIROrder(idx + 1);
@@ -2298,12 +2305,11 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
ISD::SEXTLOAD : ISD::ZEXTLOAD;
p = DAG.getExtLoad(
ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue),
- ObjectVT, false, false, false,
+ ObjectVT,
DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
} else {
p = DAG.getLoad(
- Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue), false,
- false, false,
+ Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue),
DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
}
if (p.getNode())
@@ -2350,13 +2356,12 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
return Chain;
}
-
SDValue
NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const {
+ const SDLoc &dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const Function *F = MF.getFunction();
Type *RetTy = F->getReturnType();
@@ -3940,9 +3945,8 @@ static SDValue PerformADDCombine(SDNode *N,
SDValue N1 = N->getOperand(1);
// First try with the default operand order.
- SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget,
- OptLevel);
- if (Result.getNode())
+ if (SDValue Result =
+ PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
return Result;
// If that didn't work, try again with the operands commuted.
@@ -4139,7 +4143,7 @@ static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
// The RHS can be a demotable op or a constant
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
- APInt Val = CI->getAPIntValue();
+ const APInt &Val = CI->getAPIntValue();
if (LHSSign == Unsigned) {
return Val.isIntN(OptSize);
} else {
@@ -4230,8 +4234,7 @@ static SDValue PerformMULCombine(SDNode *N,
CodeGenOpt::Level OptLevel) {
if (OptLevel > 0) {
// Try mul.wide combining at OptLevel > 0
- SDValue Ret = TryMULWIDECombine(N, DCI);
- if (Ret.getNode())
+ if (SDValue Ret = TryMULWIDECombine(N, DCI))
return Ret;
}
@@ -4244,8 +4247,7 @@ static SDValue PerformSHLCombine(SDNode *N,
CodeGenOpt::Level OptLevel) {
if (OptLevel > 0) {
// Try mul.wide combining at OptLevel > 0
- SDValue Ret = TryMULWIDECombine(N, DCI);
- if (Ret.getNode())
+ if (SDValue Ret = TryMULWIDECombine(N, DCI))
return Ret;
}
@@ -4368,7 +4370,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
SDValue LoadChain = NewLD.getValue(NumElts);
- SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
+ SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
Results.push_back(BuildVec);
Results.push_back(LoadChain);
@@ -4481,7 +4483,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
SDValue LoadChain = NewLD.getValue(NumElts);
SDValue BuildVec =
- DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
+ DAG.getBuildVector(ResVT, DL, ScalarRes);
Results.push_back(BuildVec);
Results.push_back(LoadChain);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 60914c1d09b4..1c32232024d1 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -34,7 +34,9 @@ enum NodeType : unsigned {
DeclareRet,
DeclareScalarRet,
PrintCall,
+ PrintConvergentCall,
PrintCallUni,
+ PrintConvergentCallUni,
CallArgBegin,
CallArg,
LastCallArg,
@@ -475,10 +477,11 @@ public:
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
- SDValue LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
@@ -488,11 +491,10 @@ public:
unsigned retAlignment,
const ImmutableCallSite *CS) const;
- SDValue
- LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals, SDLoc dl,
- SelectionDAG &DAG) const override;
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+ SelectionDAG &DAG) const override;
void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
std::vector<SDValue> &Ops,
diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
index aa36b6be7250..8d00bbb5e9c2 100644
--- a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
+++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -50,6 +50,9 @@ NVPTXImageOptimizer::NVPTXImageOptimizer()
: FunctionPass(ID) {}
bool NVPTXImageOptimizer::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
bool Changed = false;
InstrToDelete.clear();
diff --git a/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp b/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp
new file mode 100644
index 000000000000..e451d273cf44
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp
@@ -0,0 +1,586 @@
+//===-- NVPTXInferAddressSpaces.cpp ---------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// CUDA C/C++ includes memory space designation as variable type qualifiers (such
+// as __global__ and __shared__). Knowing the space of a memory access allows
+// CUDA compilers to emit faster PTX loads and stores. For example, a load from
+// shared memory can be translated to `ld.shared` which is roughly 10% faster
+// than a generic `ld` on an NVIDIA Tesla K40c.
+//
+// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
+// compilers must infer the memory space of an address expression from
+// type-qualified variables.
+//
+// LLVM IR uses non-zero (so-called specific) address spaces to represent memory
+// spaces (e.g. addrspace(3) means shared memory). The Clang frontend
+// places only type-qualified variables in specific address spaces, and then
+// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
+// (the so-called generic address space) for other instructions to use.
+//
+// For example, Clang translates the following CUDA code
+// __shared__ float a[10];
+// float v = a[i];
+// to
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
+// %v = load float, float* %1 ; emits ld.f32
+// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
+// redirected to %0 (the generic version of @a).
+//
+// The optimization implemented in this file propagates specific address spaces
+// from type-qualified variable declarations to their users. For example, it
+// optimizes the above IR to
+// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %v = load float addrspace(3)* %1 ; emits ld.shared.f32
+// propagating the addrspace(3) from @a to %1. As a result, the NVPTX
+// codegen is able to emit ld.shared.f32 for %v.
+//
+// Address space inference works in two steps. First, it uses a data-flow
+// analysis to infer as many generic pointers as possible to point to only one
+// specific address space. In the above example, it can prove that %1 only
+// points to addrspace(3). This algorithm was published in
+// CUDA: Compiling and optimizing for a GPU platform
+// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
+// ICCS 2012
+//
+// Then, address space inference replaces all refinable generic pointers with
+// equivalent specific pointers.
+//
+// The major challenge of implementing this optimization is handling PHINodes,
+// which may create loops in the data flow graph. This brings two complications.
+//
+// First, the data flow analysis in Step 1 needs to handle cycles. For example,
+// %generic.input = addrspacecast float addrspace(3)* %input to float*
+// loop:
+// %y = phi [ %generic.input, %y2 ]
+// %y2 = getelementptr %y, 1
+// %v = load %y2
+// br ..., label %loop, ...
+// proving %y specific requires proving both %generic.input and %y2 specific,
+// but proving %y2 specific circles back to %y. To address this complication,
+// the data flow analysis operates on a lattice:
+// uninitialized > specific address spaces > generic.
+// All address expressions (our implementation only considers phi, bitcast,
+// addrspacecast, and getelementptr) start with the uninitialized address space.
+// The monotone transfer function moves the address space of a pointer down a
+// lattice path from uninitialized to specific and then to generic. A join
+// operation of two different specific address spaces pushes the expression down
+// to the generic address space. The analysis completes once it reaches a fixed
+// point.
+//
+// Second, IR rewriting in Step 2 also needs to handle cycles. For example,
+// converting %y to addrspace(3) requires the compiler to know the converted
+// %y2, but converting %y2 needs the converted %y. To address this complication,
+// we break these cycles using "undef" placeholders. When converting an
+// instruction `I` to a new address space, if its operand `Op` is not converted
+// yet, we let `I` temporarily use `undef` and fix all the uses of undef later.
+// For instance, our algorithm first converts %y to
+// %y' = phi float addrspace(3)* [ %input, undef ]
+// Then, it converts %y2 to
+// %y2' = getelementptr %y', 1
+// Finally, it fixes the undef in %y' so that
+// %y' = phi float addrspace(3)* [ %input, %y2' ]
+//
+// TODO: This pass is experimental and not enabled by default. Users can turn it
+// on by setting the -nvptx-use-infer-addrspace flag of llc. We plan to replace
+// NVPTXFavorNonGenericAddrSpaces with this pass shortly.
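+//
+// For illustration only (the flag spelling is taken from the TODO above; the
+// other options are just typical NVPTX llc settings):
+//   llc -march=nvptx64 -mcpu=sm_35 -nvptx-use-infer-addrspace input.ll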
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "nvptx-infer-addrspace"
+
+#include "NVPTX.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+using namespace llvm;
+
+namespace {
+const unsigned ADDRESS_SPACE_UNINITIALIZED = (unsigned)-1;
+
+using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
+
+/// \brief Infer specific address spaces for generic pointer expressions.
+class NVPTXInferAddressSpaces: public FunctionPass {
+public:
+ static char ID;
+
+ NVPTXInferAddressSpaces() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // Returns the new address space of V if updated; otherwise, returns None.
+ Optional<unsigned>
+ updateAddressSpace(const Value &V,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace);
+
+ // Tries to infer the specific address space of each address expression in
+ // Postorder.
+ void inferAddressSpaces(const std::vector<Value *> &Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace);
+
+ // Changes the generic address expressions in function F to point to specific
+ // address spaces if InferredAddrSpace says so. Postorder is the postorder of
+ // all generic address expressions in the use-def graph of function F.
+ bool
+ rewriteWithNewAddressSpaces(const std::vector<Value *> &Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace,
+ Function *F);
+};
+} // end anonymous namespace
+
+char NVPTXInferAddressSpaces::ID = 0;
+
+namespace llvm {
+void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
+}
+INITIALIZE_PASS(NVPTXInferAddressSpaces, "nvptx-infer-addrspace",
+ "Infer address spaces",
+ false, false)
+
+// Returns true if V is an address expression.
+// TODO: Currently, we consider only phi, bitcast, addrspacecast, and
+// getelementptr operators.
+static bool isAddressExpression(const Value &V) {
+ if (!isa<Operator>(V))
+ return false;
+
+ switch (cast<Operator>(V).getOpcode()) {
+ case Instruction::PHI:
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Returns the pointer operands of V.
+//
+// Precondition: V is an address expression.
+static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
+ assert(isAddressExpression(V));
+ const Operator& Op = cast<Operator>(V);
+ switch (Op.getOpcode()) {
+ case Instruction::PHI: {
+ auto IncomingValues = cast<PHINode>(Op).incoming_values();
+ return SmallVector<Value *, 2>(IncomingValues.begin(),
+ IncomingValues.end());
+ }
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return {Op.getOperand(0)};
+ default:
+ llvm_unreachable("Unexpected instruction type.");
+ }
+}
+
+// If V is an unvisited generic address expression, appends V to PostorderStack
+// and marks it as visited.
+static void appendsGenericAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> *PostorderStack,
+ DenseSet<Value *> *Visited) {
+ assert(V->getType()->isPointerTy());
+ if (isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() ==
+ AddressSpace::ADDRESS_SPACE_GENERIC) {
+ if (Visited->insert(V).second)
+ PostorderStack->push_back(std::make_pair(V, false));
+ }
+}
+
+// Returns all generic address expressions in function F. The elements are
+// ordered in postorder.
+static std::vector<Value *> collectGenericAddressExpressions(Function &F) {
+ // This function implements a non-recursive postorder traversal of a partial
+ // use-def graph of function F.
+ std::vector<std::pair<Value*, bool>> PostorderStack;
+ // The set of visited expressions.
+ DenseSet<Value*> Visited;
+ // We only explore address expressions that are reachable from loads and
+ // stores for now because we aim at generating faster loads and stores.
+ for (Instruction &I : instructions(F)) {
+ if (isa<LoadInst>(I)) {
+ appendsGenericAddressExpressionToPostorderStack(
+ I.getOperand(0), &PostorderStack, &Visited);
+ } else if (isa<StoreInst>(I)) {
+ appendsGenericAddressExpressionToPostorderStack(
+ I.getOperand(1), &PostorderStack, &Visited);
+ }
+ }
+
+ std::vector<Value *> Postorder; // The resultant postorder.
+ while (!PostorderStack.empty()) {
+ // If the operands of the expression on top of the stack are already
+ // explored, adds that expression to the resultant postorder.
+ if (PostorderStack.back().second) {
+ Postorder.push_back(PostorderStack.back().first);
+ PostorderStack.pop_back();
+ continue;
+ }
+ // Otherwise, adds its operands to the stack and explores them.
+ PostorderStack.back().second = true;
+ for (Value *PtrOperand : getPointerOperands(*PostorderStack.back().first)) {
+ appendsGenericAddressExpressionToPostorderStack(
+ PtrOperand, &PostorderStack, &Visited);
+ }
+ }
+ return Postorder;
+}
+
+// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone
+// of OperandUse.get() in the new address space. If the clone is not ready yet,
+// returns an undef in the new address space as a placeholder.
+static Value *operandWithNewAddressSpaceOrCreateUndef(
+ const Use &OperandUse, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Value *Operand = OperandUse.get();
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
+ return NewOperand;
+
+ UndefUsesToFix->push_back(&OperandUse);
+ return UndefValue::get(
+ Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace));
+}
+
+// Returns a clone of `I` with its operands converted to those specified in
+// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an
+// operand whose address space needs to be modified might not exist in
+// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and
+// adds that operand use to UndefUsesToFix so that caller can fix them later.
+//
+// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
+// from a pointer whose type already matches. Therefore, this function returns a
+// Value* instead of an Instruction*.
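+//
+// For instance (a sketch reusing the %y/%y2 example from the file header):
+// cloning
+//   %y2 = getelementptr float, float* %y, i64 1
+// into addrspace(3) before %y has been cloned yields, roughly,
+//   %y2' = getelementptr float, float addrspace(3)* undef, i64 1
+// and the undef operand use is recorded in UndefUsesToFix for later patching.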
+static Value *cloneInstructionWithNewAddressSpace(
+ Instruction *I, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Type *NewPtrType =
+ I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (I->getOpcode() == Instruction::AddrSpaceCast) {
+ Value *Src = I->getOperand(0);
+ // Because `I` is generic, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space, according
+ // to our algorithm.
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ if (Src->getType() != NewPtrType)
+ return new BitCastInst(Src, NewPtrType);
+ return Src;
+ }
+
+ // Computes the converted pointer operands.
+ SmallVector<Value *, 4> NewPointerOperands;
+ for (const Use &OperandUse : I->operands()) {
+ if (!OperandUse.get()->getType()->isPointerTy())
+ NewPointerOperands.push_back(nullptr);
+ else
+ NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
+ OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
+ }
+
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ return new BitCastInst(NewPointerOperands[0], NewPtrType);
+ case Instruction::PHI: {
+ assert(I->getType()->isPointerTy());
+ PHINode *PHI = cast<PHINode>(I);
+ PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues());
+ for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) {
+ unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index);
+ NewPHI->addIncoming(NewPointerOperands[OperandNo],
+ PHI->getIncomingBlock(Index));
+ }
+ return NewPHI;
+ }
+ case Instruction::GetElementPtr: {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+ GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+ GEP->getSourceElementType(), NewPointerOperands[0],
+ SmallVector<Value *, 4>(GEP->idx_begin(), GEP->idx_end()));
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ return NewGEP;
+ }
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+}
+
+// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the
+// constant expression `CE` with its operands replaced as specified in
+// ValueWithNewAddrSpace.
+static Value *cloneConstantExprWithNewAddressSpace(
+ ConstantExpr *CE, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace) {
+ Type *TargetType =
+ CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+ // Because CE is generic, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space according
+ // to our algorithm.
+ assert(CE->getOperand(0)->getType()->getPointerAddressSpace() ==
+ NewAddrSpace);
+ return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
+ }
+
+ // Computes the operands of the new constant expression.
+ SmallVector<Constant *, 4> NewOperands;
+ for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
+ Constant *Operand = CE->getOperand(Index);
+ // If the address space of `Operand` needs to be modified, the new operand
+ // with the new address space should already be in ValueWithNewAddrSpace
+ // because (1) the constant expressions we consider (i.e. addrspacecast,
+ // bitcast, and getelementptr) do not incur cycles in the data flow graph
+ // and (2) this function is called on constant expressions in postorder.
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) {
+ NewOperands.push_back(cast<Constant>(NewOperand));
+ } else {
+ // Otherwise, reuses the old operand.
+ NewOperands.push_back(Operand);
+ }
+ }
+
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ // Needs to specify the source type while constructing a getelementptr
+ // constant expression.
+ return CE->getWithOperands(
+ NewOperands, TargetType, /*OnlyIfReduced=*/false,
+ NewOperands[0]->getType()->getPointerElementType());
+ }
+
+ return CE->getWithOperands(NewOperands, TargetType);
+}
+
+// Returns a clone of the value `V`, with its operands replaced as specified in
+// ValueWithNewAddrSpace. This function is called on every generic address
+// expression whose address space needs to be modified, in postorder.
+//
+// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
+static Value *
+cloneValueWithNewAddressSpace(Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ // All values in Postorder are generic address expressions.
+ assert(isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() ==
+ AddressSpace::ADDRESS_SPACE_GENERIC);
+
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ Value *NewV = cloneInstructionWithNewAddressSpace(
+ I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
+ if (Instruction *NewI = dyn_cast<Instruction>(NewV)) {
+ if (NewI->getParent() == nullptr) {
+ NewI->insertBefore(I);
+ NewI->takeName(I);
+ }
+ }
+ return NewV;
+ }
+
+ return cloneConstantExprWithNewAddressSpace(
+ cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace);
+}
+
+// Defines the join operation on the address space lattice (see the file header
+// comments).
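+//
+// A few illustrative cases (using the usual NVPTX numbering: generic = 0,
+// global = 1, shared = 3):
+//   join(uninitialized, shared) == shared
+//   join(shared, shared)        == shared
+//   join(shared, global)        == generic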
+static unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) {
+ if (AS1 == AddressSpace::ADDRESS_SPACE_GENERIC ||
+ AS2 == AddressSpace::ADDRESS_SPACE_GENERIC)
+ return AddressSpace::ADDRESS_SPACE_GENERIC;
+
+ if (AS1 == ADDRESS_SPACE_UNINITIALIZED)
+ return AS2;
+ if (AS2 == ADDRESS_SPACE_UNINITIALIZED)
+ return AS1;
+
+ // The join of two different specific address spaces is generic.
+ return AS1 == AS2 ? AS1 : (unsigned)AddressSpace::ADDRESS_SPACE_GENERIC;
+}
+
+bool NVPTXInferAddressSpaces::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ // Collects all generic address expressions in postorder.
+ std::vector<Value *> Postorder = collectGenericAddressExpressions(F);
+
+ // Runs a data-flow analysis to refine the address spaces of every expression
+ // in Postorder.
+ ValueToAddrSpaceMapTy InferredAddrSpace;
+ inferAddressSpaces(Postorder, &InferredAddrSpace);
+
+ // Changes the address spaces of the generic address expressions that are
+ // inferred to point to a specific address space.
+ return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, &F);
+}
+
+void NVPTXInferAddressSpaces::inferAddressSpaces(
+ const std::vector<Value *> &Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) {
+ SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
+ // Initially, all expressions are in the uninitialized address space.
+ for (Value *V : Postorder)
+ (*InferredAddrSpace)[V] = ADDRESS_SPACE_UNINITIALIZED;
+
+ while (!Worklist.empty()) {
+ Value* V = Worklist.pop_back_val();
+
+ // Tries to update the address space of V according to the
+ // address spaces of its operands.
+ DEBUG(dbgs() << "Updating the address space of\n"
+ << " " << *V << "\n");
+ Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
+ if (!NewAS.hasValue())
+ continue;
+ // If any updates are made, adds the users of V to the worklist because
+ // their address spaces may also need to be updated.
+ DEBUG(dbgs() << " to " << NewAS.getValue() << "\n");
+ (*InferredAddrSpace)[V] = NewAS.getValue();
+
+ for (Value *User : V->users()) {
+ // Skip if User is already in the worklist.
+ if (Worklist.count(User))
+ continue;
+
+ auto Pos = InferredAddrSpace->find(User);
+ // Our algorithm only updates the address spaces of generic address
+ // expressions, which are those in InferredAddrSpace.
+ if (Pos == InferredAddrSpace->end())
+ continue;
+
+ // Function updateAddressSpace moves the address space down a lattice
+ // path. Therefore, nothing to do if User is already inferred as
+ // generic (the bottom element in the lattice).
+ if (Pos->second == AddressSpace::ADDRESS_SPACE_GENERIC)
+ continue;
+
+ Worklist.insert(User);
+ }
+ }
+}
+
+Optional<unsigned> NVPTXInferAddressSpaces::updateAddressSpace(
+ const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) {
+ assert(InferredAddrSpace.count(&V));
+
+ // The new inferred address space equals the join of the address spaces
+ // of all its pointer operands.
+ unsigned NewAS = ADDRESS_SPACE_UNINITIALIZED;
+ for (Value *PtrOperand : getPointerOperands(V)) {
+ unsigned OperandAS;
+ if (InferredAddrSpace.count(PtrOperand))
+ OperandAS = InferredAddrSpace.lookup(PtrOperand);
+ else
+ OperandAS = PtrOperand->getType()->getPointerAddressSpace();
+ NewAS = joinAddressSpaces(NewAS, OperandAS);
+ // join(generic, *) = generic. So we can break if NewAS is already generic.
+ if (NewAS == AddressSpace::ADDRESS_SPACE_GENERIC)
+ break;
+ }
+
+ unsigned OldAS = InferredAddrSpace.lookup(&V);
+ assert(OldAS != AddressSpace::ADDRESS_SPACE_GENERIC);
+ if (OldAS == NewAS)
+ return None;
+ return NewAS;
+}
+
+bool NVPTXInferAddressSpaces::rewriteWithNewAddressSpaces(
+ const std::vector<Value *> &Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) {
+ // For each address expression to be modified, creates a clone of it with its
+ // pointer operands converted to the new address space. Since the pointer
+ // operands are converted, the clone is naturally in the new address space by
+ // construction.
+ ValueToValueMapTy ValueWithNewAddrSpace;
+ SmallVector<const Use *, 32> UndefUsesToFix;
+ for (Value* V : Postorder) {
+ unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
+ if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
+ ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace(
+ V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+ }
+ }
+
+ if (ValueWithNewAddrSpace.empty())
+ return false;
+
+ // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
+ for (const Use* UndefUse : UndefUsesToFix) {
+ User *V = UndefUse->getUser();
+ User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V));
+ unsigned OperandNo = UndefUse->getOperandNo();
+ assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
+ NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
+ }
+
+ // Replaces the uses of the old address expressions with the new ones.
+ for (Value *V : Postorder) {
+ Value *NewV = ValueWithNewAddrSpace.lookup(V);
+ if (NewV == nullptr)
+ continue;
+
+ SmallVector<Use *, 4> Uses;
+ for (Use &U : V->uses())
+ Uses.push_back(&U);
+ DEBUG(dbgs() << "Replacing the uses of " << *V << "\n to\n " << *NewV
+ << "\n");
+ for (Use *U : Uses) {
+ if (isa<LoadInst>(U->getUser()) ||
+ (isa<StoreInst>(U->getUser()) && U->getOperandNo() == 1)) {
+ // If V is used as the pointer operand of a load/store, sets the pointer
+ // operand to NewV. This replacement does not change the element type,
+ // so the resultant load/store is still valid.
+ U->set(NewV);
+ } else if (isa<Instruction>(U->getUser())) {
+ // Otherwise, replaces the use with generic(NewV).
+ // TODO: Some optimization opportunities are missed. For example, in
+ // %0 = icmp eq float* %p, %q
+ // if both p and q are inferred to be shared, we can rewrite %0 as
+ // %0 = icmp eq float addrspace(3)* %new_p, %new_q
+ // instead of currently
+ // %generic_p = addrspacecast float addrspace(3)* %new_p to float*
+ // %generic_q = addrspacecast float addrspace(3)* %new_q to float*
+ // %0 = icmp eq float* %generic_p, %generic_q
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ BasicBlock::iterator InsertPos = std::next(I->getIterator());
+ while (isa<PHINode>(InsertPos))
+ ++InsertPos;
+ U->set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+ } else {
+ U->set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ V->getType()));
+ }
+ }
+ }
+ if (V->use_empty())
+ RecursivelyDeleteTriviallyDeadInstructions(V);
+ }
+
+ return true;
+}
+
+FunctionPass *llvm::createNVPTXInferAddressSpacesPass() {
+ return new NVPTXInferAddressSpaces();
+}
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 9f3cf4551955..0c7c6cbc4512 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -30,9 +30,10 @@ void NVPTXInstrInfo::anchor() {}
NVPTXInstrInfo::NVPTXInstrInfo() : NVPTXGenInstrInfo(), RegInfo() {}
-void NVPTXInstrInfo::copyPhysReg(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg, bool KillSrc) const {
+void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
@@ -111,7 +112,7 @@ bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI,
bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const {
unsigned addrspace = 0;
- if (MI->getOpcode() == NVPTX::INT_CUDA_SYNCTHREADS)
+ if (MI->getOpcode() == NVPTX::INT_BARRIER0)
return false;
if (isLoadInstr(*MI, addrspace))
if (addrspace == NVPTX::PTXLdStInstCode::SHARED)
@@ -145,26 +146,28 @@ bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const {
/// Note that RemoveBranch and InsertBranch must be implemented to support
/// cases where this method returns success.
///
-bool NVPTXInstrInfo::AnalyzeBranch(
- MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const {
+bool NVPTXInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
// If the block has no terminators, it just falls into the block after it.
MachineBasicBlock::iterator I = MBB.end();
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I))
return false;
// Get the last instruction in the block.
- MachineInstr *LastInst = I;
+ MachineInstr &LastInst = *I;
// If there is only one terminator instruction, process it.
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
- if (LastInst->getOpcode() == NVPTX::GOTO) {
- TBB = LastInst->getOperand(0).getMBB();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+ if (LastInst.getOpcode() == NVPTX::GOTO) {
+ TBB = LastInst.getOperand(0).getMBB();
return false;
- } else if (LastInst->getOpcode() == NVPTX::CBranch) {
+ } else if (LastInst.getOpcode() == NVPTX::CBranch) {
// Block ends with fall-through condbranch.
- TBB = LastInst->getOperand(1).getMBB();
- Cond.push_back(LastInst->getOperand(0));
+ TBB = LastInst.getOperand(1).getMBB();
+ Cond.push_back(LastInst.getOperand(0));
return false;
}
// Otherwise, don't know what this is.
@@ -172,26 +175,26 @@ bool NVPTXInstrInfo::AnalyzeBranch(
}
// Get the instruction before it if it's a terminator.
- MachineInstr *SecondLastInst = I;
+ MachineInstr &SecondLastInst = *I;
// If there are three terminators, we don't know what sort of block this is.
- if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+ if (I != MBB.begin() && isUnpredicatedTerminator(*--I))
return true;
// If the block ends with NVPTX::GOTO and NVPTX:CBranch, handle it.
- if (SecondLastInst->getOpcode() == NVPTX::CBranch &&
- LastInst->getOpcode() == NVPTX::GOTO) {
- TBB = SecondLastInst->getOperand(1).getMBB();
- Cond.push_back(SecondLastInst->getOperand(0));
- FBB = LastInst->getOperand(0).getMBB();
+ if (SecondLastInst.getOpcode() == NVPTX::CBranch &&
+ LastInst.getOpcode() == NVPTX::GOTO) {
+ TBB = SecondLastInst.getOperand(1).getMBB();
+ Cond.push_back(SecondLastInst.getOperand(0));
+ FBB = LastInst.getOperand(0).getMBB();
return false;
}
// If the block ends with two NVPTX:GOTOs, handle it. The second one is not
// executed, so remove it.
- if (SecondLastInst->getOpcode() == NVPTX::GOTO &&
- LastInst->getOpcode() == NVPTX::GOTO) {
- TBB = SecondLastInst->getOperand(0).getMBB();
+ if (SecondLastInst.getOpcode() == NVPTX::GOTO &&
+ LastInst.getOpcode() == NVPTX::GOTO) {
+ TBB = SecondLastInst.getOperand(0).getMBB();
I = LastInst;
if (AllowModify)
I->eraseFromParent();
@@ -226,9 +229,11 @@ unsigned NVPTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 2;
}
-unsigned NVPTXInstrInfo::InsertBranch(
- MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond, DebugLoc DL) const {
+unsigned NVPTXInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 1 || Cond.size() == 0) &&
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index 3e407223f010..050bf12fe859 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -49,9 +49,9 @@ public:
* const TargetRegisterClass *RC) const;
*/
- void copyPhysReg(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg, bool KillSrc) const override;
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
virtual bool isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
unsigned &DestReg) const;
bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
@@ -59,13 +59,14 @@ public:
virtual bool CanTailMerge(const MachineInstr *MI) const;
// Branch analysis.
- bool AnalyzeBranch(
- MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override;
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
- unsigned InsertBranch(
- MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond, DebugLoc DL) const override;
+ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const override;
unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const {
return MI.getOperand(2).getImm();
}
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 6fdd60f3ed2d..c158cc6cdab2 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -14,7 +14,9 @@
include "NVPTXInstrFormats.td"
// A NOP instruction
-def NOP : NVPTXInst<(outs), (ins), "", []>;
+let hasSideEffects = 0 in {
+ def NOP : NVPTXInst<(outs), (ins), "", []>;
+}
// List of vector specific properties
def isVecLD : VecInstTypeEnum<1>;
@@ -162,130 +164,146 @@ def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
// Some Common Instruction Class Templates
//===----------------------------------------------------------------------===//
+// Template for instructions which take three int64, int32, or int16 args.
+// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
multiclass I3<string OpcStr, SDNode OpNode> {
- def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
- Int64Regs:$b))]>;
- def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
- def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
- Int32Regs:$b))]>;
- def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
- def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
- Int16Regs:$b))]>;
- def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+ def i64rr :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+ def i64ri :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+ def i32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def i32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def i16rr :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+ def i16ri :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
}
+// Template for instructions which take 3 int32 args. The instructions are
+// named "<OpcStr>.s32" (e.g. "addc.cc.s32").
multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
- def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
- Int32Regs:$b),
- !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
- Int32Regs:$b))]>;
- def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def i32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def i32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
}
+// Template for instructions which take three fp64 or fp32 args. The
+// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32 functions.
multiclass F3<string OpcStr, SDNode OpNode> {
- def f64rr : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b),
- !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst,
- (OpNode Float64Regs:$a, Float64Regs:$b))]>,
- Requires<[allowFMA]>;
- def f64ri : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b),
- !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst,
- (OpNode Float64Regs:$a, fpimm:$b))]>,
- Requires<[allowFMA]>;
- def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[allowFMA, doF32FTZ]>;
- def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[allowFMA, doF32FTZ]>;
- def f32rr : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[allowFMA]>;
- def f32ri : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[allowFMA]>;
+ def f64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+ Requires<[allowFMA]>;
+ def f64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA]>;
+ def f32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[allowFMA, doF32FTZ]>;
+ def f32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA, doF32FTZ]>;
+ def f32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[allowFMA]>;
+ def f32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA]>;
}
+// Same as F3, but defines ".rn" variants (round to nearest even).
multiclass F3_rn<string OpcStr, SDNode OpNode> {
- def f64rr : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b),
- !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst,
- (OpNode Float64Regs:$a, Float64Regs:$b))]>,
- Requires<[noFMA]>;
- def f64ri : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b),
- !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst,
- (OpNode Float64Regs:$a, fpimm:$b))]>,
- Requires<[noFMA]>;
- def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[noFMA, doF32FTZ]>;
- def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[noFMA, doF32FTZ]>;
- def f32rr : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[noFMA]>;
- def f32ri : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[noFMA]>;
+ def f64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+ Requires<[noFMA]>;
+ def f64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA]>;
+ def f32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[noFMA, doF32FTZ]>;
+ def f32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA, doF32FTZ]>;
+ def f32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[noFMA]>;
+ def f32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA]>;
}
+// Template for operations which take two f32 or f64 operands. Provides three
+// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
+// subnormal inputs and results to zero).
multiclass F2<string OpcStr, SDNode OpNode> {
- def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
- !strconcat(OpcStr, ".f64 \t$dst, $a;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
+ def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
+ !strconcat(OpcStr, ".f64 \t$dst, $a;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
- Requires<[doF32FTZ]>;
- def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
- !strconcat(OpcStr, ".f32 \t$dst, $a;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
+ Requires<[doF32FTZ]>;
+ def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ !strconcat(OpcStr, ".f32 \t$dst, $a;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
}
//===----------------------------------------------------------------------===//
@@ -293,160 +311,251 @@ multiclass F2<string OpcStr, SDNode OpNode> {
//===----------------------------------------------------------------------===//
//-----------------------------------
-// General Type Conversion
+// Type Conversion
//-----------------------------------
let hasSideEffects = 0 in {
-// Generate a cvt to the given type from all possible types.
-// Each instance takes a CvtMode immediate that defines the conversion mode to
-// use. It can be CvtNONE to omit a conversion mode.
-multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
- def _s16 : NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s16\t$dst, $src;"),
- []>;
- def _u16 : NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u16\t$dst, $src;"),
- []>;
- def _f16 : NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f16\t$dst, $src;"),
- []>;
- def _s32 : NVPTXInst<(outs RC:$dst),
- (ins Int32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s32\t$dst, $src;"),
- []>;
- def _u32 : NVPTXInst<(outs RC:$dst),
- (ins Int32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u32\t$dst, $src;"),
- []>;
- def _s64 : NVPTXInst<(outs RC:$dst),
- (ins Int64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s64\t$dst, $src;"),
- []>;
- def _u64 : NVPTXInst<(outs RC:$dst),
- (ins Int64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u64\t$dst, $src;"),
- []>;
- def _f32 : NVPTXInst<(outs RC:$dst),
- (ins Float32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f32\t$dst, $src;"),
- []>;
- def _f64 : NVPTXInst<(outs RC:$dst),
- (ins Float64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f64\t$dst, $src;"),
- []>;
-}
-
-// Generate a cvt to all possible types.
-defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
-defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
-defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
-defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
-defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
-defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
-defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
-defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
-defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
-
-// This set of cvt is different from the above. The type of the source
-// and target are the same.
-//
-def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "cvt.s16.s8 \t$dst, $src;", []>;
-def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "cvt.s32.s8 \t$dst, $src;", []>;
-def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "cvt.s32.s16 \t$dst, $src;", []>;
-def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s8 \t$dst, $src;", []>;
-def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s16 \t$dst, $src;", []>;
-def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s32 \t$dst, $src;", []>;
+ // Generate a cvt to the given type from all possible types. Each instance
+ // takes a CvtMode immediate that defines the conversion mode to use. It can
+ // be CvtNONE to omit a conversion mode.
+ multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
+ def _s8 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s8\t$dst, $src;"), []>;
+ def _u8 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u8\t$dst, $src;"), []>;
+ def _s16 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s16\t$dst, $src;"), []>;
+ def _u16 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u16\t$dst, $src;"), []>;
+ def _f16 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f16\t$dst, $src;"), []>;
+ def _s32 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s32\t$dst, $src;"), []>;
+ def _u32 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u32\t$dst, $src;"), []>;
+ def _s64 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s64\t$dst, $src;"), []>;
+ def _u64 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u64\t$dst, $src;"), []>;
+ def _f32 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Float32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f32\t$dst, $src;"), []>;
+ def _f64 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Float64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f64\t$dst, $src;"), []>;
+ }
+
+ // Generate cvts from all types to all types.
+ defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>;
+ defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
+ defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
+ defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
+ defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
+ defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
+ defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
+ defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
+ defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
+ defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
+ defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
+
+ // These cvts are different from those above: The source and dest registers
+ // are of the same type.
+ def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "cvt.s16.s8 \t$dst, $src;", []>;
+ def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "cvt.s32.s8 \t$dst, $src;", []>;
+ def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "cvt.s32.s16 \t$dst, $src;", []>;
+ def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s8 \t$dst, $src;", []>;
+ def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s16 \t$dst, $src;", []>;
+ def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s32 \t$dst, $src;", []>;
}
//-----------------------------------
// Integer Arithmetic
//-----------------------------------
+// Template for xor masquerading as int1 arithmetic.
multiclass ADD_SUB_i1<SDNode OpNode> {
def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
- "xor.pred \t$dst, $a, $b;",
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+ "xor.pred \t$dst, $a, $b;",
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
- "xor.pred \t$dst, $a, $b;",
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
+ "xor.pred \t$dst, $a, $b;",
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
}
+// int1 addition and subtraction are both just xor.
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;
-
+// int16, int32, and int64 signed addition. Since nvptx is two's complement, we
+// also use these for unsigned arithmetic.
defm ADD : I3<"add.s", add>;
defm SUB : I3<"sub.s", sub>;
+// int32 addition and subtraction with carry-out.
+// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
+// int32 addition and subtraction with carry-in and carry-out.
defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
-//mul.wide PTX instruction
+defm MULT : I3<"mul.lo.s", mul>;
+
+defm MULTHS : I3<"mul.hi.s", mulhs>;
+defm MULTHU : I3<"mul.hi.u", mulhu>;
+
+defm SDIV : I3<"div.s", sdiv>;
+defm UDIV : I3<"div.u", udiv>;
+
+// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
+// and visitUREM will lower them.
+defm SREM : I3<"rem.s", srem>;
+defm UREM : I3<"rem.u", urem>;
+
+
+//
+// Wide multiplication
+//
+def MULWIDES64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+
+def MULWIDEU64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+
+def MULWIDES32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+
+def MULWIDEU32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+
+def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
+def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
+def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+
+// Matchers for signed, unsigned mul.wide ISD nodes.
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
+ (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
+ (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
+ (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
+ (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
+ (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
+ (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
+ (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
+ (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+
+// Predicates used for converting some patterns to mul.wide.
def SInt32Const : PatLeaf<(imm), [{
const APInt &v = N->getAPIntValue();
- if (v.isSignedIntN(32))
- return true;
- return false;
+ return v.isSignedIntN(32);
}]>;
def UInt32Const : PatLeaf<(imm), [{
const APInt &v = N->getAPIntValue();
- if (v.isIntN(32))
- return true;
- return false;
+ return v.isIntN(32);
}]>;
def SInt16Const : PatLeaf<(imm), [{
const APInt &v = N->getAPIntValue();
- if (v.isSignedIntN(16))
- return true;
- return false;
+ return v.isSignedIntN(16);
}]>;
def UInt16Const : PatLeaf<(imm), [{
const APInt &v = N->getAPIntValue();
- if (v.isIntN(16))
- return true;
- return false;
+ return v.isIntN(16);
}]>;
def Int5Const : PatLeaf<(imm), [{
+ // Check if 0 <= v < 32; only then will the result of (x << v) be an int32.
const APInt &v = N->getAPIntValue();
- // Check if 0 <= v < 32
- // Only then the result from (x << v) will be i32
- if (v.sge(0) && v.slt(32))
- return true;
- return false;
+ return v.sge(0) && v.slt(32);
}]>;
def Int4Const : PatLeaf<(imm), [{
+ // Check if 0 <= v < 16; only then will the result of (x << v) be an int16.
const APInt &v = N->getAPIntValue();
- // Check if 0 <= v < 16
- // Only then the result from (x << v) will be i16
- if (v.sge(0) && v.slt(16))
- return true;
- return false;
+ return v.sge(0) && v.slt(16);
}]>;
def SHL2MUL32 : SDNodeXForm<imm, [{
@@ -461,215 +570,133 @@ def SHL2MUL16 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
}]>;
-def MULWIDES64
- : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDES64Imm
- : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDES64Imm64
- : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
-
-def MULWIDEU64
- : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDEU64Imm
- : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDEU64Imm64
- : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
-
-def MULWIDES32
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDES32Imm
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDES32Imm32
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
-
-def MULWIDEU32
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
-def MULWIDEU32Imm
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
-def MULWIDEU32Imm32
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
-
+// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
(MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
(MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
(MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
(MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
+// Convert "sign/zero-extend then multiply" to mul.wide.
def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
(MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
(MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
(MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
Requires<[doMulWide]>;
def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
(MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
(MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
Requires<[doMulWide]>;
def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
(MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
(MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
Requires<[doMulWide]>;
def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
(MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
- Requires<[doMulWide]>;
-
-
-def SDTMulWide
- : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
-def mul_wide_signed
- : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
-def mul_wide_unsigned
- : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
-
-def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
- (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
- (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
- (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
- (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-
-def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
- (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
- (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
- (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
- (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-
-defm MULT : I3<"mul.lo.s", mul>;
-
-defm MULTHS : I3<"mul.hi.s", mulhs>;
-defm MULTHU : I3<"mul.hi.u", mulhu>;
-
-defm SDIV : I3<"div.s", sdiv>;
-defm UDIV : I3<"div.u", udiv>;
-
-defm SREM : I3<"rem.s", srem>;
-// The ri version will not be selected as DAGCombiner::visitSREM will lower it.
-defm UREM : I3<"rem.u", urem>;
-// The ri version will not be selected as DAGCombiner::visitUREM will lower it.
-
-def SDTIMAD
- : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
- SDTCisInt<2>, SDTCisSameAs<0, 2>,
- SDTCisSameAs<0, 3>]>;
-def imad
- : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
-
-def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst,
- (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
-def MAD16rri : NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst,
- (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
-def MAD16rir : NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst,
- (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
-def MAD16rii : NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst,
- (imad Int16Regs:$a, imm:$b, imm:$c))]>;
-
-def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst,
- (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
-def MAD32rri : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst,
- (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
-def MAD32rir : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst,
- (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
-def MAD32rii : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst,
- (imad Int32Regs:$a, imm:$b, imm:$c))]>;
-
-def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst,
- (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
-def MAD64rri : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst,
- (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
-def MAD64rir : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst,
- (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
-def MAD64rii : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst,
- (imad Int64Regs:$a, imm:$b, imm:$c))]>;
-
-def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "neg.s16 \t$dst, $src;",
- [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
-def INEG32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "neg.s32 \t$dst, $src;",
- [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
-def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "neg.s64 \t$dst, $src;",
- [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
+//
+// Integer multiply-add
+//
+def SDTIMAD :
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
+ SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
+def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
+
+def MAD16rrr :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
+def MAD16rri :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
+def MAD16rir :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
+def MAD16rii :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD32rrr :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
+def MAD32rri :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
+def MAD32rir :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
+def MAD32rii :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD64rrr :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
+def MAD64rri :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
+def MAD64rir :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
+def MAD64rii :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
+
+def INEG16 :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "neg.s16 \t$dst, $src;",
+ [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
+def INEG32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "neg.s32 \t$dst, $src;",
+ [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
+def INEG64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "neg.s64 \t$dst, $src;",
+ [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
//-----------------------------------
// Floating Point Arithmetic
@@ -677,17 +704,13 @@ def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
// Constant 1.0f
def FloatConst1 : PatLeaf<(fpimm), [{
- if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEsingle)
- return false;
- float f = (float)N->getValueAPF().convertToFloat();
- return (f==1.0f);
+ return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle &&
+ N->getValueAPF().convertToFloat() == 1.0f;
}]>;
-// Constand (double)1.0
+// Constant 1.0 (double)
def DoubleConst1 : PatLeaf<(fpimm), [{
- if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEdouble)
- return false;
- double d = (double)N->getValueAPF().convertToDouble();
- return (d==1.0);
+ return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble &&
+ N->getValueAPF().convertToDouble() == 1.0;
}]>;
defm FADD : F3<"add", fadd>;
@@ -698,157 +721,157 @@ defm FADD_rn : F3_rn<"add", fadd>;
defm FSUB_rn : F3_rn<"sub", fsub>;
defm FMUL_rn : F3_rn<"mul", fmul>;
-defm FABS : F2<"abs", fabs>;
-defm FNEG : F2<"neg", fneg>;
+defm FABS : F2<"abs", fabs>;
+defm FNEG : F2<"neg", fneg>;
defm FSQRT : F2<"sqrt.rn", fsqrt>;
//
// F64 division
//
-def FDIV641r : NVPTXInst<(outs Float64Regs:$dst),
- (ins f64imm:$a, Float64Regs:$b),
- "rcp.rn.f64 \t$dst, $b;",
- [(set Float64Regs:$dst,
- (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
-def FDIV64rr : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b),
- "div.rn.f64 \t$dst, $a, $b;",
- [(set Float64Regs:$dst,
- (fdiv Float64Regs:$a, Float64Regs:$b))]>;
-def FDIV64ri : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b),
- "div.rn.f64 \t$dst, $a, $b;",
- [(set Float64Regs:$dst,
- (fdiv Float64Regs:$a, fpimm:$b))]>;
+def FDIV641r :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins f64imm:$a, Float64Regs:$b),
+ "rcp.rn.f64 \t$dst, $b;",
+ [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
+def FDIV64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ "div.rn.f64 \t$dst, $a, $b;",
+ [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
+def FDIV64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ "div.rn.f64 \t$dst, $a, $b;",
+ [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
//
// F32 Approximate reciprocal
//
-def FDIV321r_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst,
- (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
-def FDIV321r : NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.f32 \t$dst, $b;",
- [(set Float32Regs:$dst,
- (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
+def FDIV321r_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV321r :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
//
// F32 Approximate division
//
-def FDIV32approxrr_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.approx.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
-def FDIV32approxri_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.approx.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
-def FDIV32approxrr : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.approx.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
-def FDIV32approxri : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.approx.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
+def FDIV32approxrr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.approx.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.approx.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxrr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.approx.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
+def FDIV32approxri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.approx.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
//
// F32 Semi-accurate reciprocal
//
// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
//
-def FDIV321r_approx_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst,
- (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
-def FDIV321r_approx : NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.f32 \t$dst, $b;",
- [(set Float32Regs:$dst,
- (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL]>;
+def FDIV321r_approx_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV321r_approx :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
//
// F32 Semi-accurate division
//
-def FDIV32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.full.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
-def FDIV32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.full.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
-def FDIV32rr : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.full.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL]>;
-def FDIV32ri : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.full.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_FULL]>;
+def FDIV32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.full.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.full.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.full.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
+def FDIV32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.full.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
//
// F32 Accurate reciprocal
//
-def FDIV321r_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.rn.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst,
- (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20, doF32FTZ]>;
-def FDIV321r_prec : NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.rn.f32 \t$dst, $b;",
- [(set Float32Regs:$dst,
- (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20]>;
+def FDIV321r_prec_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.rn.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20, doF32FTZ]>;
+def FDIV321r_prec :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.rn.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20]>;
//
// F32 Accurate division
//
-def FDIV32rr_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.rn.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[doF32FTZ, reqPTX20]>;
-def FDIV32ri_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.rn.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[doF32FTZ, reqPTX20]>;
-def FDIV32rr_prec : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.rn.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20]>;
-def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.rn.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[reqPTX20]>;
+def FDIV32rr_prec_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.rn.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32ri_prec_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.rn.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32rr_prec :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.rn.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20]>;
+def FDIV32ri_prec :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.rn.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[reqPTX20]>;
//
// F32 rsqrt
@@ -857,68 +880,39 @@ def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst),
def RSQRTF32approx1r : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$b),
"rsqrt.approx.f32 \t$dst, $b;", []>;
+// Convert 1.0f/sqrt(x) to rsqrt.approx.f32. (There is an rsqrt.approx.f64, but
+// it's emulated in software.)
def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$b)),
(RSQRTF32approx1r Float32Regs:$b)>,
Requires<[do_DIVF32_FULL, do_SQRTF32_APPROX, doNoF32FTZ]>;
-multiclass FPCONTRACT32<string OpcStr, Predicate Pred> {
- def rrr : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst,
- (fma Float32Regs:$a, Float32Regs:$b, Float32Regs:$c))]>,
- Requires<[Pred]>;
- def rri : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst,
- (fma Float32Regs:$a, Float32Regs:$b, fpimm:$c))]>,
- Requires<[Pred]>;
- def rir : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b, Float32Regs:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst,
- (fma Float32Regs:$a, fpimm:$b, Float32Regs:$c))]>,
- Requires<[Pred]>;
- def rii : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst,
- (fma Float32Regs:$a, fpimm:$b, fpimm:$c))]>,
- Requires<[Pred]>;
+multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
+ def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
+ Requires<[Pred]>;
+ def rri : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, ImmCls:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
+ def rir : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, RC:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
+ Requires<[Pred]>;
+ def rii : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, ImmCls:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
}
-multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
- def rrr : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst,
- (fma Float64Regs:$a, Float64Regs:$b, Float64Regs:$c))]>,
- Requires<[Pred]>;
- def rri : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b, f64imm:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst,
- (fma Float64Regs:$a, Float64Regs:$b, fpimm:$c))]>,
- Requires<[Pred]>;
- def rir : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b, Float64Regs:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst,
- (fma Float64Regs:$a, fpimm:$b, Float64Regs:$c))]>,
- Requires<[Pred]>;
- def rii : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b, f64imm:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst,
- (fma Float64Regs:$a, fpimm:$b, fpimm:$c))]>,
- Requires<[Pred]>;
-}
-
-defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>;
-defm FMA32 : FPCONTRACT32<"fma.rn.f32", true>;
-defm FMA64 : FPCONTRACT64<"fma.rn.f64", true>;
+defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
+defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
+defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
+// sin/cos
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"sin.approx.f32 \t$dst, $src;",
[(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
@@ -926,8 +920,8 @@ def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"cos.approx.f32 \t$dst, $src;",
[(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
-// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y))
-// e.g. "poor man's fmod()"
+// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y)),
+// i.e. "poor man's fmod()"
// frem - f32 FTZ
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
@@ -962,183 +956,152 @@ def : Pat<(frem Float64Regs:$x, fpimm:$y),
fpimm:$y))>;
//-----------------------------------
-// Logical Arithmetic
+// Bitwise operations
//-----------------------------------
-multiclass LOG_FORMAT<string OpcStr, SDNode OpNode> {
- def b1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
- !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
- def b1ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
- !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
- def b16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
- Int16Regs:$b))]>;
- def b16ri: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
- def b32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
- Int32Regs:$b))]>;
- def b32ri: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
- def b64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
- !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
- Int64Regs:$b))]>;
- def b64ri: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
- !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+// Template for three-arg bitwise operations. Takes three args; creates .b16,
+// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
+multiclass BITWISE<string OpcStr, SDNode OpNode> {
+ def b1rr :
+ NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+ !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+ def b1ri :
+ NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+ !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
+ def b16rr :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+ def b16ri :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+ def b32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def b32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def b64rr :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+ def b64ri :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
}
-defm OR : LOG_FORMAT<"or", or>;
-defm AND : LOG_FORMAT<"and", and>;
-defm XOR : LOG_FORMAT<"xor", xor>;
+defm OR : BITWISE<"or", or>;
+defm AND : BITWISE<"and", and>;
+defm XOR : BITWISE<"xor", xor>;
-def NOT1: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
+def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
"not.pred \t$dst, $src;",
[(set Int1Regs:$dst, (not Int1Regs:$src))]>;
-def NOT16: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
"not.b16 \t$dst, $src;",
[(set Int16Regs:$dst, (not Int16Regs:$src))]>;
-def NOT32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
"not.b32 \t$dst, $src;",
[(set Int32Regs:$dst, (not Int32Regs:$src))]>;
-def NOT64: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "not.b64 \t$dst, $src;",
- [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
-
-// For shifts, the second src operand must be 32-bit value
-multiclass LSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
- def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
- Int32Regs:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
- Int32Regs:$b))]>;
- def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
- (i32 imm:$b)))]>;
- def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
- Int32Regs:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
- Int32Regs:$b))]>;
- def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
- (i32 imm:$b)))]>;
- def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 imm:$a),
- (i32 imm:$b)))]>;
- def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
- Int32Regs:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
- Int32Regs:$b))]>;
- def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
- (i32 imm:$b)))]>;
-}
+def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "not.b64 \t$dst, $src;",
+ [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
-defm SHL : LSHIFT_FORMAT<"shl.b", shl>;
-
-// For shifts, the second src operand must be 32-bit value
-// Need to add cvt for the 8-bits.
-multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
- def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
- Int32Regs:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
- Int32Regs:$b))]>;
- def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
- (i32 imm:$b)))]>;
- def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
- Int32Regs:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
- Int32Regs:$b))]>;
- def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
- (i32 imm:$b)))]>;
- def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 imm:$a),
- (i32 imm:$b)))]>;
- def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
- Int32Regs:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
- Int32Regs:$b))]>;
- def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
- (i32 imm:$b)))]>;
+// Template for left/right shifts. Takes three operands,
+// [dest (reg), src (reg), shift (reg or imm)].
+// dest and src may be int64, int32, or int16, but shift is always int32.
+//
+// This template also defines a 32-bit shift (imm, imm) instruction.
+multiclass SHIFT<string OpcStr, SDNode OpNode> {
+ def i64rr :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
+ def i64ri :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
+ def i32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def i32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
+ def i32ii :
+ NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
+ def i16rr :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
+ def i16ri :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
}
-defm SRA : RSHIFT_FORMAT<"shr.s", sra>;
-defm SRL : RSHIFT_FORMAT<"shr.u", srl>;
+defm SHL : SHIFT<"shl.b", shl>;
+defm SRA : SHIFT<"shr.s", sra>;
+defm SRL : SHIFT<"shr.u", srl>;
//
-// Rotate: use ptx shf instruction if available.
+// Rotate: Use ptx shf instruction if available.
//
// 32 bit r2 = rotl r1, n
// =>
// r2 = shf.l r1, r1, n
-def ROTL32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$src, i32imm:$amt),
- "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
- Requires<[hasHWROT32]> ;
-
-def ROTL32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$src, Int32Regs:$amt),
- "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[hasHWROT32]>;
+def ROTL32imm_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+def ROTL32reg_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]>;
// 32 bit r2 = rotr r1, n
// =>
// r2 = shf.r r1, r1, n
-def ROTR32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$src, i32imm:$amt),
- "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
- Requires<[hasHWROT32]>;
-
-def ROTR32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$src, Int32Regs:$amt),
- "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[hasHWROT32]>;
-
-//
-// Rotate: if ptx shf instruction is not available, then use shift+add
-//
-// 32bit
-def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
- !strconcat("{{\n\t",
- !strconcat(".reg .b32 %lhs;\n\t",
- !strconcat(".reg .b32 %rhs;\n\t",
- !strconcat("shl.b32 \t%lhs, $src, $amt1;\n\t",
- !strconcat("shr.b32 \t%rhs, $src, $amt2;\n\t",
- !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
- !strconcat("}}", ""))))))),
- []>;
+def ROTR32imm_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+def ROTR32reg_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]>;
+
+// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1.
+def ROT32imm_sw :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ "shl.b32 \t%lhs, $src, $amt1;\n\t"
+ "shr.b32 \t%rhs, $src, $amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ []>;
def SUB_FRM_32 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(32-N->getZExtValue(), SDLoc(N), MVT::i32);
+ return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;
def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
@@ -1148,45 +1111,48 @@ def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
(ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
Requires<[noHWROT32]>;
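// Illustrative sketch of the shift+add expansion for the rotr pattern above
// with an amount of 5, i.e. $amt1 = 27 and $amt2 = 5 (placeholder registers):
//   shl.b32 \t%lhs, %x, 27;
//   shr.b32 \t%rhs, %x, 5;
//   add.u32 \t%res, %lhs, %rhs;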
-def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
- Int32Regs:$amt),
- !strconcat("{{\n\t",
- !strconcat(".reg .b32 %lhs;\n\t",
- !strconcat(".reg .b32 %rhs;\n\t",
- !strconcat(".reg .b32 %amt2;\n\t",
- !strconcat("shl.b32 \t%lhs, $src, $amt;\n\t",
- !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t",
- !strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t",
- !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
- !strconcat("}}", ""))))))))),
- [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[noHWROT32]>;
-
-def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
- Int32Regs:$amt),
- !strconcat("{{\n\t",
- !strconcat(".reg .b32 %lhs;\n\t",
- !strconcat(".reg .b32 %rhs;\n\t",
- !strconcat(".reg .b32 %amt2;\n\t",
- !strconcat("shr.b32 \t%lhs, $src, $amt;\n\t",
- !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t",
- !strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t",
- !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
- !strconcat("}}", ""))))))))),
- [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[noHWROT32]>;
-
-// 64bit
-def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
- i32imm:$amt1, i32imm:$amt2),
- !strconcat("{{\n\t",
- !strconcat(".reg .b64 %lhs;\n\t",
- !strconcat(".reg .b64 %rhs;\n\t",
- !strconcat("shl.b64 \t%lhs, $src, $amt1;\n\t",
- !strconcat("shr.b64 \t%rhs, $src, $amt2;\n\t",
- !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
- !strconcat("}}", ""))))))),
- []>;
+// 32-bit software rotate left by register.
+def ROTL32reg_sw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ ".reg .b32 %amt2;\n\t"
+ "shl.b32 \t%lhs, $src, $amt;\n\t"
+ "sub.s32 \t%amt2, 32, $amt;\n\t"
+ "shr.b32 \t%rhs, $src, %amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[noHWROT32]>;
+
+// 32-bit software rotate right by register.
+def ROTR32reg_sw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ ".reg .b32 %amt2;\n\t"
+ "shr.b32 \t%lhs, $src, $amt;\n\t"
+ "sub.s32 \t%amt2, 32, $amt;\n\t"
+ "shl.b32 \t%rhs, $src, %amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[noHWROT32]>;
+
+// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1.
+def ROT64imm_sw :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ "shl.b64 \t%lhs, $src, $amt1;\n\t"
+ "shr.b64 \t%rhs, $src, $amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ []>;
def SUB_FRM_64 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
@@ -1197,37 +1163,70 @@ def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
(ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
-def ROTL64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
- Int32Regs:$amt),
- !strconcat("{{\n\t",
- !strconcat(".reg .b64 %lhs;\n\t",
- !strconcat(".reg .b64 %rhs;\n\t",
- !strconcat(".reg .u32 %amt2;\n\t",
- !strconcat("shl.b64 \t%lhs, $src, $amt;\n\t",
- !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t",
- !strconcat("shr.b64 \t%rhs, $src, %amt2;\n\t",
- !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
- !strconcat("}}", ""))))))))),
- [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
-
-def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
- Int32Regs:$amt),
- !strconcat("{{\n\t",
- !strconcat(".reg .b64 %lhs;\n\t",
- !strconcat(".reg .b64 %rhs;\n\t",
- !strconcat(".reg .u32 %amt2;\n\t",
- !strconcat("shr.b64 \t%lhs, $src, $amt;\n\t",
- !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t",
- !strconcat("shl.b64 \t%rhs, $src, %amt2;\n\t",
- !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
- !strconcat("}}", ""))))))))),
- [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+// 64-bit software rotate left by register.
+def ROTL64reg_sw :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ ".reg .u32 %amt2;\n\t"
+ "shl.b64 \t%lhs, $src, $amt;\n\t"
+ "sub.u32 \t%amt2, 64, $amt;\n\t"
+ "shr.b64 \t%rhs, $src, %amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
+
+def ROTR64reg_sw :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ ".reg .u32 %amt2;\n\t"
+ "shr.b64 \t%lhs, $src, $amt;\n\t"
+ "sub.u32 \t%amt2, 64, $amt;\n\t"
+ "shl.b64 \t%rhs, $src, %amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+
+//
+// Funnel shift in clamp mode
+//
+
+// Create SDNodes so they can be used in the DAG code, e.g.
+// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
+def SDTIntShiftDOp :
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisInt<3>]>;
+def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
+def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
+
+def FUNSHFLCLAMP :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
+def FUNSHFRCLAMP :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
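+// Illustrative sketch of the PTX emitted by these (placeholder registers):
+//   shf.l.clamp.b32 \t%d, %lo, %hi, %amt;
+// i.e. %d = high 32 bits of ({%hi, %lo} << %amt), with %amt clamped to [0, 32].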
+
+//
// BFE - bit-field extract
+//
+// Template for BFE instructions. Takes four args,
+// [dest (reg), src (reg), start (reg or imm), end (reg or imm)].
+// Start may be an imm only if end is also an imm. FIXME: Is this a
+// restriction in PTX?
+//
+// dest and src may be int32 or int64, but start and end are always int32.
multiclass BFE<string TyStr, RegisterClass RC> {
- // BFE supports both 32-bit and 64-bit values, but the start and length
- // operands are always 32-bit
def rrr
: NVPTXInst<(outs RC:$d),
(ins RC:$a, Int32Regs:$b, Int32Regs:$c),
@@ -1242,29 +1241,35 @@ multiclass BFE<string TyStr, RegisterClass RC> {
!strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
}
-defm BFE_S32 : BFE<"s32", Int32Regs>;
-defm BFE_U32 : BFE<"u32", Int32Regs>;
-defm BFE_S64 : BFE<"s64", Int64Regs>;
-defm BFE_U64 : BFE<"u64", Int64Regs>;
+let hasSideEffects = 0 in {
+ defm BFE_S32 : BFE<"s32", Int32Regs>;
+ defm BFE_U32 : BFE<"u32", Int32Regs>;
+ defm BFE_S64 : BFE<"s64", Int64Regs>;
+ defm BFE_U64 : BFE<"u64", Int64Regs>;
+}
//-----------------------------------
-// General Comparison
+// Comparison instructions (setp, set)
//-----------------------------------
-// General setp instructions
-multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
- def rr : NVPTXInst<(outs Int1Regs:$dst),
- (ins RC:$a, RC:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
- []>;
- def ri : NVPTXInst<(outs Int1Regs:$dst),
- (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
- []>;
- def ir : NVPTXInst<(outs Int1Regs:$dst),
- (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
- []>;
+// FIXME: This doesn't cover versions of set and setp that combine with a
+// boolean predicate, e.g. setp.eq.and.b16.
+
+let hasSideEffects = 0 in {
+ multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr :
+ NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+ "\t$dst, $a, $b;"), []>;
+ def ri :
+ NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+ "\t$dst, $a, $b;"), []>;
+ def ir :
+ NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+ "\t$dst, $a, $b;"), []>;
+ }
}
defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
@@ -1279,17 +1284,22 @@ defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
-// General set instructions
-multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
- def rr : NVPTXInst<(outs Int32Regs:$dst),
- (ins RC:$a, RC:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
- def ri : NVPTXInst<(outs Int32Regs:$dst),
- (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
- def ir : NVPTXInst<(outs Int32Regs:$dst),
- (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+// FIXME: This doesn't appear to be correct. The "set" mnemonic has the form
+// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
+// reg, either u32, s32, or f32. Anyway these aren't used at the moment.
+
+let hasSideEffects = 0 in {
+ multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr : NVPTXInst<(outs Int32Regs:$dst),
+ (ins RC:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+ def ri : NVPTXInst<(outs Int32Regs:$dst),
+ (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+ def ir : NVPTXInst<(outs Int32Regs:$dst),
+ (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+ }
}
defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
@@ -1305,45 +1315,56 @@ defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
//-----------------------------------
-// General Selection
+// Selection instructions (selp)
//-----------------------------------
-// General selp instructions
-multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
- def rr : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
- def ri : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
- def ir : NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
- def ii : NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
-}
+// FIXME: Missing slct
-multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
- SDNode ImmNode> {
- def rr : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
- def ri : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
- def ir : NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
- def ii : NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
+// selp instructions that don't have any pattern matches; we explicitly use
+// them within this file.
+let hasSideEffects = 0 in {
+ multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ def ri : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ def ir : NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ def ii : NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ }
+
+ multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
+ SDNode ImmNode> {
+ def rr :
+ NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
+ def ri :
+ NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
+ def ir :
+ NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
+ def ii :
+ NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
+ }
}
+// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
+// good.
defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
@@ -1356,40 +1377,14 @@ defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
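// Illustrative sketch: an IR-level 'select i1 %p, i16 %a, i16 %b' matches the
// SELP_b16rr pattern above and prints as (placeholder register names):
//   selp.b16 \t%r, %ra, %rb, %p;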
-//
-// Funnnel shift in clamp mode
-//
-// - SDNodes are created so they can be used in the DAG code,
-// e.g. NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
-//
-def SDTIntShiftDOp: SDTypeProfile<1, 3,
- [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
- SDTCisInt<0>, SDTCisInt<3>]>;
-def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
-def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
-
-def FUNSHFLCLAMP : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
- [(set Int32Regs:$dst,
- (FUN_SHFL_CLAMP Int32Regs:$lo,
- Int32Regs:$hi, Int32Regs:$amt))]>;
-
-def FUNSHFRCLAMP : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
- [(set Int32Regs:$dst,
- (FUN_SHFR_CLAMP Int32Regs:$lo,
- Int32Regs:$hi, Int32Regs:$amt))]>;
-
//-----------------------------------
// Data Movement (Load / Store, Move)
//-----------------------------------
def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
- [SDNPWantRoot]>;
+ [SDNPWantRoot]>;
def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
- [SDNPWantRoot]>;
+ [SDNPWantRoot]>;
def MEMri : Operand<i32> {
let PrintMethod = "printMemOperand";
@@ -1401,82 +1396,83 @@ def MEMri64 : Operand<i64> {
}
def imem : Operand<iPTR> {
- let PrintMethod = "printOperand";
+ let PrintMethod = "printOperand";
}
def imemAny : Operand<iPTRAny> {
- let PrintMethod = "printOperand";
+ let PrintMethod = "printOperand";
}
def LdStCode : Operand<i32> {
- let PrintMethod = "printLdStCode";
+ let PrintMethod = "printLdStCode";
}
def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
+// Load a memory address into a u32 or u64 register.
def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
- "mov.u32 \t$dst, $a;",
- [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
-
+ "mov.u32 \t$dst, $a;",
+ [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
- "mov.u64 \t$dst, $a;",
- [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+ "mov.u64 \t$dst, $a;",
+ [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
-// Get pointer to local stack
-def MOV_DEPOT_ADDR
- : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
- "mov.u32 \t$d, __local_depot$num;", []>;
-def MOV_DEPOT_ADDR_64
- : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
- "mov.u64 \t$d, __local_depot$num;", []>;
+// Get pointer to local stack.
+let hasSideEffects = 0 in {
+ def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
+ "mov.u32 \t$d, __local_depot$num;", []>;
+ def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
+ "mov.u64 \t$d, __local_depot$num;", []>;
+}
// copyPhysReg is hard-coded in NVPTXInstrInfo.cpp
-let IsSimpleMove=1 in {
-def IMOV1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
- "mov.pred \t$dst, $sss;", []>;
-def IMOV16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
- "mov.u16 \t$dst, $sss;", []>;
-def IMOV32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
- "mov.u32 \t$dst, $sss;", []>;
-def IMOV64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
- "mov.u64 \t$dst, $sss;", []>;
-
-def FMOV32rr: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
- "mov.f32 \t$dst, $src;", []>;
-def FMOV64rr: NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
- "mov.f64 \t$dst, $src;", []>;
+let IsSimpleMove=1, hasSideEffects=0 in {
+ def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
+ "mov.pred \t$dst, $sss;", []>;
+ def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
+ "mov.u16 \t$dst, $sss;", []>;
+ def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
+ "mov.u32 \t$dst, $sss;", []>;
+ def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
+ "mov.u64 \t$dst, $sss;", []>;
+
+ def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "mov.f32 \t$dst, $src;", []>;
+ def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
+ "mov.f64 \t$dst, $src;", []>;
}
-def IMOV1ri: NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
- "mov.pred \t$dst, $src;",
- [(set Int1Regs:$dst, imm:$src)]>;
-def IMOV16ri: NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
- "mov.u16 \t$dst, $src;",
- [(set Int16Regs:$dst, imm:$src)]>;
-def IMOV32ri: NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
- "mov.u32 \t$dst, $src;",
- [(set Int32Regs:$dst, imm:$src)]>;
-def IMOV64i: NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
- "mov.u64 \t$dst, $src;",
- [(set Int64Regs:$dst, imm:$src)]>;
-
-def FMOV32ri: NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
- "mov.f32 \t$dst, $src;",
- [(set Float32Regs:$dst, fpimm:$src)]>;
-def FMOV64ri: NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
- "mov.f64 \t$dst, $src;",
- [(set Float64Regs:$dst, fpimm:$src)]>;
+
+def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
+ "mov.pred \t$dst, $src;",
+ [(set Int1Regs:$dst, imm:$src)]>;
+def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
+ "mov.u16 \t$dst, $src;",
+ [(set Int16Regs:$dst, imm:$src)]>;
+def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
+ "mov.u32 \t$dst, $src;",
+ [(set Int32Regs:$dst, imm:$src)]>;
+def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
+ "mov.u64 \t$dst, $src;",
+ [(set Int64Regs:$dst, imm:$src)]>;
+
+def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
+ "mov.f32 \t$dst, $src;",
+ [(set Float32Regs:$dst, fpimm:$src)]>;
+def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
+ "mov.f64 \t$dst, $src;",
+ [(set Float64Regs:$dst, fpimm:$src)]>;
def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
//---- Copy Frame Index ----
-def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
- "add.u32 \t$dst, ${addr:add};",
- [(set Int32Regs:$dst, ADDRri:$addr)]>;
+def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
+ "add.u32 \t$dst, ${addr:add};",
+ [(set Int32Regs:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
- "add.u64 \t$dst, ${addr:add};",
- [(set Int64Regs:$dst, ADDRri64:$addr)]>;
+ "add.u64 \t$dst, ${addr:add};",
+ [(set Int64Regs:$dst, ADDRri64:$addr)]>;
//-----------------------------------
// Comparison and Selection
@@ -1554,7 +1550,7 @@ multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
SET_s16rr, SET_s16ri, SET_s16ir,
SET_s32rr, SET_s32ri, SET_s32ir,
SET_s64rr, SET_s64ri, SET_s64ir> {
- // TableGen doesn't like empty multiclasses
+ // TableGen doesn't like empty multiclasses.
def : PatLeaf<(i32 0)>;
}
@@ -1566,21 +1562,21 @@ multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
SET_u16rr, SET_u16ri, SET_u16ir,
SET_u32rr, SET_u32ri, SET_u32ir,
SET_u64rr, SET_u64ri, SET_u64ir> {
- // TableGen doesn't like empty multiclasses
+ // TableGen doesn't like empty multiclasses.
def : PatLeaf<(i32 0)>;
}
defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
-defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
-defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
-defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
-defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
-defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
+defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
+defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
+defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
+defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
+defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
// i1 compares
@@ -1678,13 +1674,14 @@ defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
-//def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
-// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+// FIXME: What is this doing here? Can it be deleted?
+// def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
+// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-def SDTDeclareParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
- SDTCisInt<2>]>;
-def SDTDeclareScalarParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>,
- SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTDeclareParamProfile :
+ SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTDeclareScalarParamProfile :
+ SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
@@ -1704,185 +1701,200 @@ def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
-def DeclareParam : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareScalarParam : SDNode<"NVPTXISD::DeclareScalarParam",
- SDTDeclareScalarParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareRetParam : SDNode<"NVPTXISD::DeclareRetParam",
- SDTDeclareParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareRet : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def LoadParam : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV2 : SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV4 : SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def PrintCall : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def PrintCallUni : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParam : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV2 : SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV4 : SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamU32 : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamS32 : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallArgBegin : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallArg : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def LastCallArg : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallArgEnd : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallVoid : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def Prototype : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallVal : SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile,
- []>;
-def StoreRetval : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
- [SDNPHasChain, SDNPSideEffect]>;
-def StoreRetvalV2 : SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
- [SDNPHasChain, SDNPSideEffect]>;
-def StoreRetvalV4 : SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
- [SDNPHasChain, SDNPSideEffect]>;
-def PseudoUseParam : SDNode<"NVPTXISD::PseudoUseParam",
- SDTPseudoUseParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def RETURNNode : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
- [SDNPHasChain, SDNPSideEffect]>;
-
-class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
- !strconcat(!strconcat("ld.param", opstr),
- "\t$dst, [retval0+$b];"),
- []>;
+def DeclareParam :
+ SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareScalarParam :
+ SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRetParam :
+ SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRet :
+ SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LoadParam :
+ SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
+ [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV2 :
+ SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
+ [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV4 :
+ SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
+ [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def PrintCall :
+ SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintConvergentCall :
+ SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintCallUni :
+ SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintConvergentCallUni :
+ SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParam :
+ SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV2 :
+ SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV4 :
+ SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamU32 :
+ SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamS32 :
+ SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgBegin :
+ SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArg :
+ SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LastCallArg :
+ SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgEnd :
+ SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVoid :
+ SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def Prototype :
+ SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVal :
+ SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def MoveParam :
+ SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
+def StoreRetval :
+ SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV2 :
+ SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV4 :
+ SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def PseudoUseParam :
+ SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def RETURNNode :
+ SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+let mayLoad = 1 in {
+ class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+ !strconcat(!strconcat("ld.param", opstr),
+ "\t$dst, [retval0+$b];"),
+ []>;
+
+ class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
+ !strconcat("ld.param.v2", opstr,
+ "\t{{$dst, $dst2}}, [retval0+$b];"), []>;
+
+ class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4),
+ (ins i32imm:$b),
+ !strconcat("ld.param.v4", opstr,
+ "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
+ []>;
+}
class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
- !strconcat(!strconcat("mov", opstr),
- "\t$dst, retval$b;"),
+ !strconcat("mov", opstr, "\t$dst, retval$b;"),
[(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
-class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
- !strconcat(!strconcat("ld.param.v2", opstr),
- "\t{{$dst, $dst2}}, [retval0+$b];"), []>;
-
-class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
- (ins i32imm:$b),
- !strconcat(!strconcat("ld.param.v4", opstr),
- "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"), []>;
-
-class StoreParamInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
- !strconcat(!strconcat("st.param", opstr),
- "\t[param$a+$b], $val;"),
- []>;
-
-class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
- i32imm:$a, i32imm:$b),
- !strconcat(!strconcat("st.param.v2", opstr),
- "\t[param$a+$b], {{$val, $val2}};"),
- []>;
-
-class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, regclass:$val1, regclass:$val2,
- regclass:$val3, i32imm:$a, i32imm:$b),
- !strconcat(!strconcat("st.param.v4", opstr),
- "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
- []>;
-
-class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
- !strconcat(!strconcat("st.param", opstr),
- "\t[func_retval0+$a], $val;"),
- []>;
-
-class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
- !strconcat(!strconcat("st.param.v2", opstr),
- "\t[func_retval0+$a], {{$val, $val2}};"),
- []>;
-
-class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs),
- (ins regclass:$val, regclass:$val2, regclass:$val3,
- regclass:$val4, i32imm:$a),
- !strconcat(!strconcat("st.param.v4", opstr),
- "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
- []>;
-
-def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
-"call (retval0), ",
- [(PrintCall (i32 1))]>;
-def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
-"call (retval0, retval1), ",
- [(PrintCall (i32 2))]>;
-def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
-"call (retval0, retval1, retval2), ",
- [(PrintCall (i32 3))]>;
-def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
-"call (retval0, retval1, retval2, retval3), ",
- [(PrintCall (i32 4))]>;
-def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
-"call (retval0, retval1, retval2, retval3, retval4), ",
- [(PrintCall (i32 5))]>;
-def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
-"call (retval0, retval1, retval2, retval3, retval4, retval5), ",
- [(PrintCall (i32 6))]>;
-def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
-"call (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ",
- [(PrintCall (i32 7))]>;
-def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
-!strconcat("call (retval0, retval1, retval2, retval3, retval4",
- ", retval5, retval6, retval7), "),
- [(PrintCall (i32 8))]>;
-
-def PrintCallNoRetInst : NVPTXInst<(outs), (ins), "call ",
- [(PrintCall (i32 0))]>;
-
-def PrintCallUniRetInst1 : NVPTXInst<(outs), (ins),
-"call.uni (retval0), ",
- [(PrintCallUni (i32 1))]>;
-def PrintCallUniRetInst2 : NVPTXInst<(outs), (ins),
-"call.uni (retval0, retval1), ",
- [(PrintCallUni (i32 2))]>;
-def PrintCallUniRetInst3 : NVPTXInst<(outs), (ins),
-"call.uni (retval0, retval1, retval2), ",
- [(PrintCallUni (i32 3))]>;
-def PrintCallUniRetInst4 : NVPTXInst<(outs), (ins),
-"call.uni (retval0, retval1, retval2, retval3), ",
- [(PrintCallUni (i32 4))]>;
-def PrintCallUniRetInst5 : NVPTXInst<(outs), (ins),
-"call.uni (retval0, retval1, retval2, retval3, retval4), ",
- [(PrintCallUni (i32 5))]>;
-def PrintCallUniRetInst6 : NVPTXInst<(outs), (ins),
-"call.uni (retval0, retval1, retval2, retval3, retval4, retval5), ",
- [(PrintCallUni (i32 6))]>;
-def PrintCallUniRetInst7 : NVPTXInst<(outs), (ins),
-"call.uni (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ",
- [(PrintCallUni (i32 7))]>;
-def PrintCallUniRetInst8 : NVPTXInst<(outs), (ins),
-!strconcat("call.uni (retval0, retval1, retval2, retval3, retval4",
- ", retval5, retval6, retval7), "),
- [(PrintCallUni (i32 8))]>;
-
-def PrintCallUniNoRetInst : NVPTXInst<(outs), (ins), "call.uni ",
- [(PrintCallUni (i32 0))]>;
+let mayStore = 1 in {
+ class StoreParamInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+ !strconcat("st.param", opstr, "\t[param$a+$b], $val;"),
+ []>;
+
+ class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
+ i32imm:$a, i32imm:$b),
+ !strconcat("st.param.v2", opstr,
+ "\t[param$a+$b], {{$val, $val2}};"),
+ []>;
+
+ class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3,
+ regclass:$val4, i32imm:$a,
+ i32imm:$b),
+ !strconcat("st.param.v4", opstr,
+ "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
+ []>;
+
+ class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
+ !strconcat("st.param", opstr, "\t[func_retval0+$a], $val;"),
+ []>;
+
+ class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
+ !strconcat("st.param.v2", opstr,
+ "\t[func_retval0+$a], {{$val, $val2}};"),
+ []>;
+
+ class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$val, regclass:$val2, regclass:$val3,
+ regclass:$val4, i32imm:$a),
+ !strconcat("st.param.v4", opstr,
+ "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
+ []>;
+}
+
+let isCall=1 in {
+ multiclass CALL<string OpcStr, SDNode OpNode> {
+ def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " "), [(OpNode (i32 0))]>;
+ def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>;
+ def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>;
+ def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>;
+ def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
+ [(OpNode (i32 4))]>;
+ def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "),
+ [(OpNode (i32 5))]>;
+ def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+ "retval5), "),
+ [(OpNode (i32 6))]>;
+ def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+ "retval5, retval6), "),
+ [(OpNode (i32 7))]>;
+ def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+ "retval5, retval6, retval7), "),
+ [(OpNode (i32 8))]>;
+ }
+}
+
+defm Call : CALL<"call", PrintCall>;
+defm CallUni : CALL<"call.uni", PrintCallUni>;
+
+// Convergent call instructions. These are identical to regular calls, except
+// they have the isConvergent bit set.
+let isConvergent=1 in {
+ defm ConvergentCall : CALL<"call", PrintConvergentCall>;
+ defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
+}
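+// Illustrative sketch: the asm printer stitches these call fragments together
+// into one PTX call; for a hypothetical callee 'foo' taking a single .param,
+// the result looks roughly like:
+//   call.uni (retval0), foo, (param0);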
def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">;
def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">;
@@ -1911,39 +1923,15 @@ def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">;
def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">;
def StoreParamV2I8 : StoreParamV2Inst<Int16Regs, ".b8">;
-// FIXME: StoreParamV4Inst crashes llvm-tblgen :(
-//def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">;
-def StoreParamV4I32 : NVPTXInst<(outs), (ins Int32Regs:$val, Int32Regs:$val2,
- Int32Regs:$val3, Int32Regs:$val4,
- i32imm:$a, i32imm:$b),
- "st.param.v4.b32\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
- []>;
-
-def StoreParamV4I16 : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2,
- Int16Regs:$val3, Int16Regs:$val4,
- i32imm:$a, i32imm:$b),
- "st.param.v4.b16\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
- []>;
-
-def StoreParamV4I8 : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2,
- Int16Regs:$val3, Int16Regs:$val4,
- i32imm:$a, i32imm:$b),
- "st.param.v4.b8\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
- []>;
-
-def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">;
-def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">;
+def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">;
+def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">;
+def StoreParamV4I8 : StoreParamV4Inst<Int16Regs, ".b8">;
+
+def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">;
+def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">;
def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">;
def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">;
-// FIXME: StoreParamV4Inst crashes llvm-tblgen :(
-//def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
-def StoreParamV4F32 : NVPTXInst<(outs),
- (ins Float32Regs:$val, Float32Regs:$val2,
- Float32Regs:$val3, Float32Regs:$val4,
- i32imm:$a, i32imm:$b),
- "st.param.v4.f32\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
- []>;
-
+def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
@@ -1969,89 +1957,88 @@ def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
class CallArgInst<NVPTXRegClass regclass> :
- NVPTXInst<(outs), (ins regclass:$a), "$a, ",
- [(CallArg (i32 0), regclass:$a)]>;
+ NVPTXInst<(outs), (ins regclass:$a), "$a, ",
+ [(CallArg (i32 0), regclass:$a)]>;
class LastCallArgInst<NVPTXRegClass regclass> :
- NVPTXInst<(outs), (ins regclass:$a), "$a",
- [(LastCallArg (i32 0), regclass:$a)]>;
+ NVPTXInst<(outs), (ins regclass:$a), "$a",
+ [(LastCallArg (i32 0), regclass:$a)]>;
def CallArgI64 : CallArgInst<Int64Regs>;
def CallArgI32 : CallArgInst<Int32Regs>;
def CallArgI16 : CallArgInst<Int16Regs>;
-
def CallArgF64 : CallArgInst<Float64Regs>;
def CallArgF32 : CallArgInst<Float32Regs>;
def LastCallArgI64 : LastCallArgInst<Int64Regs>;
def LastCallArgI32 : LastCallArgInst<Int32Regs>;
def LastCallArgI16 : LastCallArgInst<Int16Regs>;
-
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
def LastCallArgF32 : LastCallArgInst<Float32Regs>;
def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
[(CallArg (i32 0), (i32 imm:$a))]>;
def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
- [(LastCallArg (i32 0), (i32 imm:$a))]>;
+ [(LastCallArg (i32 0), (i32 imm:$a))]>;
def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
[(CallArg (i32 1), (i32 imm:$a))]>;
def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
- [(LastCallArg (i32 1), (i32 imm:$a))]>;
-
-def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr),
- "$addr, ",
- [(CallVoid (Wrapper tglobaladdr:$addr))]>;
-def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr),
- "$addr, ",
- [(CallVoid Int32Regs:$addr)]>;
-def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
- "$addr, ",
- [(CallVoid Int64Regs:$addr)]>;
-def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val),
- ", prototype_$val;",
- [(Prototype (i32 imm:$val))]>;
-
-def DeclareRetMemInst : NVPTXInst<(outs),
- (ins i32imm:$align, i32imm:$size, i32imm:$num),
- ".param .align $align .b8 retval$num[$size];",
- [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
-def DeclareRetScalarInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
- ".param .b$size retval$num;",
- [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
-def DeclareRetRegInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
- ".reg .b$size retval$num;",
- [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
-
-def DeclareParamInst : NVPTXInst<(outs),
- (ins i32imm:$align, i32imm:$a, i32imm:$size),
- ".param .align $align .b8 param$a[$size];",
- [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
-def DeclareScalarParamInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
- ".param .b$size param$a;",
- [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
-def DeclareScalarRegInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
- ".reg .b$size param$a;",
- [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
+ [(LastCallArg (i32 1), (i32 imm:$a))]>;
+
+def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
+ [(CallVoid (Wrapper tglobaladdr:$addr))]>;
+def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
+ [(CallVoid Int32Regs:$addr)]>;
+def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
+ [(CallVoid Int64Regs:$addr)]>;
+def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
+ [(Prototype (i32 imm:$val))]>;
+
+def DeclareRetMemInst :
+ NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
+ ".param .align $align .b8 retval$num[$size];",
+ [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetScalarInst :
+ NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+ ".param .b$size retval$num;",
+ [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetRegInst :
+ NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+ ".reg .b$size retval$num;",
+ [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
+
+def DeclareParamInst :
+ NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
+ ".param .align $align .b8 param$a[$size];",
+ [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
+def DeclareScalarParamInst :
+ NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+ ".param .b$size param$a;",
+ [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
+def DeclareScalarRegInst :
+ NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+ ".reg .b$size param$a;",
+ [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
- NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
- !strconcat(!strconcat("mov", asmstr), "\t$dst, $src;"),
- [(set regclass:$dst, (MoveParam regclass:$src))]>;
+ NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
+ !strconcat("mov", asmstr, "\t$dst, $src;"),
+ [(set regclass:$dst, (MoveParam regclass:$src))]>;
def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
-def MoveParamI16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "cvt.u16.u32\t$dst, $src;",
- [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
+def MoveParamI16 :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "cvt.u16.u32\t$dst, $src;",
+ [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
class PseudoUseParamInst<NVPTXRegClass regclass> :
- NVPTXInst<(outs), (ins regclass:$src),
- "// Pseudo use of $src",
- [(PseudoUseParam regclass:$src)]>;
+ NVPTXInst<(outs), (ins regclass:$src),
+ "// Pseudo use of $src",
+ [(PseudoUseParam regclass:$src)]>;
def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
@@ -2064,254 +2051,278 @@ def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
// Load / Store Handling
//
multiclass LD<NVPTXRegClass regclass> {
- def _avar : NVPTXInst<(outs regclass:$dst),
+ def _avar : NVPTXInst<
+ (outs regclass:$dst),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
-!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t$dst, [$addr];"), []>;
- def _areg : NVPTXInst<(outs regclass:$dst),
+ i32imm:$fromWidth, imem:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr];", []>;
+ def _areg : NVPTXInst<
+ (outs regclass:$dst),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
-!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t$dst, [$addr];"), []>;
- def _areg_64 : NVPTXInst<(outs regclass:$dst),
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr];", []>;
+ def _areg_64 : NVPTXInst<
+ (outs regclass:$dst),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth",
- " \t$dst, [$addr];"), []>;
- def _ari : NVPTXInst<(outs regclass:$dst),
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr];", []>;
+ def _ari : NVPTXInst<
+ (outs regclass:$dst),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
-!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t$dst, [$addr+$offset];"), []>;
- def _ari_64 : NVPTXInst<(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth",
- " \t$dst, [$addr+$offset];"), []>;
- def _asi : NVPTXInst<(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
-!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t$dst, [$addr+$offset];"), []>;
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr+$offset];", []>;
+ def _ari_64 : NVPTXInst<
+ (outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr+$offset];", []>;
+ def _asi : NVPTXInst<
+ (outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr+$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
-defm LD_i8 : LD<Int16Regs>;
-defm LD_i16 : LD<Int16Regs>;
-defm LD_i32 : LD<Int32Regs>;
-defm LD_i64 : LD<Int64Regs>;
-defm LD_f32 : LD<Float32Regs>;
-defm LD_f64 : LD<Float64Regs>;
+ defm LD_i8 : LD<Int16Regs>;
+ defm LD_i16 : LD<Int16Regs>;
+ defm LD_i32 : LD<Int32Regs>;
+ defm LD_i64 : LD<Int64Regs>;
+ defm LD_f32 : LD<Float32Regs>;
+ defm LD_f64 : LD<Float64Regs>;
}
multiclass ST<NVPTXRegClass regclass> {
- def _avar : NVPTXInst<(outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
-!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
- " \t[$addr], $src;"), []>;
- def _areg : NVPTXInst<(outs),
+ def _avar : NVPTXInst<
+ (outs),
(ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
-!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
- " \t[$addr], $src;"), []>;
- def _areg_64 : NVPTXInst<(outs),
+ LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr], $src;", []>;
+ def _areg : NVPTXInst<
+ (outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr], $src;", []>;
+ def _areg_64 : NVPTXInst<
+ (outs),
(ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ",
- "\t[$addr], $src;"), []>;
- def _ari : NVPTXInst<(outs),
+ LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr], $src;", []>;
+ def _ari : NVPTXInst<
+ (outs),
(ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
-!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
- " \t[$addr+$offset], $src;"), []>;
- def _ari_64 : NVPTXInst<(outs),
+ LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr+$offset], $src;", []>;
+ def _ari_64 : NVPTXInst<
+ (outs),
(ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ",
- "\t[$addr+$offset], $src;"), []>;
- def _asi : NVPTXInst<(outs),
+ LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr+$offset], $src;", []>;
+ def _asi : NVPTXInst<
+ (outs),
(ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
-!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
- " \t[$addr+$offset], $src;"), []>;
+ LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr+$offset], $src;", []>;
}
let mayStore=1, hasSideEffects=0 in {
-defm ST_i8 : ST<Int16Regs>;
-defm ST_i16 : ST<Int16Regs>;
-defm ST_i32 : ST<Int32Regs>;
-defm ST_i64 : ST<Int64Regs>;
-defm ST_f32 : ST<Float32Regs>;
-defm ST_f64 : ST<Float64Regs>;
+ defm ST_i8 : ST<Int16Regs>;
+ defm ST_i16 : ST<Int16Regs>;
+ defm ST_i32 : ST<Int32Regs>;
+ defm ST_i64 : ST<Int64Regs>;
+ defm ST_f32 : ST<Float32Regs>;
+ defm ST_f64 : ST<Float64Regs>;
}
-// The following is used only in and after vector elementizations.
-// Vector elementization happens at the machine instruction level, so the
-// following instruction
-// never appears in the DAG.
+// The following is used only in and after vector elementizations. Vector
+// elementization happens at the machine instruction level, so the following
+// instructions never appear in the DAG.
multiclass LD_VEC<NVPTXRegClass regclass> {
- def _v2_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ def _v2_avar : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
- def _v2_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ i32imm:$fromWidth, imem:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr];", []>;
+ def _v2_areg : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
- def _v2_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr];", []>;
+ def _v2_areg_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
- def _v2_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr];", []>;
+ def _v2_ari : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
- def _v2_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ def _v2_ari_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
- def _v2_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ def _v2_asi : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
- def _v4_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
- regclass:$dst3, regclass:$dst4),
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ def _v4_avar : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
- def _v4_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
+ i32imm:$fromWidth, imem:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+ def _v4_areg : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
- def _v4_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
- regclass:$dst3, regclass:$dst4),
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+ def _v4_areg_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
- def _v4_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+ def _v4_ari : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
- []>;
- def _v4_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
- regclass:$dst3, regclass:$dst4),
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+ def _v4_ari_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
- []>;
- def _v4_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+ def _v4_asi : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
- []>;
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
-defm LDV_i8 : LD_VEC<Int16Regs>;
-defm LDV_i16 : LD_VEC<Int16Regs>;
-defm LDV_i32 : LD_VEC<Int32Regs>;
-defm LDV_i64 : LD_VEC<Int64Regs>;
-defm LDV_f32 : LD_VEC<Float32Regs>;
-defm LDV_f64 : LD_VEC<Float64Regs>;
+ defm LDV_i8 : LD_VEC<Int16Regs>;
+ defm LDV_i16 : LD_VEC<Int16Regs>;
+ defm LDV_i32 : LD_VEC<Int32Regs>;
+ defm LDV_i64 : LD_VEC<Int64Regs>;
+ defm LDV_f32 : LD_VEC<Float32Regs>;
+ defm LDV_f64 : LD_VEC<Float64Regs>;
}
multiclass ST_VEC<NVPTXRegClass regclass> {
- def _v2_avar : NVPTXInst<(outs),
+ def _v2_avar : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
- def _v2_areg : NVPTXInst<(outs),
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2}};", []>;
+ def _v2_areg : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
- def _v2_areg_64 : NVPTXInst<(outs),
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2}};", []>;
+ def _v2_areg_64 : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
- def _v2_ari : NVPTXInst<(outs),
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2}};", []>;
+ def _v2_ari : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
- i32imm:$offset),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
- def _v2_ari_64 : NVPTXInst<(outs),
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
+ i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ def _v2_ari_64 : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
- i32imm:$offset),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
- def _v2_asi : NVPTXInst<(outs),
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
+ i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ def _v2_asi : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
- i32imm:$offset),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
- def _v4_avar : NVPTXInst<(outs),
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
+ i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ def _v4_avar : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
- def _v4_areg : NVPTXInst<(outs),
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_areg : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
- def _v4_areg_64 : NVPTXInst<(outs),
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_areg_64 : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
- def _v4_ari : NVPTXInst<(outs),
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_ari : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
- []>;
- def _v4_ari_64 : NVPTXInst<(outs),
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_ari_64 : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
- []>;
- def _v4_asi : NVPTXInst<(outs),
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_asi : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
- []>;
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+ "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
}
+
let mayStore=1, hasSideEffects=0 in {
-defm STV_i8 : ST_VEC<Int16Regs>;
-defm STV_i16 : ST_VEC<Int16Regs>;
-defm STV_i32 : ST_VEC<Int32Regs>;
-defm STV_i64 : ST_VEC<Int64Regs>;
-defm STV_f32 : ST_VEC<Float32Regs>;
-defm STV_f64 : ST_VEC<Float64Regs>;
+ defm STV_i8 : ST_VEC<Int16Regs>;
+ defm STV_i16 : ST_VEC<Int16Regs>;
+ defm STV_i32 : ST_VEC<Int32Regs>;
+ defm STV_i64 : ST_VEC<Int64Regs>;
+ defm STV_f32 : ST_VEC<Float32Regs>;
+ defm STV_f64 : ST_VEC<Float64Regs>;
}
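The v2/v4 multiclasses above only come into play once vector accesses exist at the machine level. As a hedged illustration (CUDA, not part of this patch), a kernel that copies float4 elements is the typical source pattern that ends up using the ld.global.v4.f32 / st.global.v4.f32 forms, provided the pointers are 16-byte aligned:

  // Illustrative only: one 16-byte load and one 16-byte store per thread; the
  // backend normally selects the _v4 load/store forms defined above for these.
  __global__ void copy_float4(const float4 *in, float4 *out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
      out[i] = in[i];
  }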
@@ -2525,64 +2536,52 @@ def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-// pack a set of smaller int registers to a larger int register
-def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
- (ins Int16Regs:$s1, Int16Regs:$s2,
- Int16Regs:$s3, Int16Regs:$s4),
- "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};",
- []>;
-def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
- (ins Int16Regs:$s1, Int16Regs:$s2),
- "mov.b32\t$d, {{$s1, $s2}};",
- []>;
-def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
- (ins Int32Regs:$s1, Int32Regs:$s2),
- "mov.b64\t$d, {{$s1, $s2}};",
- []>;
-def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
- (ins Float32Regs:$s1, Float32Regs:$s2),
- "mov.b64\t$d, {{$s1, $s2}};",
- []>;
-
-// unpack a larger int register to a set of smaller int registers
-def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
- Int16Regs:$d3, Int16Regs:$d4),
- (ins Int64Regs:$s),
- "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;",
- []>;
-def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
- (ins Int32Regs:$s),
- "mov.b32\t{{$d1, $d2}}, $s;",
- []>;
-def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
- (ins Int64Regs:$s),
- "mov.b64\t{{$d1, $d2}}, $s;",
- []>;
-def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
- (ins Float64Regs:$s),
- "mov.b64\t{{$d1, $d2}}, $s;",
- []>;
+let hasSideEffects = 0 in {
+ // pack a set of smaller int registers to a larger int register
+ def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
+ (ins Int16Regs:$s1, Int16Regs:$s2,
+ Int16Regs:$s3, Int16Regs:$s4),
+ "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};", []>;
+ def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
+ (ins Int16Regs:$s1, Int16Regs:$s2),
+ "mov.b32\t$d, {{$s1, $s2}};", []>;
+ def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
+ (ins Int32Regs:$s1, Int32Regs:$s2),
+ "mov.b64\t$d, {{$s1, $s2}};", []>;
+ def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
+ (ins Float32Regs:$s1, Float32Regs:$s2),
+ "mov.b64\t$d, {{$s1, $s2}};", []>;
+
+ // unpack a larger int register to a set of smaller int registers
+ def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
+ Int16Regs:$d3, Int16Regs:$d4),
+ (ins Int64Regs:$s),
+ "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;", []>;
+ def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
+ (ins Int32Regs:$s),
+ "mov.b32\t{{$d1, $d2}}, $s;", []>;
+ def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
+ (ins Int64Regs:$s),
+ "mov.b64\t{{$d1, $d2}}, $s;", []>;
+ def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
+ (ins Float64Regs:$s),
+ "mov.b64\t{{$d1, $d2}}, $s;", []>;
+}
// Count leading zeros
-def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "clz.b32\t$d, $a;",
- []>;
-def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "clz.b64\t$d, $a;",
- []>;
+let hasSideEffects = 0 in {
+ def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+ "clz.b32\t$d, $a;", []>;
+ def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "clz.b64\t$d, $a;", []>;
+}
// 32-bit has a direct PTX instruction
-def : Pat<(ctlz Int32Regs:$a),
- (CLZr32 Int32Regs:$a)>;
-def : Pat<(ctlz_zero_undef Int32Regs:$a),
- (CLZr32 Int32Regs:$a)>;
+def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
// to 64-bit to match the LLVM semantics
-def : Pat<(ctlz Int64Regs:$a),
- (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
-def : Pat<(ctlz_zero_undef Int64Regs:$a),
- (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
// For 16-bit, we zero-extend to 32-bit, then trunc the result back
// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
@@ -2592,34 +2591,27 @@ def : Pat<(ctlz Int16Regs:$a),
(SUBi16ri (CVT_u16_u32 (CLZr32
(CVT_u32_u16 Int16Regs:$a, CvtNONE)),
CvtNONE), 16)>;
-def : Pat<(ctlz_zero_undef Int16Regs:$a),
- (SUBi16ri (CVT_u16_u32 (CLZr32
- (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
- CvtNONE), 16)>;
// Population count
-def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "popc.b32\t$d, $a;",
- []>;
-def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "popc.b64\t$d, $a;",
- []>;
+let hasSideEffects = 0 in {
+ def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+ "popc.b32\t$d, $a;", []>;
+ def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "popc.b64\t$d, $a;", []>;
+}
// 32-bit has a direct PTX instruction
-def : Pat<(ctpop Int32Regs:$a),
- (POPCr32 Int32Regs:$a)>;
+def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
// to 64-bit to match the LLVM semantics
-def : Pat<(ctpop Int64Regs:$a),
- (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
// For 16-bit, we zero-extend to 32-bit, then trunc the result back
// to 16-bits (ctpop of a 16-bit value is guaranteed to require less
// than 16 bits to store)
def : Pat<(ctpop Int16Regs:$a),
- (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
- CvtNONE)>;
+ (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
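A short worked example of why the i16 patterns need (or do not need) a correction: zero-extending a 16-bit value to 32 bits prepends exactly 16 zero bits, so clz32(zext(a)) = clz16(a) + 16 and the ctlz pattern subtracts 16, while popcount is unchanged by zero extension and needs no fix-up. A hedged CUDA equivalent (illustrative helpers, not part of the patch; __clz and __popc are the builtins for clz.b32 and popc.b32):

  __device__ int ctlz16(unsigned short x) {
    // zext to 32 bits adds exactly 16 leading zeros; subtract them back out.
    return __clz((int)x) - 16;
  }
  __device__ int ctpop16(unsigned short x) {
    // zero extension adds no set bits, so the 32-bit popcount is already right.
    return __popc((unsigned int)x);
  }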
// fround f64 -> f32
def : Pat<(f32 (fround Float64Regs:$a)),
@@ -2633,8 +2625,8 @@ def : Pat<(f64 (fextend Float32Regs:$a)),
def : Pat<(f64 (fextend Float32Regs:$a)),
(CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
-def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
- [SDNPHasChain, SDNPOptInGlue]>;
+def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
//-----------------------------------
// Control-flow
@@ -2646,88 +2638,77 @@ let isTerminator=1 in {
let isBranch=1 in
def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
- "@$a bra \t$target;",
- [(brcond Int1Regs:$a, bb:$target)]>;
+ "@$a bra \t$target;",
+ [(brcond Int1Regs:$a, bb:$target)]>;
let isBranch=1 in
def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
- "@!$a bra \t$target;",
- []>;
+ "@!$a bra \t$target;", []>;
let isBranch=1, isBarrier=1 in
def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
- "bra.uni \t$target;",
- [(br bb:$target)]>;
+ "bra.uni \t$target;", [(br bb:$target)]>;
}
def : Pat<(brcond Int32Regs:$a, bb:$target),
(CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
// SelectionDAGBuilder::visitSWitchCase() will invert the condition of a
-// conditional branch if
-// the target block is the next block so that the code can fall through to the
-// target block.
-// The invertion is done by 'xor condition, 1', which will be translated to
-// (setne condition, -1).
-// Since ptx supports '@!pred bra target', we should use it.
+// conditional branch if the target block is the next block so that the code
+// can fall through to the target block. The inversion is done by 'xor
+// condition, 1', which will be translated to (setne condition, -1). Since ptx
+// supports '@!pred bra target', we should use it.
def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
- (CBranchOther Int1Regs:$a, bb:$target)>;
+ (CBranchOther Int1Regs:$a, bb:$target)>;
// Call
-def SDT_NVPTXCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
-def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
- SDTCisVT<1, i32> ]>;
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
[SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
-def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
- SDNPSideEffect]>;
+ SDNPSideEffect]>;
def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def calltarget : Operand<i32>;
let isCall=1 in {
- def CALL : NVPTXInst<(outs), (ins calltarget:$dst),
- "call \t$dst, (1);", []>;
+ def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
}
-def : Pat<(call tglobaladdr:$dst),
- (CALL tglobaladdr:$dst)>;
-def : Pat<(call texternalsym:$dst),
- (CALL texternalsym:$dst)>;
+def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
// Pseudo instructions.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
: NVPTXInst<outs, ins, asmstr, pattern>;
-// @TODO: We use some tricks here to emit curly braces. Can we clean this up
-// a bit without TableGen modifications?
-def Callseq_Start : NVPTXInst<(outs), (ins i32imm:$amt),
- "// Callseq Start $amt\n\t{{\n\t.reg .b32 temp_param_reg;\n\t// <end>}}",
- [(callseq_start timm:$amt)]>;
-def Callseq_End : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- "\n\t//{{\n\t}}// Callseq End $amt1",
- [(callseq_end timm:$amt1, timm:$amt2)]>;
+def Callseq_Start :
+ NVPTXInst<(outs), (ins i32imm:$amt),
+ "\\{ // callseq $amt\n"
+ "\t.reg .b32 temp_param_reg;",
+ [(callseq_start timm:$amt)]>;
+def Callseq_End :
+ NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "\\} // callseq $amt1",
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
// trap instruction
-
-def trapinst : NVPTXInst<(outs), (ins),
- "trap;",
- [(trap)]>;
+def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>;
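For reference (an assumption, not part of the patch): this pattern is what lowered @llvm.trap calls, and CUDA's __trap() builtin, end up as.

  // Illustrative: __trap() emits the PTX trap instruction matched by trapinst.
  __global__ void store_checked(int *p) {
    if (p == nullptr)
      __trap();   // abort the running kernel
    *p = 42;
  }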
// Call prototype wrapper
def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def CallPrototype
- : SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallPrototype :
+ SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def ProtoIdent : Operand<i32> {
let PrintMethod = "printProtoIdent";
}
-def CALL_PROTOTYPE
- : NVPTXInst<(outs), (ins ProtoIdent:$ident),
- "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
-
+def CALL_PROTOTYPE :
+ NVPTXInst<(outs), (ins ProtoIdent:$ident),
+ "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
include "NVPTXIntrinsics.td"
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 14e51aa309ea..ed16afa24752 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -30,11 +30,9 @@ def immDouble1 : PatLeaf<(fpimm), [{
//-----------------------------------
-// Synchronization Functions
+// Synchronization and shuffle functions
//-----------------------------------
-def INT_CUDA_SYNCTHREADS : NVPTXInst<(outs), (ins),
- "bar.sync \t0;",
- [(int_cuda_syncthreads)]>;
+let isConvergent = 1 in {
def INT_BARRIER0 : NVPTXInst<(outs), (ins),
"bar.sync \t0;",
[(int_nvvm_barrier0)]>;
@@ -64,6 +62,51 @@ def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
!strconcat("}}", ""))))))),
[(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>;
+def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;",
+ [(int_nvvm_bar_sync imm:$i)]>;
+
+// shfl.{up,down,bfly,idx}.b32
+multiclass SHFL<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
+ // The last two parameters to shfl can be regs or imms. ptxas is smart
+ // enough to inline constant registers, so strictly speaking we don't need to
+ // handle immediates here. But it's easy enough, and it makes our ptx more
+ // readable.
+ def reg : NVPTXInst<
+ (outs regclass:$dst),
+ (ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
+ !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+ [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>;
+
+ def imm1 : NVPTXInst<
+ (outs regclass:$dst),
+ (ins regclass:$src, i32imm:$offset, Int32Regs:$mask),
+ !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+ [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>;
+
+ def imm2 : NVPTXInst<
+ (outs regclass:$dst),
+ (ins regclass:$src, Int32Regs:$offset, i32imm:$mask),
+ !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+ [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>;
+
+ def imm3 : NVPTXInst<
+ (outs regclass:$dst),
+ (ins regclass:$src, i32imm:$offset, i32imm:$mask),
+ !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+ [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>;
+}
+
+defm INT_SHFL_DOWN_I32 : SHFL<Int32Regs, "down", int_nvvm_shfl_down_i32>;
+defm INT_SHFL_DOWN_F32 : SHFL<Float32Regs, "down", int_nvvm_shfl_down_f32>;
+defm INT_SHFL_UP_I32 : SHFL<Int32Regs, "up", int_nvvm_shfl_up_i32>;
+defm INT_SHFL_UP_F32 : SHFL<Float32Regs, "up", int_nvvm_shfl_up_f32>;
+defm INT_SHFL_BFLY_I32 : SHFL<Int32Regs, "bfly", int_nvvm_shfl_bfly_i32>;
+defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>;
+defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>;
+defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>;
+
+} // isConvergent = 1
+
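Both the barriers and the new shfl variants are convergent: they must not be moved across points of control-flow divergence. A minimal CUDA sketch of code exercising both (assuming the pre-CUDA-9 __shfl_down builtin; newer toolkits spell it __shfl_down_sync(0xffffffff, ...)):

  // Illustrative warp-level sum, not part of this patch. Assumes blockDim.x == 32.
  __global__ void warp_sum(const float *in, float *out) {
    float v = in[threadIdx.x];
    for (int offset = 16; offset > 0; offset >>= 1)
      v += __shfl_down(v, offset);   // lowers to shfl.down.b32
    __syncthreads();                 // lowers to bar.sync 0
    if (threadIdx.x == 0)
      *out = v;
  }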
//-----------------------------------
// Explicit Memory Fence Functions
@@ -1335,51 +1378,17 @@ defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64",
".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>;
-//-----------------------------------
-// Read Special Registers
-//-----------------------------------
-class F_SREG<string OpStr, NVPTXRegClass regclassOut, Intrinsic IntOp> :
- NVPTXInst<(outs regclassOut:$dst), (ins),
- OpStr,
- [(set regclassOut:$dst, (IntOp))]>;
-
-def INT_PTX_SREG_TID_X : F_SREG<"mov.u32 \t$dst, %tid.x;", Int32Regs,
- int_nvvm_read_ptx_sreg_tid_x>;
-def INT_PTX_SREG_TID_Y : F_SREG<"mov.u32 \t$dst, %tid.y;", Int32Regs,
- int_nvvm_read_ptx_sreg_tid_y>;
-def INT_PTX_SREG_TID_Z : F_SREG<"mov.u32 \t$dst, %tid.z;", Int32Regs,
- int_nvvm_read_ptx_sreg_tid_z>;
-
-def INT_PTX_SREG_NTID_X : F_SREG<"mov.u32 \t$dst, %ntid.x;", Int32Regs,
- int_nvvm_read_ptx_sreg_ntid_x>;
-def INT_PTX_SREG_NTID_Y : F_SREG<"mov.u32 \t$dst, %ntid.y;", Int32Regs,
- int_nvvm_read_ptx_sreg_ntid_y>;
-def INT_PTX_SREG_NTID_Z : F_SREG<"mov.u32 \t$dst, %ntid.z;", Int32Regs,
- int_nvvm_read_ptx_sreg_ntid_z>;
-
-def INT_PTX_SREG_CTAID_X : F_SREG<"mov.u32 \t$dst, %ctaid.x;", Int32Regs,
- int_nvvm_read_ptx_sreg_ctaid_x>;
-def INT_PTX_SREG_CTAID_Y : F_SREG<"mov.u32 \t$dst, %ctaid.y;", Int32Regs,
- int_nvvm_read_ptx_sreg_ctaid_y>;
-def INT_PTX_SREG_CTAID_Z : F_SREG<"mov.u32 \t$dst, %ctaid.z;", Int32Regs,
- int_nvvm_read_ptx_sreg_ctaid_z>;
-
-def INT_PTX_SREG_NCTAID_X : F_SREG<"mov.u32 \t$dst, %nctaid.x;", Int32Regs,
- int_nvvm_read_ptx_sreg_nctaid_x>;
-def INT_PTX_SREG_NCTAID_Y : F_SREG<"mov.u32 \t$dst, %nctaid.y;", Int32Regs,
- int_nvvm_read_ptx_sreg_nctaid_y>;
-def INT_PTX_SREG_NCTAID_Z : F_SREG<"mov.u32 \t$dst, %nctaid.z;", Int32Regs,
- int_nvvm_read_ptx_sreg_nctaid_z>;
-
-def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs,
- int_nvvm_read_ptx_sreg_warpsize>;
//-----------------------------------
// Support for ldu on sm_20 or later
//-----------------------------------
+// Don't annotate ldu instructions as mayLoad, as they load from memory that is
+// read-only in a kernel.
+
// Scalar
+
multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
!strconcat("ldu.global.", TyStr),
@@ -1475,6 +1484,10 @@ defm INT_PTX_LDU_G_v4f32_ELE
// Support for ldg on sm_35 or later
//-----------------------------------
+// Don't annotate ld.global.nc as mayLoad, because these loads go through the
+// non-coherent texture cache, and therefore the values read must be read-only
+// during the lifetime of the kernel.
+
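As a hedged illustration of the read-only requirement (CUDA, not part of this patch): __ldg() requests the non-coherent load explicitly on sm_35+, and a const __restrict__ pointer lets the compiler prove read-only-ness and pick ld.global.nc on its own. The pointed-to data must not be written during the kernel's lifetime.

  __global__ void scale(const float * __restrict__ in, float *out, float k, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
      out[i] = k * __ldg(&in[i]);   // load through the non-coherent texture cache
  }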
multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
!strconcat("ld.global.nc.", TyStr),
@@ -1836,54 +1849,61 @@ def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
(ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>,
Requires<[noHWROT32]> ;
-def GET_LO_INT64
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
- !strconcat("{{\n\t",
- !strconcat(".reg .b32 %dummy;\n\t",
- !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t",
- !strconcat("}}", "")))),
- []> ;
-
-def GET_HI_INT64
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
- !strconcat("{{\n\t",
- !strconcat(".reg .b32 %dummy;\n\t",
- !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t",
- !strconcat("}}", "")))),
- []> ;
-
-def PACK_TWO_INT32
- : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi),
- "mov.b64 \t$dst, {{$lo, $hi}};", []> ;
+let hasSideEffects = 0 in {
+ def GET_LO_INT64
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %dummy;\n\t",
+ !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t",
+ !strconcat("}}", "")))),
+ []> ;
+
+ def GET_HI_INT64
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %dummy;\n\t",
+ !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t",
+ !strconcat("}}", "")))),
+ []> ;
+}
+
+let hasSideEffects = 0 in {
+ def PACK_TWO_INT32
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi),
+ "mov.b64 \t$dst, {{$lo, $hi}};", []> ;
+}
def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src),
(PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src),
(GET_LO_INT64 Int64Regs:$src))> ;
-// funnel shift, requires >= sm_32
-def SHF_L_WRAP_B32_IMM
- : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
- "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
- Requires<[hasHWROT32]>;
+// Funnel shift, requires >= sm_32. Does not trap if amt is out of range, so
+// no side effects.
+let hasSideEffects = 0 in {
+ def SHF_L_WRAP_B32_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
-def SHF_L_WRAP_B32_REG
- : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
- Requires<[hasHWROT32]>;
+ def SHF_L_WRAP_B32_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
-def SHF_R_WRAP_B32_IMM
- : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
- "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
- Requires<[hasHWROT32]>;
+ def SHF_R_WRAP_B32_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
-def SHF_R_WRAP_B32_REG
- : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
- Requires<[hasHWROT32]>;
+ def SHF_R_WRAP_B32_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+}
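A hedged usage sketch (assuming CUDA's __funnelshift_l builtin, which targets shf.l.wrap.b32 on sm_32+): feeding the same word in as both halves turns the funnel shift into a rotate, and the .wrap form already reduces the shift amount modulo 32.

  __device__ unsigned rotl32(unsigned x, unsigned n) {
    // Upper 32 bits of (x:x) << n == rotate-left by n; n is taken mod 32.
    return __funnelshift_l(x, x, n);
  }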
// HW version of rotate 64
def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
@@ -6950,98 +6970,95 @@ def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+//-----------------------------------
+// Read Special Registers
+//-----------------------------------
-
-//===-- Old PTX Back-end Intrinsics ---------------------------------------===//
-
-// These intrinsics are handled to retain compatibility with the old backend.
-
-// PTX Special Purpose Register Accessor Intrinsics
-
-class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop>
+class PTX_READ_SREG_R64<string regname, Intrinsic intop>
: NVPTXInst<(outs Int64Regs:$d), (ins),
!strconcat(!strconcat("mov.u64\t$d, %", regname), ";"),
[(set Int64Regs:$d, (intop))]>;
-class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop>
+class PTX_READ_SREG_R32<string regname, Intrinsic intop>
: NVPTXInst<(outs Int32Regs:$d), (ins),
!strconcat(!strconcat("mov.u32\t$d, %", regname), ";"),
[(set Int32Regs:$d, (intop))]>;
// TODO Add read vector-version of special registers
-def PTX_READ_TID_X : PTX_READ_SPECIAL_REGISTER_R32<"tid.x",
- int_ptx_read_tid_x>;
-def PTX_READ_TID_Y : PTX_READ_SPECIAL_REGISTER_R32<"tid.y",
- int_ptx_read_tid_y>;
-def PTX_READ_TID_Z : PTX_READ_SPECIAL_REGISTER_R32<"tid.z",
- int_ptx_read_tid_z>;
-def PTX_READ_TID_W : PTX_READ_SPECIAL_REGISTER_R32<"tid.w",
- int_ptx_read_tid_w>;
-
-def PTX_READ_NTID_X : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x",
- int_ptx_read_ntid_x>;
-def PTX_READ_NTID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y",
- int_ptx_read_ntid_y>;
-def PTX_READ_NTID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z",
- int_ptx_read_ntid_z>;
-def PTX_READ_NTID_W : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w",
- int_ptx_read_ntid_w>;
-
-def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid",
- int_ptx_read_laneid>;
-def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid",
- int_ptx_read_warpid>;
-def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid",
- int_ptx_read_nwarpid>;
-
-def PTX_READ_CTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x",
- int_ptx_read_ctaid_x>;
-def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y",
- int_ptx_read_ctaid_y>;
-def PTX_READ_CTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z",
- int_ptx_read_ctaid_z>;
-def PTX_READ_CTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w",
- int_ptx_read_ctaid_w>;
-
-def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x",
- int_ptx_read_nctaid_x>;
-def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y",
- int_ptx_read_nctaid_y>;
-def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z",
- int_ptx_read_nctaid_z>;
-def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w",
- int_ptx_read_nctaid_w>;
-
-def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid",
- int_ptx_read_smid>;
-def PTX_READ_NSMID : PTX_READ_SPECIAL_REGISTER_R32<"nsmid",
- int_ptx_read_nsmid>;
-def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid",
- int_ptx_read_gridid>;
-
-def PTX_READ_LANEMASK_EQ
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>;
-def PTX_READ_LANEMASK_LE
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>;
-def PTX_READ_LANEMASK_LT
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>;
-def PTX_READ_LANEMASK_GE
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>;
-def PTX_READ_LANEMASK_GT
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>;
-
-def PTX_READ_CLOCK
- : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>;
-def PTX_READ_CLOCK64
- : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>;
-
-def PTX_READ_PM0 : PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>;
-def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>;
-def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>;
-def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>;
-
-// PTX Parallel Synchronization and Communication Intrinsics
-
-def PTX_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;",
- [(int_ptx_bar_sync imm:$i)]>;
+def INT_PTX_SREG_TID_X :
+ PTX_READ_SREG_R32<"tid.x", int_nvvm_read_ptx_sreg_tid_x>;
+def INT_PTX_SREG_TID_Y :
+ PTX_READ_SREG_R32<"tid.y", int_nvvm_read_ptx_sreg_tid_y>;
+def INT_PTX_SREG_TID_Z :
+ PTX_READ_SREG_R32<"tid.z", int_nvvm_read_ptx_sreg_tid_z>;
+def INT_PTX_SREG_TID_W :
+ PTX_READ_SREG_R32<"tid.w", int_nvvm_read_ptx_sreg_tid_w>;
+
+def INT_PTX_SREG_NTID_X :
+ PTX_READ_SREG_R32<"ntid.x", int_nvvm_read_ptx_sreg_ntid_x>;
+def INT_PTX_SREG_NTID_Y :
+ PTX_READ_SREG_R32<"ntid.y", int_nvvm_read_ptx_sreg_ntid_y>;
+def INT_PTX_SREG_NTID_Z :
+ PTX_READ_SREG_R32<"ntid.z", int_nvvm_read_ptx_sreg_ntid_z>;
+def INT_PTX_SREG_NTID_W :
+ PTX_READ_SREG_R32<"ntid.w", int_nvvm_read_ptx_sreg_ntid_w>;
+
+def INT_PTX_SREG_LANEID :
+ PTX_READ_SREG_R32<"laneid", int_nvvm_read_ptx_sreg_laneid>;
+def INT_PTX_SREG_WARPID :
+ PTX_READ_SREG_R32<"warpid", int_nvvm_read_ptx_sreg_warpid>;
+def INT_PTX_SREG_NWARPID :
+ PTX_READ_SREG_R32<"nwarpid", int_nvvm_read_ptx_sreg_nwarpid>;
+
+def INT_PTX_SREG_CTAID_X :
+ PTX_READ_SREG_R32<"ctaid.x", int_nvvm_read_ptx_sreg_ctaid_x>;
+def INT_PTX_SREG_CTAID_Y :
+ PTX_READ_SREG_R32<"ctaid.y", int_nvvm_read_ptx_sreg_ctaid_y>;
+def INT_PTX_SREG_CTAID_Z :
+ PTX_READ_SREG_R32<"ctaid.z", int_nvvm_read_ptx_sreg_ctaid_z>;
+def INT_PTX_SREG_CTAID_W :
+ PTX_READ_SREG_R32<"ctaid.w", int_nvvm_read_ptx_sreg_ctaid_w>;
+
+def INT_PTX_SREG_NCTAID_X :
+ PTX_READ_SREG_R32<"nctaid.x", int_nvvm_read_ptx_sreg_nctaid_x>;
+def INT_PTX_SREG_NCTAID_Y :
+ PTX_READ_SREG_R32<"nctaid.y", int_nvvm_read_ptx_sreg_nctaid_y>;
+def INT_PTX_SREG_NCTAID_Z :
+ PTX_READ_SREG_R32<"nctaid.z", int_nvvm_read_ptx_sreg_nctaid_z>;
+def INT_PTX_SREG_NCTAID_W :
+ PTX_READ_SREG_R32<"nctaid.w", int_nvvm_read_ptx_sreg_nctaid_w>;
+
+def INT_PTX_SREG_SMID :
+ PTX_READ_SREG_R32<"smid", int_nvvm_read_ptx_sreg_smid>;
+def INT_PTX_SREG_NSMID :
+ PTX_READ_SREG_R32<"nsmid", int_nvvm_read_ptx_sreg_nsmid>;
+def INT_PTX_SREG_GRIDID :
+ PTX_READ_SREG_R32<"gridid", int_nvvm_read_ptx_sreg_gridid>;
+
+def INT_PTX_SREG_LANEMASK_EQ :
+ PTX_READ_SREG_R32<"lanemask_eq", int_nvvm_read_ptx_sreg_lanemask_eq>;
+def INT_PTX_SREG_LANEMASK_LE :
+ PTX_READ_SREG_R32<"lanemask_le", int_nvvm_read_ptx_sreg_lanemask_le>;
+def INT_PTX_SREG_LANEMASK_LT :
+ PTX_READ_SREG_R32<"lanemask_lt", int_nvvm_read_ptx_sreg_lanemask_lt>;
+def INT_PTX_SREG_LANEMASK_GE :
+ PTX_READ_SREG_R32<"lanemask_ge", int_nvvm_read_ptx_sreg_lanemask_ge>;
+def INT_PTX_SREG_LANEMASK_GT :
+ PTX_READ_SREG_R32<"lanemask_gt", int_nvvm_read_ptx_sreg_lanemask_gt>;
+
+def INT_PTX_SREG_CLOCK :
+ PTX_READ_SREG_R32<"clock", int_nvvm_read_ptx_sreg_clock>;
+def INT_PTX_SREG_CLOCK64 :
+ PTX_READ_SREG_R64<"clock64", int_nvvm_read_ptx_sreg_clock64>;
+
+def INT_PTX_SREG_PM0 : PTX_READ_SREG_R32<"pm0", int_nvvm_read_ptx_sreg_pm0>;
+def INT_PTX_SREG_PM1 : PTX_READ_SREG_R32<"pm1", int_nvvm_read_ptx_sreg_pm1>;
+def INT_PTX_SREG_PM2 : PTX_READ_SREG_R32<"pm2", int_nvvm_read_ptx_sreg_pm2>;
+def INT_PTX_SREG_PM3 : PTX_READ_SREG_R32<"pm3", int_nvvm_read_ptx_sreg_pm3>;
+
+// TODO: It would be nice to use PTX_READ_SREG here, but it doesn't
+// handle the constant.
+def INT_PTX_SREG_WARPSIZE :
+ NVPTXInst<(outs Int32Regs:$dst), (ins), "mov.u32 \t$dst, WARP_SZ;",
+ [(set Int32Regs:$dst, (int_nvvm_read_ptx_sreg_warpsize))]>;
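For orientation (illustrative CUDA, not part of this patch): the CUDA builtin variables read these special registers, e.g. threadIdx.x maps to %tid.x, blockDim.x to %ntid.x, blockIdx.x to %ctaid.x, gridDim.x to %nctaid.x, and warpSize to WARP_SZ.

  __global__ void which_warp(int *out) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;   // %ctaid.x, %ntid.x, %tid.x
    out[gid] = gid / warpSize;                         // WARP_SZ
  }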
diff --git a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
index 624052e9b981..fa1a3ef3fe24 100644
--- a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -62,6 +62,9 @@ INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca",
// Main function for this pass.
// =============================================================================
bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) {
+ if (skipBasicBlock(BB))
+ return false;
+
bool Changed = false;
for (auto &I : BB) {
if (auto allocaInst = dyn_cast<AllocaInst>(&I)) {
diff --git a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
index 6656077348a1..d162a283f745 100644
--- a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
@@ -128,7 +128,7 @@ INITIALIZE_PASS(NVPTXLowerKernelArgs, "nvptx-lower-kernel-args",
"Lower kernel arguments (NVPTX)", false, false)
// =============================================================================
-// If the function had a byval struct ptr arg, say foo(%struct.x *byval %d),
+// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
// then add the following instructions to the first basic block:
//
// %temp = alloca %struct.x, align 8
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.cpp b/lib/Target/NVPTX/NVPTXMCExpr.cpp
index 3c98b9febf85..84d5239ec096 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.cpp
+++ b/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -15,8 +15,8 @@ using namespace llvm;
#define DEBUG_TYPE "nvptx-mcexpr"
-const NVPTXFloatMCExpr*
-NVPTXFloatMCExpr::create(VariantKind Kind, APFloat Flt, MCContext &Ctx) {
+const NVPTXFloatMCExpr *
+NVPTXFloatMCExpr::create(VariantKind Kind, const APFloat &Flt, MCContext &Ctx) {
return new (Ctx) NVPTXFloatMCExpr(Kind, Flt);
}
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h
index 81a606d7535c..7f833c42fa8f 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.h
+++ b/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -14,6 +14,7 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/MC/MCExpr.h"
+#include <utility>
namespace llvm {
@@ -30,21 +31,21 @@ private:
const APFloat Flt;
explicit NVPTXFloatMCExpr(VariantKind Kind, APFloat Flt)
- : Kind(Kind), Flt(Flt) {}
+ : Kind(Kind), Flt(std::move(Flt)) {}
public:
/// @name Construction
/// @{
- static const NVPTXFloatMCExpr *create(VariantKind Kind, APFloat Flt,
+ static const NVPTXFloatMCExpr *create(VariantKind Kind, const APFloat &Flt,
MCContext &Ctx);
- static const NVPTXFloatMCExpr *createConstantFPSingle(APFloat Flt,
+ static const NVPTXFloatMCExpr *createConstantFPSingle(const APFloat &Flt,
MCContext &Ctx) {
return create(VK_NVPTX_SINGLE_PREC_FLOAT, Flt, Ctx);
}
- static const NVPTXFloatMCExpr *createConstantFPDouble(APFloat Flt,
+ static const NVPTXFloatMCExpr *createConstantFPDouble(const APFloat &Flt,
MCContext &Ctx) {
return create(VK_NVPTX_DOUBLE_PREC_FLOAT, Flt, Ctx);
}
diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp
index a61c291d233f..7d0cd553e03f 100644
--- a/lib/Target/NVPTX/NVPTXPeephole.cpp
+++ b/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -125,6 +125,9 @@ static void CombineCVTAToLocal(MachineInstr &Root) {
}
bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
bool Changed = false;
// Loop over all of the basic blocks.
for (auto &MBB : MF) {
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 17019d7b364d..029e0097c5dc 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -55,11 +55,10 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
calculateFrameObjectOffsets(MF);
- for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) {
- for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
- MachineInstr *MI = I;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- if (!MI->getOperand(i).isFI())
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ if (!MI.getOperand(i).isFI())
continue;
TRI.eliminateFrameIndex(MI, 0, i, nullptr);
Modified = true;
diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h
index 45a7309479ee..cad4f5668fdf 100644
--- a/lib/Target/NVPTX/NVPTXSection.h
+++ b/lib/Target/NVPTX/NVPTXSection.h
@@ -16,7 +16,6 @@
#include "llvm/IR/GlobalVariable.h"
#include "llvm/MC/MCSection.h"
-#include <vector>
namespace llvm {
/// Represents a section in PTX. PTX does not have sections. We create this class
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index c7287719be5f..41670390c41b 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -19,8 +19,8 @@
#include "NVPTXISelLowering.h"
#include "NVPTXInstrInfo.h"
#include "NVPTXRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -42,7 +42,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
const NVPTXTargetMachine &TM;
NVPTXInstrInfo InstrInfo;
NVPTXTargetLowering TLInfo;
- TargetSelectionDAGInfo TSInfo;
+ SelectionDAGTargetInfo TSInfo;
  // NVPTX does not have any call stack frame, but needs an NVPTX-specific
// FrameLowering class because TargetFrameLowering is abstract.
@@ -65,7 +65,7 @@ public:
const NVPTXTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index aa931b134da9..b9f5919964c7 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LegacyPassManager.h"
@@ -44,15 +45,23 @@
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
using namespace llvm;
+static cl::opt<bool> UseInferAddressSpaces(
+ "nvptx-use-infer-addrspace", cl::init(false), cl::Hidden,
+ cl::desc("Optimize address spaces using NVPTXInferAddressSpaces instead of "
+ "NVPTXFavorNonGenericAddrSpaces"));
+
namespace llvm {
+void initializeNVVMIntrRangePass(PassRegistry&);
void initializeNVVMReflectPass(PassRegistry&);
void initializeGenericToNVVMPass(PassRegistry&);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
+void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
@@ -67,10 +76,12 @@ extern "C" void LLVMInitializeNVPTXTarget() {
// but it's very NVPTX-specific.
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializeNVVMReflectPass(PR);
+ initializeNVVMIntrRangePass(PR);
initializeGenericToNVVMPass(PR);
initializeNVPTXAllocaHoistingPass(PR);
initializeNVPTXAssignValidGlobalNamesPass(PR);
initializeNVPTXFavorNonGenericAddrSpacesPass(PR);
+ initializeNVPTXInferAddressSpacesPass(PR);
initializeNVPTXLowerKernelArgsPass(PR);
initializeNVPTXLowerAllocaPass(PR);
initializeNVPTXLowerAggrCopiesPass(PR);
@@ -90,11 +101,15 @@ static std::string computeDataLayout(bool is64Bit) {
NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL, bool is64bit)
- : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options, RM,
- CM, OL),
- is64bit(is64bit), TLOF(make_unique<NVPTXTargetObjectFile>()),
+ // The pic relocation model is used regardless of what the client has
+ // specified, as it is the only relocation model currently supported.
+ : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options,
+ Reloc::PIC_, CM, OL),
+ is64bit(is64bit),
+ TLOF(make_unique<NVPTXTargetObjectFile>()),
Subtarget(TT, CPU, FS, *this) {
if (TT.getOS() == Triple::NVCL)
drvInterface = NVPTX::NVCL;
@@ -110,7 +125,8 @@ void NVPTXTargetMachine32::anchor() {}
NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
@@ -119,7 +135,8 @@ void NVPTXTargetMachine64::anchor() {}
NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
@@ -143,14 +160,25 @@ public:
void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
private:
- // if the opt level is aggressive, add GVN; otherwise, add EarlyCSE.
+ // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
+ // function is only called in opt mode.
void addEarlyCSEOrGVNPass();
+
+ // Add passes that propagate special memory spaces.
+ void addAddressSpaceInferencePasses();
+
+ // Add passes that perform straight-line scalar optimizations.
+ void addStraightLineScalarOptimizationPasses();
};
} // end anonymous namespace
TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
- NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM);
- return PassConfig;
+ return new NVPTXPassConfig(this, PM);
+}
+
+void NVPTXTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
+ PM.add(createNVVMReflectPass());
+ PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
}
TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
@@ -166,34 +194,23 @@ void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
addPass(createEarlyCSEPass());
}
-void NVPTXPassConfig::addIRPasses() {
- // The following passes are known to not play well with virtual regs hanging
- // around after register allocation (which in our case, is *all* registers).
- // We explicitly disable them here. We do, however, need some functionality
- // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
- // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
- disablePass(&PrologEpilogCodeInserterID);
- disablePass(&MachineCopyPropagationID);
- disablePass(&TailDuplicateID);
-
- addPass(createNVVMReflectPass());
- addPass(createNVPTXImageOptimizerPass());
- addPass(createNVPTXAssignValidGlobalNamesPass());
- addPass(createGenericToNVVMPass());
-
- // === Propagate special address spaces ===
- addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine()));
+void NVPTXPassConfig::addAddressSpaceInferencePasses() {
// NVPTXLowerKernelArgs emits alloca for byval parameters which can often
// be eliminated by SROA.
addPass(createSROAPass());
addPass(createNVPTXLowerAllocaPass());
- addPass(createNVPTXFavorNonGenericAddrSpacesPass());
- // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leave
- // them unused. We could remove dead code in an ad-hoc manner, but that
- // requires manual work and might be error-prone.
- addPass(createDeadCodeEliminationPass());
+ if (UseInferAddressSpaces) {
+ addPass(createNVPTXInferAddressSpacesPass());
+ } else {
+ addPass(createNVPTXFavorNonGenericAddrSpacesPass());
+    // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leaves
+ // them unused. We could remove dead code in an ad-hoc manner, but that
+ // requires manual work and might be error-prone.
+ addPass(createDeadCodeEliminationPass());
+ }
+}
- // === Straight-line scalar optimizations ===
+void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
addPass(createSeparateConstOffsetFromGEPPass());
addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
@@ -208,6 +225,41 @@ void NVPTXPassConfig::addIRPasses() {
// NaryReassociate on GEPs creates redundant common expressions, so run
// EarlyCSE after it.
addPass(createEarlyCSEPass());
+}
+
+void NVPTXPassConfig::addIRPasses() {
+ // The following passes are known to not play well with virtual regs hanging
+ // around after register allocation (which in our case, is *all* registers).
+ // We explicitly disable them here. We do, however, need some functionality
+ // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
+ // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
+ disablePass(&PrologEpilogCodeInserterID);
+ disablePass(&MachineCopyPropagationID);
+ disablePass(&TailDuplicateID);
+ disablePass(&StackMapLivenessID);
+ disablePass(&LiveDebugValuesID);
+ disablePass(&PostRASchedulerID);
+ disablePass(&FuncletLayoutID);
+ disablePass(&PatchableFunctionID);
+
+ // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
+ // it here does nothing. But since we need it for correctness when lowering
+ // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
+ // call addEarlyAsPossiblePasses.
+ addPass(createNVVMReflectPass());
+
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createNVPTXImageOptimizerPass());
+ addPass(createNVPTXAssignValidGlobalNamesPass());
+ addPass(createGenericToNVVMPass());
+
+ // NVPTXLowerKernelArgs is required for correctness and should be run right
+ // before the address space inference passes.
+ addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine()));
+ if (getOptLevel() != CodeGenOpt::None) {
+ addAddressSpaceInferencePasses();
+ addStraightLineScalarOptimizationPasses();
+ }
// === LSR and other generic IR passes ===
TargetPassConfig::addIRPasses();
@@ -223,7 +275,8 @@ void NVPTXPassConfig::addIRPasses() {
// %1 = shl %a, 2
//
// but EarlyCSE can do neither of them.
- addEarlyCSEOrGVNPass();
+ if (getOptLevel() != CodeGenOpt::None)
+ addEarlyCSEOrGVNPass();
}
bool NVPTXPassConfig::addInstSelector() {
@@ -241,10 +294,12 @@ bool NVPTXPassConfig::addInstSelector() {
void NVPTXPassConfig::addPostRegAlloc() {
addPass(createNVPTXPrologEpilogPass(), false);
- // NVPTXPrologEpilogPass calculates frame object offset and replace frame
- // index with VRFrame register. NVPTXPeephole need to be run after that and
- // will replace VRFrame with VRFrameLocal when possible.
- addPass(createNVPTXPeephole());
+ if (getOptLevel() != CodeGenOpt::None) {
+    // NVPTXPrologEpilogPass calculates frame object offset and replaces frame
+    // index with VRFrame register. NVPTXPeephole needs to be run after that and
+ // will replace VRFrame with VRFrameLocal when possible.
+ addPass(createNVPTXPeephole());
+ }
}
FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index da7f62bf9d9b..78a053831772 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -16,9 +16,9 @@
#include "ManagedStringPool.h"
#include "NVPTXSubtarget.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSelectionDAGInfo.h"
namespace llvm {
@@ -36,8 +36,8 @@ class NVPTXTargetMachine : public LLVMTargetMachine {
public:
NVPTXTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OP,
- bool is64bit);
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OP, bool is64bit);
~NVPTXTargetMachine() override;
const NVPTXSubtarget *getSubtargetImpl(const Function &) const override {
@@ -61,6 +61,7 @@ public:
return TLOF.get();
}
+ void addEarlyAsPossiblePasses(PassManagerBase &PM) override;
TargetIRAnalysis getTargetIRAnalysis() override;
}; // NVPTXTargetMachine.
@@ -70,7 +71,7 @@ class NVPTXTargetMachine32 : public NVPTXTargetMachine {
public:
NVPTXTargetMachine32(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
@@ -79,7 +80,7 @@ class NVPTXTargetMachine64 : public NVPTXTargetMachine {
public:
NVPTXTargetMachine64(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index 683b9a3f49f7..045fbb75a2a0 100644
--- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -12,7 +12,6 @@
#include "NVPTXSection.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
-#include <string>
namespace llvm {
class GlobalVariable;
@@ -87,7 +86,8 @@ public:
}
MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
- const Constant *C) const override {
+ const Constant *C,
+ unsigned &Align) const override {
return ReadOnlySection;
}
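
A caller-side sketch of the widened interface (the DataLayout DL, constant C, and TLOF pointer are assumptions): NVPTX ignores the new alignment out-parameter and always hands back its read-only section.

    // Hypothetical call site for the updated hook.
    unsigned Align = 0;
    llvm::MCSection *Sec = TLOF->getSectionForConstant(
        DL, llvm::SectionKind::getReadOnly(), C, Align);
    // For NVPTX, Sec is ReadOnlySection and Align is left unchanged.
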
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 6e679dd0257c..580d345cc663 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -32,7 +32,7 @@ static bool readsThreadIndex(const IntrinsicInst *II) {
}
static bool readsLaneId(const IntrinsicInst *II) {
- return II->getIntrinsicID() == Intrinsic::ptx_read_laneid;
+ return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
}
// Whether the given intrinsic is an atomic instruction in PTX.
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 0946a3293eec..08ffdf191151 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -52,6 +52,10 @@ public:
bool isSourceOfDivergence(const Value *V);
+ // Increase the inlining cost threshold by a factor of 5, reflecting that
+ // calls are particularly expensive in NVPTX.
+ unsigned getInliningThresholdMultiplier() { return 5; }
+
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
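
A rough sketch of the intended effect (assuming a TargetTransformInfo handle TTI that exposes the multiplier; the 225 base threshold is an assumption, not taken from this patch): the inline cost analysis scales its threshold by this factor, making inlining on NVPTX roughly five times more aggressive.

    // Illustrative arithmetic only.
    unsigned BaseThreshold = 225;                             // assumed default
    unsigned EffectiveThreshold =
        BaseThreshold * TTI.getInliningThresholdMultiplier(); // 225 * 5 = 1125
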
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
index 578b466568ae..835e4b442039 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -99,7 +99,7 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
}
}
-bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
+bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
unsigned &retval) {
MutexGuard Guard(Lock);
const Module *m = gv->getParent();
@@ -113,7 +113,7 @@ bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
return true;
}
-bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, std::string prop,
+bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
std::vector<unsigned> &retval) {
MutexGuard Guard(Lock);
const Module *m = gv->getParent();
diff --git a/lib/Target/NVPTX/NVPTXUtilities.h b/lib/Target/NVPTX/NVPTXUtilities.h
index a5262cb7412f..ec5bfc17afc7 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/lib/Target/NVPTX/NVPTXUtilities.h
@@ -30,8 +30,9 @@ namespace llvm {
void clearAnnotationCache(const llvm::Module *);
-bool findOneNVVMAnnotation(const llvm::GlobalValue *, std::string, unsigned &);
-bool findAllNVVMAnnotation(const llvm::GlobalValue *, std::string,
+bool findOneNVVMAnnotation(const llvm::GlobalValue *, const std::string &,
+ unsigned &);
+bool findAllNVVMAnnotation(const llvm::GlobalValue *, const std::string &,
std::vector<unsigned> &);
bool isTexture(const llvm::Value &);
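
A usage sketch of the retargeted helpers (the "kernel" key is shown only as an example of an nvvm.annotations property): the property string is now taken by const reference, so call sites are unchanged.

    #include "NVPTXUtilities.h"

    // GV is an assumed GlobalValue*; Val receives the annotation's value.
    unsigned Val = 0;
    if (llvm::findOneNVVMAnnotation(GV, "kernel", Val)) {
      // GV carries the annotation; Val holds the recorded value.
    }
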
diff --git a/lib/Target/NVPTX/NVVMIntrRange.cpp b/lib/Target/NVPTX/NVVMIntrRange.cpp
new file mode 100644
index 000000000000..b9c02c431141
--- /dev/null
+++ b/lib/Target/NVPTX/NVVMIntrRange.cpp
@@ -0,0 +1,148 @@
+//===- NVVMIntrRange.cpp - Set !range metadata for NVVM intrinsics --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass adds appropriate !range metadata for calls to NVVM
+// intrinsics that return a limited range of values.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvvm-intr-range"
+
+namespace llvm { void initializeNVVMIntrRangePass(PassRegistry &); }
+
+// Add !range metadata based on the limits of the given SM variant.
+static cl::opt<unsigned> NVVMIntrRangeSM("nvvm-intr-range-sm", cl::init(20),
+ cl::Hidden, cl::desc("SM variant"));
+
+namespace {
+class NVVMIntrRange : public FunctionPass {
+ private:
+ struct {
+ unsigned x, y, z;
+ } MaxBlockSize, MaxGridSize;
+
+ public:
+ static char ID;
+ NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {}
+ NVVMIntrRange(unsigned int SmVersion) : FunctionPass(ID) {
+ MaxBlockSize.x = 1024;
+ MaxBlockSize.y = 1024;
+ MaxBlockSize.z = 64;
+
+ MaxGridSize.x = SmVersion >= 30 ? 0x7fffffff : 0xffff;
+ MaxGridSize.y = 0xffff;
+ MaxGridSize.z = 0xffff;
+
+ initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &) override;
+};
+}
+
+FunctionPass *llvm::createNVVMIntrRangePass(unsigned int SmVersion) {
+ return new NVVMIntrRange(SmVersion);
+}
+
+char NVVMIntrRange::ID = 0;
+INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range",
+ "Add !range metadata to NVVM intrinsics.", false, false)
+
+// Adds the passed-in [Low,High) range information as metadata to the
+// passed-in call instruction.
+static bool addRangeMetadata(uint64_t Low, uint64_t High, CallInst *C) {
+ LLVMContext &Context = C->getParent()->getContext();
+ IntegerType *Int32Ty = Type::getInt32Ty(Context);
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Low)),
+ ConstantAsMetadata::get(ConstantInt::get(Int32Ty, High))};
+ C->setMetadata(LLVMContext::MD_range, MDNode::get(Context, LowAndHigh));
+ return true;
+}
+
+bool NVVMIntrRange::runOnFunction(Function &F) {
+ // Go through the calls in this function.
+ bool Changed = false;
+ for (Instruction &I : instructions(F)) {
+ CallInst *Call = dyn_cast<CallInst>(&I);
+ if (!Call)
+ continue;
+
+ if (Function *Callee = Call->getCalledFunction()) {
+ switch (Callee->getIntrinsicID()) {
+ // Index within block
+ case Intrinsic::nvvm_read_ptx_sreg_tid_x:
+ Changed |= addRangeMetadata(0, MaxBlockSize.x, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_tid_y:
+ Changed |= addRangeMetadata(0, MaxBlockSize.y, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_tid_z:
+ Changed |= addRangeMetadata(0, MaxBlockSize.z, Call);
+ break;
+
+ // Block size
+ case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
+ Changed |= addRangeMetadata(1, MaxBlockSize.x+1, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
+ Changed |= addRangeMetadata(1, MaxBlockSize.y+1, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
+ Changed |= addRangeMetadata(1, MaxBlockSize.z+1, Call);
+ break;
+
+ // Index within grid
+ case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
+ Changed |= addRangeMetadata(0, MaxGridSize.x, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
+ Changed |= addRangeMetadata(0, MaxGridSize.y, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
+ Changed |= addRangeMetadata(0, MaxGridSize.z, Call);
+ break;
+
+ // Grid size
+ case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
+ Changed |= addRangeMetadata(1, MaxGridSize.x+1, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
+ Changed |= addRangeMetadata(1, MaxGridSize.y+1, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
+ Changed |= addRangeMetadata(1, MaxGridSize.z+1, Call);
+ break;
+
+      // The warp size is a constant 32.
+ case Intrinsic::nvvm_read_ptx_sreg_warpsize:
+ Changed |= addRangeMetadata(32, 32+1, Call);
+ break;
+
+ // Lane ID is [0..warpsize)
+ case Intrinsic::nvvm_read_ptx_sreg_laneid:
+ Changed |= addRangeMetadata(0, 32, Call);
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+
+ return Changed;
+}
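
A standalone usage sketch of the new pass (the sm_35 value and the module M are assumptions), together with the metadata it produces for a thread-index read, which follows directly from addRangeMetadata above:

    #include "llvm/IR/LegacyPassManager.h"

    llvm::legacy::PassManager PM;
    PM.add(llvm::createNVVMIntrRangePass(/*SmVersion=*/35));
    PM.run(M); // M is the module being prepared for NVPTX
    // Resulting IR for a tid.x read:
    //   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !0
    //   !0 = !{i32 0, i32 1024}
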
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index 20ab5db584d2..e0c35e7039e5 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -7,20 +7,26 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass replaces occurrences of __nvvm_reflect("string") with an
-// integer based on -nvvm-reflect-list string=<int> option given to this pass.
-// If an undefined string value is seen in a call to __nvvm_reflect("string"),
-// a default value of 0 will be used.
+// This pass replaces occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect
+// with an integer.
+//
+// We choose the value we use by looking, in this order, at:
+//
+// * the -nvvm-reflect-list flag, which has the format "foo=1,bar=42",
+// * the StringMap passed to the pass's constructor, and
+// * metadata in the module itself.
+//
+// If we see an unknown string, we replace its call with 0.
//
//===----------------------------------------------------------------------===//
#include "NVPTX.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
@@ -31,11 +37,8 @@
#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include <map>
#include <sstream>
#include <string>
-#include <vector>
-
#define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
using namespace llvm;
@@ -45,31 +48,21 @@ using namespace llvm;
namespace llvm { void initializeNVVMReflectPass(PassRegistry &); }
namespace {
-class NVVMReflect : public ModulePass {
+class NVVMReflect : public FunctionPass {
private:
StringMap<int> VarMap;
- typedef DenseMap<std::string, int>::iterator VarMapIter;
public:
static char ID;
- NVVMReflect() : ModulePass(ID) {
- initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
- VarMap.clear();
- }
+ NVVMReflect() : NVVMReflect(StringMap<int>()) {}
NVVMReflect(const StringMap<int> &Mapping)
- : ModulePass(ID) {
+ : FunctionPass(ID), VarMap(Mapping) {
initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
- for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end();
- I != E; ++I) {
- VarMap[(*I).getKey()] = (*I).getValue();
- }
+ setVarMap();
}
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- bool runOnModule(Module &) override;
+ bool runOnFunction(Function &) override;
private:
bool handleFunction(Function *ReflectFunction);
@@ -77,11 +70,8 @@ private:
};
}
-ModulePass *llvm::createNVVMReflectPass() {
- return new NVVMReflect();
-}
-
-ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping) {
+FunctionPass *llvm::createNVVMReflectPass() { return new NVVMReflect(); }
+FunctionPass *llvm::createNVVMReflectPass(const StringMap<int> &Mapping) {
return new NVVMReflect(Mapping);
}
@@ -123,30 +113,35 @@ void NVVMReflect::setVarMap() {
}
}
-bool NVVMReflect::handleFunction(Function *ReflectFunction) {
- // Validate _reflect function
- assert(ReflectFunction->isDeclaration() &&
- "_reflect function should not have a body");
- assert(ReflectFunction->getReturnType()->isIntegerTy() &&
- "_reflect's return type should be integer");
+bool NVVMReflect::runOnFunction(Function &F) {
+ if (!NVVMReflectEnabled)
+ return false;
+
+ if (F.getName() == NVVM_REFLECT_FUNCTION) {
+ assert(F.isDeclaration() && "_reflect function should not have a body");
+ assert(F.getReturnType()->isIntegerTy() &&
+ "_reflect's return type should be integer");
+ return false;
+ }
- std::vector<Instruction *> ToRemove;
+ SmallVector<Instruction *, 4> ToRemove;
- // Go through the uses of ReflectFunction in this Function.
- // Each of them should a CallInst with a ConstantArray argument.
- // First validate that. If the c-string corresponding to the
- // ConstantArray can be found successfully, see if it can be
- // found in VarMap. If so, replace the uses of CallInst with the
- // value found in VarMap. If not, replace the use with value 0.
+ // Go through the calls in this function. Each call to __nvvm_reflect or
+ // llvm.nvvm.reflect should be a CallInst with a ConstantArray argument.
+ // First validate that. If the c-string corresponding to the ConstantArray can
+ // be found successfully, see if it can be found in VarMap. If so, replace the
+ // uses of CallInst with the value found in VarMap. If not, replace the use
+ // with value 0.
- // IR for __nvvm_reflect calls differs between CUDA versions:
+ // The IR for __nvvm_reflect calls differs between CUDA versions.
+ //
// CUDA 6.5 and earlier uses this sequence:
// %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8
// (i8 addrspace(4)* getelementptr inbounds
// ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0))
// %reflect = tail call i32 @__nvvm_reflect(i8* %ptr)
//
- // Value returned by Sym->getOperand(0) is a Constant with a
+ // The value returned by Sym->getOperand(0) is a Constant with a
// ConstantDataSequential operand which can be converted to string and used
// for lookup.
//
@@ -157,31 +152,37 @@ bool NVVMReflect::handleFunction(Function *ReflectFunction) {
//
// In this case, we get a Constant with a GlobalVariable operand and we need
// to dig deeper to find its initializer with the string we'll use for lookup.
-
- for (User *U : ReflectFunction->users()) {
- assert(isa<CallInst>(U) && "Only a call instruction can use _reflect");
- CallInst *Reflect = cast<CallInst>(U);
-
- assert((Reflect->getNumOperands() == 2) &&
- "Only one operand expect for _reflect function");
- // In cuda, we will have an extra constant-to-generic conversion of
- // the string.
- const Value *Str = Reflect->getArgOperand(0);
- if (isa<CallInst>(Str)) {
- // CUDA path
- const CallInst *ConvCall = cast<CallInst>(Str);
+ for (Instruction &I : instructions(F)) {
+ CallInst *Call = dyn_cast<CallInst>(&I);
+ if (!Call)
+ continue;
+ Function *Callee = Call->getCalledFunction();
+ if (!Callee || (Callee->getName() != NVVM_REFLECT_FUNCTION &&
+ Callee->getIntrinsicID() != Intrinsic::nvvm_reflect))
+ continue;
+
+ // FIXME: Improve error handling here and elsewhere in this pass.
+ assert(Call->getNumOperands() == 2 &&
+ "Wrong number of operands to __nvvm_reflect function");
+
+  // In CUDA 6.5 and earlier, we will have an extra constant-to-generic
+ // conversion of the string.
+ const Value *Str = Call->getArgOperand(0);
+ if (const CallInst *ConvCall = dyn_cast<CallInst>(Str)) {
+ // FIXME: Add assertions about ConvCall.
Str = ConvCall->getArgOperand(0);
}
assert(isa<ConstantExpr>(Str) &&
- "Format of _reflect function not recognized");
+           "Format of __nvvm_reflect function not recognized");
const ConstantExpr *GEP = cast<ConstantExpr>(Str);
const Value *Sym = GEP->getOperand(0);
- assert(isa<Constant>(Sym) && "Format of _reflect function not recognized");
+ assert(isa<Constant>(Sym) &&
+ "Format of __nvvm_reflect function not recognized");
const Value *Operand = cast<Constant>(Sym)->getOperand(0);
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Operand)) {
- // For CUDA-7.0 style __nvvm_reflect calls we need to find operand's
+ // For CUDA-7.0 style __nvvm_reflect calls, we need to find the operand's
// initializer.
assert(GV->hasInitializer() &&
"Format of _reflect function not recognized");
@@ -194,57 +195,26 @@ bool NVVMReflect::handleFunction(Function *ReflectFunction) {
assert(cast<ConstantDataSequential>(Operand)->isCString() &&
"Format of _reflect function not recognized");
- std::string ReflectArg =
- cast<ConstantDataSequential>(Operand)->getAsString();
-
+ StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString();
ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
int ReflectVal = 0; // The default value is 0
- if (VarMap.find(ReflectArg) != VarMap.end()) {
- ReflectVal = VarMap[ReflectArg];
- }
- Reflect->replaceAllUsesWith(
- ConstantInt::get(Reflect->getType(), ReflectVal));
- ToRemove.push_back(Reflect);
- }
- if (ToRemove.size() == 0)
- return false;
-
- for (unsigned i = 0, e = ToRemove.size(); i != e; ++i)
- ToRemove[i]->eraseFromParent();
- return true;
-}
-
-bool NVVMReflect::runOnModule(Module &M) {
- if (!NVVMReflectEnabled)
- return false;
-
- setVarMap();
-
-
- bool Res = false;
- std::string Name;
- Type *Tys[1];
- Type *I8Ty = Type::getInt8Ty(M.getContext());
- Function *ReflectFunction;
-
- // Check for standard overloaded versions of llvm.nvvm.reflect
-
- for (unsigned i = 0; i != 5; ++i) {
- Tys[0] = PointerType::get(I8Ty, i);
- Name = Intrinsic::getName(Intrinsic::nvvm_reflect, Tys);
- ReflectFunction = M.getFunction(Name);
- if(ReflectFunction != 0) {
- Res |= handleFunction(ReflectFunction);
+ auto Iter = VarMap.find(ReflectArg);
+ if (Iter != VarMap.end())
+ ReflectVal = Iter->second;
+ else if (ReflectArg == "__CUDA_FTZ") {
+ // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag.
+ if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
+ F.getParent()->getModuleFlag("nvvm-reflect-ftz")))
+ ReflectVal = Flag->getSExtValue();
}
+ Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
+ ToRemove.push_back(Call);
}
- ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION);
- // If reflect function is not used, then there will be
- // no entry in the module.
- if (ReflectFunction != 0)
- Res |= handleFunction(ReflectFunction);
+ for (Instruction *I : ToRemove)
+ I->eraseFromParent();
- return Res;
+ return ToRemove.size() > 0;
}
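
Two frontend-side sketches of feeding values into the rewritten pass (the __CUDA_ARCH key and the Override flag behavior are illustrative assumptions): the per-module flag is consulted for __CUDA_FTZ only when neither the -nvvm-reflect-list option nor the constructor mapping supplies a value.

    #include "llvm/ADT/StringMap.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"

    // 1. Module flag, read for __CUDA_FTZ when VarMap has no entry.
    M.addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", 1);

    // 2. Explicit mapping handed to the pass at construction time.
    llvm::StringMap<int> ReflectMap;
    ReflectMap["__CUDA_ARCH"] = 350; // hypothetical key/value pair
    llvm::legacy::PassManager PM;
    PM.add(llvm::createNVVMReflectPass(ReflectMap));
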
diff --git a/lib/Target/NVPTX/TargetInfo/Makefile b/lib/Target/NVPTX/TargetInfo/Makefile
deleted file mode 100644
index 8622315b47b9..000000000000
--- a/lib/Target/NVPTX/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/NVPTX/TargetInfo/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMNVPTXInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common