path: root/lib/Target/AMDGPU
author     Dimitry Andric <dim@FreeBSD.org>   2017-07-19 07:02:10 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2017-07-19 07:02:10 +0000
commit     93c91e39b29142dec1d03a30df9f6e757f56c193 (patch)
tree       33a9b014a327e64450b3c9ed46d8c5bdb78ad345 /lib/Target/AMDGPU
parent     ca089b24d48ef6fa8da2d0bb8c25bb802c4a95c0 (diff)
Vendor import of llvm trunk r308421 (vendor/llvm/llvm-trunk-r308421)
Notes:
    svn path=/vendor/llvm/dist/; revision=321184
    svn path=/vendor/llvm/llvm-trunk-r308421/; revision=321185; tag=vendor/llvm/llvm-trunk-r308421
Diffstat (limited to 'lib/Target/AMDGPU')
 lib/Target/AMDGPU/AMDGPU.h                            |   2
 lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp    | 252
 lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp                |   9
 lib/Target/AMDGPU/AMDGPUISelLowering.cpp              |  32
 lib/Target/AMDGPU/AMDGPUISelLowering.h                |   1
 lib/Target/AMDGPU/AMDGPUSubtarget.cpp                 |   2
 lib/Target/AMDGPU/AMDGPUSubtarget.h                   |   4
 lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp       |   2
 lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp |   5
 lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h   |   1
 lib/Target/AMDGPU/SIFoldOperands.cpp                  |   1
 lib/Target/AMDGPU/SIFrameLowering.cpp                 |  11
 lib/Target/AMDGPU/SIISelLowering.cpp                  | 119
 lib/Target/AMDGPU/SIISelLowering.h                    |   2
 lib/Target/AMDGPU/SIInstrInfo.cpp                     |  24
 lib/Target/AMDGPU/SIInstrInfo.h                       |  19
 lib/Target/AMDGPU/SIInstrInfo.td                      |   2
 lib/Target/AMDGPU/SIInstructions.td                   |   2
 lib/Target/AMDGPU/SIMachineFunctionInfo.cpp           |  68
 lib/Target/AMDGPU/SIMachineFunctionInfo.h             |  10
 lib/Target/AMDGPU/SIRegisterInfo.cpp                  |   4
 lib/Target/AMDGPU/SIRegisterInfo.td                   |  17
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp            |  18
 lib/Target/AMDGPU/VOP2Instructions.td                 |  11
 lib/Target/AMDGPU/VOP3Instructions.td                 |  11
 lib/Target/AMDGPU/VOP3PInstructions.td                |  10
 lib/Target/AMDGPU/VOPCInstructions.td                 |  24
 lib/Target/AMDGPU/VOPInstructions.td                  |   2
 28 files changed, 446 insertions(+), 219 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 5a799b2d88d0..568682899be5 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -56,7 +56,7 @@ extern char &AMDGPUMachineCFGStructurizerID;
void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
-ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
+Pass *createAMDGPUAnnotateKernelFeaturesPass();
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 7235d8fae332..c68e5861ff25 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -15,8 +15,10 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -26,26 +28,27 @@ using namespace llvm;
namespace {
-class AMDGPUAnnotateKernelFeatures : public ModulePass {
+class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
+ const TargetMachine *TM = nullptr;
AMDGPUAS AS;
- static bool hasAddrSpaceCast(const Function &F, AMDGPUAS AS);
- void addAttrToCallers(Function *Intrin, StringRef AttrName);
- bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
+ bool addFeatureAttributes(Function &F);
public:
static char ID;
- AMDGPUAnnotateKernelFeatures() : ModulePass(ID) {}
- bool runOnModule(Module &M) override;
+ AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
+
+ bool doInitialization(CallGraph &CG) override;
+ bool runOnSCC(CallGraphSCC &SCC) override;
StringRef getPassName() const override {
return "AMDGPU Annotate Kernel Features";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
- ModulePass::getAnalysisUsage(AU);
+ CallGraphSCCPass::getAnalysisUsage(AU);
}
static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS);
@@ -121,16 +124,130 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
return false;
}
-// Return true if an addrspacecast is used that requires the queue ptr.
-bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F,
- AMDGPUAS AS) {
+// We do not need to note the x workitem or workgroup id because they are always
+// initialized.
+//
+// TODO: We should not add the attributes if the known compile time workgroup
+// size is 1 for y/z.
+static StringRef intrinsicToAttrName(Intrinsic::ID ID,
+ bool &NonKernelOnly,
+ bool &IsQueuePtr) {
+ switch (ID) {
+ case Intrinsic::amdgcn_workitem_id_x:
+ NonKernelOnly = true;
+ return "amdgpu-work-item-id-x";
+ case Intrinsic::amdgcn_workgroup_id_x:
+ NonKernelOnly = true;
+ return "amdgpu-work-group-id-x";
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::r600_read_tidig_y:
+ return "amdgpu-work-item-id-y";
+ case Intrinsic::amdgcn_workitem_id_z:
+ case Intrinsic::r600_read_tidig_z:
+ return "amdgpu-work-item-id-z";
+ case Intrinsic::amdgcn_workgroup_id_y:
+ case Intrinsic::r600_read_tgid_y:
+ return "amdgpu-work-group-id-y";
+ case Intrinsic::amdgcn_workgroup_id_z:
+ case Intrinsic::r600_read_tgid_z:
+ return "amdgpu-work-group-id-z";
+ case Intrinsic::amdgcn_dispatch_ptr:
+ return "amdgpu-dispatch-ptr";
+ case Intrinsic::amdgcn_dispatch_id:
+ return "amdgpu-dispatch-id";
+ case Intrinsic::amdgcn_kernarg_segment_ptr:
+ case Intrinsic::amdgcn_implicitarg_ptr:
+ return "amdgpu-kernarg-segment-ptr";
+ case Intrinsic::amdgcn_queue_ptr:
+ case Intrinsic::trap:
+ case Intrinsic::debugtrap:
+ IsQueuePtr = true;
+ return "amdgpu-queue-ptr";
+ default:
+ return "";
+ }
+}
+
+static bool handleAttr(Function &Parent, const Function &Callee,
+ StringRef Name) {
+ if (Callee.hasFnAttribute(Name)) {
+ Parent.addFnAttr(Name);
+ return true;
+ }
+
+ return false;
+}
+
+static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
+ bool &NeedQueuePtr) {
+ // X ids unnecessarily propagated to kernels.
+ static const StringRef AttrNames[] = {
+ { "amdgpu-work-item-id-x" },
+ { "amdgpu-work-item-id-y" },
+ { "amdgpu-work-item-id-z" },
+ { "amdgpu-work-group-id-x" },
+ { "amdgpu-work-group-id-y" },
+ { "amdgpu-work-group-id-z" },
+ { "amdgpu-dispatch-ptr" },
+ { "amdgpu-dispatch-id" },
+ { "amdgpu-kernarg-segment-ptr" }
+ };
+
+ if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
+ NeedQueuePtr = true;
+
+ for (StringRef AttrName : AttrNames)
+ handleAttr(Parent, Callee, AttrName);
+}
+
+bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ bool HasFlat = ST.hasFlatAddressSpace();
+ bool HasApertureRegs = ST.hasApertureRegs();
SmallPtrSet<const Constant *, 8> ConstantExprVisited;
- for (const BasicBlock &BB : F) {
- for (const Instruction &I : BB) {
+ bool Changed = false;
+ bool NeedQueuePtr = false;
+ bool HaveCall = false;
+ bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
+
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ CallSite CS(&I);
+ if (CS) {
+ Function *Callee = CS.getCalledFunction();
+
+ // TODO: Do something with indirect calls.
+ if (!Callee) {
+ if (!CS.isInlineAsm())
+ HaveCall = true;
+ continue;
+ }
+
+ Intrinsic::ID IID = Callee->getIntrinsicID();
+ if (IID == Intrinsic::not_intrinsic) {
+ HaveCall = true;
+ copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
+ Changed = true;
+ } else {
+ bool NonKernelOnly = false;
+ StringRef AttrName = intrinsicToAttrName(IID,
+ NonKernelOnly, NeedQueuePtr);
+ if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
+ F.addFnAttr(AttrName);
+ Changed = true;
+ }
+ }
+ }
+
+ if (NeedQueuePtr || HasApertureRegs)
+ continue;
+
if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
- if (castRequiresQueuePtr(ASC, AS))
- return true;
+ if (castRequiresQueuePtr(ASC, AS)) {
+ NeedQueuePtr = true;
+ continue;
+ }
}
for (const Use &U : I.operands()) {
@@ -138,100 +255,57 @@ bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F,
if (!OpC)
continue;
- if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS))
- return true;
+ if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) {
+ NeedQueuePtr = true;
+ break;
+ }
}
}
}
- return false;
-}
-
-void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
- StringRef AttrName) {
- SmallPtrSet<Function *, 4> SeenFuncs;
-
- for (User *U : Intrin->users()) {
- // CallInst is the only valid user for an intrinsic.
- CallInst *CI = cast<CallInst>(U);
-
- Function *CallingFunction = CI->getParent()->getParent();
- if (SeenFuncs.insert(CallingFunction).second)
- CallingFunction->addFnAttr(AttrName);
+ if (NeedQueuePtr) {
+ F.addFnAttr("amdgpu-queue-ptr");
+ Changed = true;
}
-}
-
-bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics(
- Module &M,
- ArrayRef<StringRef[2]> IntrinsicToAttr) {
- bool Changed = false;
- for (const StringRef *Arr : IntrinsicToAttr) {
- if (Function *Fn = M.getFunction(Arr[0])) {
- addAttrToCallers(Fn, Arr[1]);
- Changed = true;
- }
+ // TODO: We could refine this to captured pointers that could possibly be
+ // accessed by flat instructions. For now this is mostly a poor way of
+ // estimating whether there are calls before argument lowering.
+ if (HasFlat && !IsFunc && HaveCall) {
+ F.addFnAttr("amdgpu-flat-scratch");
+ Changed = true;
}
return Changed;
}
-bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
+bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
+ Module &M = SCC.getCallGraph().getModule();
Triple TT(M.getTargetTriple());
- AS = AMDGPU::getAMDGPUAS(M);
-
- static const StringRef IntrinsicToAttr[][2] = {
- // .x omitted
- { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" },
- { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" },
-
- { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" },
- { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" },
-
- { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
- { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
-
- // .x omitted
- { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
- { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
- };
- static const StringRef HSAIntrinsicToAttr[][2] = {
- { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" },
- { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" },
- { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" },
- { "llvm.trap", "amdgpu-queue-ptr" },
- { "llvm.debugtrap", "amdgpu-queue-ptr" }
- };
-
- // TODO: We should not add the attributes if the known compile time workgroup
- // size is 1 for y/z.
-
- // TODO: Intrinsics that require queue ptr.
+ bool Changed = false;
+ for (CallGraphNode *I : SCC) {
+ Function *F = I->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
- // We do not need to note the x workitem or workgroup id because they are
- // always initialized.
+ Changed |= addFeatureAttributes(*F);
+ }
- bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
- if (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::Mesa3D) {
- Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
- for (Function &F : M) {
- if (F.hasFnAttribute("amdgpu-queue-ptr"))
- continue;
+ return Changed;
+}
- auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
- bool HasApertureRegs = TPC && TPC->getTM<TargetMachine>()
- .getSubtarget<AMDGPUSubtarget>(F)
- .hasApertureRegs();
- if (!HasApertureRegs && hasAddrSpaceCast(F, AS))
- F.addFnAttr("amdgpu-queue-ptr");
- }
- }
+bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ report_fatal_error("TargetMachine is required");
- return Changed;
+ AS = AMDGPU::getAMDGPUAS(CG.getModule());
+ TM = &TPC->getTM<TargetMachine>();
+ return false;
}
-ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
return new AMDGPUAnnotateKernelFeatures();
}
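
The move from ModulePass to CallGraphSCCPass is what makes the new attribute propagation work: SCCs are visited bottom-up, so a callee's feature attributes are already final when copyFeaturesToFunction runs for its callers. A minimal standalone sketch of that idiom (hypothetical Func type and propagate helper, not code from this patch):

    #include <set>
    #include <string>
    #include <vector>

    struct Func {
      std::set<std::string> Attrs;      // e.g. "amdgpu-queue-ptr"
      std::vector<Func *> Callees;
    };

    // Visit functions callee-before-caller; each caller inherits every
    // feature attribute any of its callees requires.
    static bool propagate(Func &Caller) {
      bool Changed = false;
      for (Func *Callee : Caller.Callees)
        for (const std::string &A : Callee->Attrs)
          Changed |= Caller.Attrs.insert(A).second;
      return Changed;
    }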
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 83ad1a5c6ee3..2247814cfe55 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -268,20 +268,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
CurrentProgramInfo.ScratchSize,
getFunctionCodeSize(MF));
- OutStreamer->emitRawComment(" codeLenInByte = " +
- Twine(getFunctionCodeSize(MF)), false);
- OutStreamer->emitRawComment(
- " NumSgprs: " + Twine(CurrentProgramInfo.NumSGPR), false);
- OutStreamer->emitRawComment(
- " NumVgprs: " + Twine(CurrentProgramInfo.NumVGPR), false);
-
OutStreamer->emitRawComment(
" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
OutStreamer->emitRawComment(
" IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
OutStreamer->emitRawComment(
- " ScratchSize: " + Twine(CurrentProgramInfo.ScratchSize), false);
- OutStreamer->emitRawComment(
" LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
" bytes/workgroup (compile time only)", false);
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 2553cf4da0fe..258b1737deb3 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -573,6 +573,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FABS);
+ setTargetDAGCombine(ISD::AssertZext);
+ setTargetDAGCombine(ISD::AssertSext);
}
//===----------------------------------------------------------------------===//
@@ -883,7 +885,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
-/// represents a single value that will be stored in regsters. Ins[x].VT is
+/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
@@ -2591,6 +2593,31 @@ SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
return SDValue(CSrc, 0);
}
+// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
+// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
+// issues.
+SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+
+ // (vt2 (assertzext (truncate vt0:x), vt1)) ->
+ // (vt2 (truncate (assertzext vt0:x, vt1)))
+ if (N0.getOpcode() == ISD::TRUNCATE) {
+ SDValue N1 = N->getOperand(1);
+ EVT ExtVT = cast<VTSDNode>(N1)->getVT();
+ SDLoc SL(N);
+
+ SDValue Src = N0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.bitsGE(ExtVT)) {
+ SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
+ return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
+ }
+ }
+
+ return SDValue();
+}
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
@@ -3521,6 +3548,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
+ case ISD::AssertZext:
+ case ISD::AssertSext:
+ return performAssertSZExtCombine(N, DCI);
}
return SDValue();
}
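
As a concrete instance of the new assert-ext combine (types illustrative): with a 64-bit value known to fit in 16 bits,

    (i32 (assertzext (truncate i64:x), i16))
      --> (i32 (truncate (assertzext i64:x, i16)))

which is valid here because the source type is at least as wide as the asserted type, i.e. the SrcVT.bitsGE(ExtVT) guard holds.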
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index a45234e2b39f..d85aada6053a 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -76,6 +76,7 @@ protected:
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
unsigned Opc, SDValue LHS,
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 1bc5a52053ec..779617629010 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -277,7 +277,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
// Make sure requested values are compatible with values implied by requested
// minimum/maximum flat work group sizes.
if (RequestedFlatWorkGroupSize &&
- Requested.first > MinImpliedByFlatWorkGroupSize)
+ Requested.first < MinImpliedByFlatWorkGroupSize)
return Default;
return Requested;
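
The flipped comparison fixes an inverted feasibility check: MinImpliedByFlatWorkGroupSize is a lower bound, so the requested range is only invalid when its minimum falls below that bound. With illustrative numbers, if the requested flat work group size implies at least 4 waves per EU, a request of (2, 8) waves per EU must fall back to Default while (6, 8) is acceptable; the old `>` test handled both cases backwards.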
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 22cede59086a..d4b6a5fe8020 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -359,6 +359,10 @@ public:
return FP64FP16Denormals;
}
+ bool supportsMinMaxDenormModes() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
+ }
+
bool hasFPExceptions() const {
return FPExceptions;
}
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index e3c90f250600..b37c274102bc 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1208,7 +1208,7 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const {
}
bool AMDGPUOperand::isLiteralImm(MVT type) const {
- // Check that this imediate can be added as literal
+ // Check that this immediate can be added as literal
if (!isImmTy(ImmTyNone)) {
return false;
}
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index f26e49295e69..966c6fec20c6 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -87,6 +87,7 @@ DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass)
DECODE_OPERAND_REG(VGPR_32)
DECODE_OPERAND_REG(VS_32)
DECODE_OPERAND_REG(VS_64)
+DECODE_OPERAND_REG(VS_128)
DECODE_OPERAND_REG(VReg_64)
DECODE_OPERAND_REG(VReg_96)
@@ -318,6 +319,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const {
return decodeSrcOp(OPW64, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_VS_128(unsigned Val) const {
+ return decodeSrcOp(OPW128, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const {
return decodeSrcOp(OPW16, Val);
}
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 3d71db909e20..4c755be09999 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -70,6 +70,7 @@ public:
MCOperand decodeOperand_VGPR_32(unsigned Val) const;
MCOperand decodeOperand_VS_32(unsigned Val) const;
MCOperand decodeOperand_VS_64(unsigned Val) const;
+ MCOperand decodeOperand_VS_128(unsigned Val) const;
MCOperand decodeOperand_VSrc16(unsigned Val) const;
MCOperand decodeOperand_VSrcV216(unsigned Val) const;
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3af242d9ea66..0aad8f0843d6 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -653,6 +653,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// again. The same constant folded instruction could also have a second
// use operand.
NextUse = MRI->use_begin(Dst.getReg());
+ FoldList.clear();
continue;
}
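
The added clear matters because the loop restarts iteration over the uses of Dst after a successful constant fold: fold candidates already queued in FoldList refer to the pre-fold state and would otherwise risk being applied a second time on the rescan.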
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index 08a64de38501..7334781916d8 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -158,7 +158,7 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
// No replacement necessary.
if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
!MRI.isPhysRegUsed(ScratchWaveOffsetReg)) {
- assert(MFI->getStackPtrOffsetReg() == AMDGPU::NoRegister);
+ assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG);
return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister);
}
@@ -246,13 +246,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// this point it appears we need the setup. This part of the prolog should be
// emitted after frame indices are eliminated.
- if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
+ if (MFI->hasFlatScratchInit())
emitFlatScratchInit(ST, MF, MBB);
unsigned SPReg = MFI->getStackPtrOffsetReg();
- if (SPReg != AMDGPU::NoRegister) {
+ if (SPReg != AMDGPU::SP_REG) {
+ assert(MRI.isReserved(SPReg) && "SPReg used but not reserved");
+
DebugLoc DL;
- int64_t StackSize = MF.getFrameInfo().getStackSize();
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ int64_t StackSize = FrameInfo.getStackSize();
if (StackSize == 0) {
BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg)
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 2ba570b9ebbb..2356405f0919 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1171,8 +1171,7 @@ static void allocateSystemSGPRs(CCState &CCInfo,
static void reservePrivateMemoryRegs(const TargetMachine &TM,
MachineFunction &MF,
const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info,
- bool NeedSP) {
+ SIMachineFunctionInfo &Info) {
// Now that we've figured out where the scratch register inputs are, see if
// should reserve the arguments and use them directly.
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1234,15 +1233,6 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
}
}
-
- if (NeedSP) {
- unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF);
- Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
-
- assert(Info.getStackPtrOffsetReg() != Info.getFrameOffsetReg());
- assert(!TRI.isSubRegister(Info.getScratchRSrcReg(),
- Info.getStackPtrOffsetReg()));
- }
}
SDValue SITargetLowering::LowerFormalArguments(
@@ -1380,10 +1370,37 @@ SDValue SITargetLowering::LowerFormalArguments(
unsigned Reg = VA.getLocReg();
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+ EVT ValVT = VA.getValVT();
Reg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ // If this is an 8 or 16-bit value, it is really passed promoted
+ // to 32 bits. Insert an assert[sz]ext to capture this, then
+ // truncate to the right size.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
+ break;
+ case CCValAssign::SExt:
+ Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
+ DAG.getValueType(ValVT));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+ break;
+ case CCValAssign::ZExt:
+ Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
+ DAG.getValueType(ValVT));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+ break;
+ case CCValAssign::AExt:
+ Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
if (IsShader && Arg.VT.isVector()) {
// Build a vector from the registers
Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
@@ -1410,25 +1427,13 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-
- // TODO: Could maybe omit SP if only tail calls?
- bool NeedSP = FrameInfo.hasCalls() || FrameInfo.hasVarSizedObjects();
-
// Start adding system SGPRs.
if (IsEntryFunc) {
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
- reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info, NeedSP);
} else {
CCInfo.AllocateReg(Info->getScratchRSrcReg());
CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
CCInfo.AllocateReg(Info->getFrameOffsetReg());
-
- if (NeedSP) {
- unsigned StackPtrReg = findFirstFreeSGPR(CCInfo);
- CCInfo.AllocateReg(StackPtrReg);
- Info->setStackPtrOffsetReg(StackPtrReg);
- }
}
return Chains.empty() ? Chain :
@@ -4624,8 +4629,8 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
return DAG.isKnownNeverNaN(Op);
}
-static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
- unsigned MaxDepth=5) {
+static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
+ const SISubtarget *ST, unsigned MaxDepth=5) {
// If source is a result of another standard FP operation it is already in
// canonical form.
@@ -4663,7 +4668,7 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
case ISD::FNEG:
case ISD::FABS:
return (MaxDepth > 0) &&
- isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1);
+ isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
case ISD::FSIN:
case ISD::FCOS:
@@ -4672,16 +4677,19 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
// In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
// For such targets need to check their input recursively.
- // TODO: on GFX9+ we could return true without checking provided no-nan
- // mode, since canonicalization is also used to quiet sNaNs.
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMINNAN:
case ISD::FMAXNAN:
+ if (ST->supportsMinMaxDenormModes() &&
+ DAG.isKnownNeverNaN(Op.getOperand(0)) &&
+ DAG.isKnownNeverNaN(Op.getOperand(1)))
+ return true;
+
return (MaxDepth > 0) &&
- isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) &&
- isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1);
+ isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
+ isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
case ISD::ConstantFP: {
auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
@@ -4700,11 +4708,19 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
if (!CFP) {
SDValue N0 = N->getOperand(0);
+ EVT VT = N0.getValueType().getScalarType();
+ auto ST = getSubtarget();
+
+ if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
+ (VT == MVT::f64 && ST->hasFP64Denormals()) ||
+ (VT == MVT::f16 && ST->hasFP16Denormals())) &&
+ DAG.isKnownNeverNaN(N0))
+ return N0;
bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
- isCanonicalized(N0, getSubtarget()))
+ isCanonicalized(DAG, N0, ST))
return N0;
return SDValue();
@@ -5813,3 +5829,44 @@ SITargetLowering::getConstraintType(StringRef Constraint) const {
}
return TargetLowering::getConstraintType(Constraint);
}
+
+// Figure out which registers should be reserved for stack access. Only after
+// the function is legalized do we know all of the non-spill stack objects or if
+// calls are present.
+void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ if (Info->isEntryFunction()) {
+ // Callable functions have fixed registers used for stack access.
+ reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
+ }
+
+ // We have to assume the SP is needed in case there are calls in the function
+ // during lowering. Calls are only detected after the function is
+ // lowered. We're about to reserve registers, so don't bother using it if we
+ // aren't really going to use it.
+ bool NeedSP = !Info->isEntryFunction() ||
+ MFI.hasVarSizedObjects() ||
+ MFI.hasCalls();
+
+ if (NeedSP) {
+ unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
+ Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
+
+ assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
+ assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
+ Info->getStackPtrOffsetReg()));
+ MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
+ }
+
+ MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
+ MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
+ MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
+ Info->getScratchWaveOffsetReg());
+
+ TargetLoweringBase::finalizeLowering(MF);
+}
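
finalizeLowering runs once the function body has been lowered, which is the first point at which hasCalls() and hasVarSizedObjects() are reliable; until then, selection targets the stable placeholder registers (SP_REG, FP_REG, PRIVATE_RSRC_REG, SCRATCH_WAVE_OFFSET_REG) and this hook rewrites every use in one pass. A minimal sketch of the placeholder-then-rewrite idiom (hypothetical names, standing in for what MRI.replaceRegWith does above):

    #include <vector>

    using Reg = unsigned;
    constexpr Reg SP_PLACEHOLDER = ~0u;  // stands in for AMDGPU::SP_REG

    // Once the real stack pointer register is chosen, rewrite every
    // operand that was selected against the placeholder.
    static void finalize(std::vector<Reg> &Operands, Reg RealSP) {
      for (Reg &R : Operands)
        if (R == SP_PLACEHOLDER)
          R = RealSP;
    }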
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 83392a7ab1b2..e6bb3d6cd419 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -232,6 +232,8 @@ public:
ConstraintType getConstraintType(StringRef Constraint) const override;
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
SDValue V) const;
+
+ void finalizeLowering(MachineFunction &MF) const override;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 160f8837d49c..a7e0feb10b9f 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3408,8 +3408,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
}
void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
- SmallVector<MachineInstr *, 128> Worklist;
- Worklist.push_back(&TopInst);
+ SetVectorType Worklist;
+ Worklist.insert(&TopInst);
while (!Worklist.empty()) {
MachineInstr &Inst = *Worklist.pop_back_val();
@@ -3610,7 +3610,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
}
}
-void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
+void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -3635,7 +3635,7 @@ void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
}
void SIInstrInfo::splitScalar64BitUnaryOp(
- SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
+ SetVectorType &Worklist, MachineInstr &Inst,
unsigned Opcode) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -3686,7 +3686,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
}
void SIInstrInfo::splitScalar64BitBinaryOp(
- SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
+ SetVectorType &Worklist, MachineInstr &Inst,
unsigned Opcode) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -3753,7 +3753,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
}
void SIInstrInfo::splitScalar64BitBCNT(
- SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const {
+ SetVectorType &Worklist, MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -3789,7 +3789,7 @@ void SIInstrInfo::splitScalar64BitBCNT(
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -3853,12 +3853,12 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
void SIInstrInfo::addUsersToMoveToVALUWorklist(
unsigned DstReg,
MachineRegisterInfo &MRI,
- SmallVectorImpl<MachineInstr *> &Worklist) const {
+ SetVectorType &Worklist) const {
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
E = MRI.use_end(); I != E;) {
MachineInstr &UseMI = *I->getParent();
if (!canReadVGPR(UseMI, I.getOperandNo())) {
- Worklist.push_back(&UseMI);
+ Worklist.insert(&UseMI);
do {
++I;
@@ -3869,7 +3869,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
}
}
-void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
+void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
MachineRegisterInfo &MRI,
MachineInstr &Inst) const {
unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -3932,7 +3932,7 @@ void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
}
void SIInstrInfo::addSCCDefUsersToVALUWorklist(
- MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
+ MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
// This assumes that all the users of SCC are in the same block
// as the SCC def.
for (MachineInstr &MI :
@@ -3943,7 +3943,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(
return;
if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
- Worklist.push_back(&MI);
+ Worklist.insert(&MI);
}
}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index d00c0d4a7f4e..3dd5bc89e6c7 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -19,6 +19,7 @@
#include "AMDGPUInstrInfo.h"
#include "SIDefines.h"
#include "SIRegisterInfo.h"
+#include "llvm/ADT/SetVector.h"
namespace llvm {
@@ -38,6 +39,8 @@ private:
EXECZ = 3
};
+ typedef SmallSetVector<MachineInstr *, 32> SetVectorType;
+
static unsigned getBranchOpcode(BranchPredicate Cond);
static BranchPredicate getBranchPredicate(unsigned Opcode);
@@ -56,30 +59,30 @@ private:
void swapOperands(MachineInstr &Inst) const;
- void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
+ void lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const;
- void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+ void splitScalar64BitUnaryOp(SetVectorType &Worklist,
MachineInstr &Inst, unsigned Opcode) const;
- void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+ void splitScalar64BitBinaryOp(SetVectorType &Worklist,
MachineInstr &Inst, unsigned Opcode) const;
- void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
+ void splitScalar64BitBCNT(SetVectorType &Worklist,
MachineInstr &Inst) const;
- void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+ void splitScalar64BitBFE(SetVectorType &Worklist,
MachineInstr &Inst) const;
- void movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
+ void movePackToVALU(SetVectorType &Worklist,
MachineRegisterInfo &MRI,
MachineInstr &Inst) const;
void addUsersToMoveToVALUWorklist(
unsigned Reg, MachineRegisterInfo &MRI,
- SmallVectorImpl<MachineInstr *> &Worklist) const;
+ SetVectorType &Worklist) const;
void
addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst,
- SmallVectorImpl<MachineInstr *> &Worklist) const;
+ SetVectorType &Worklist) const;
const TargetRegisterClass *
getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index ffb01363e131..088173680fa8 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1436,7 +1436,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit IsPacked = isPackedType<Src0VT>.ret;
field bit HasOpSel = IsPacked;
- field bit HasOMod = !if(HasOpSel, 0, HasModifiers);
+ field bit HasOMod = !if(HasOpSel, 0, isFloatType<DstVT>.ret);
field bit HasSDWAOMod = isFloatType<DstVT>.ret;
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index bcc685015cf5..ba69e42d9125 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -1060,7 +1060,7 @@ def : Pat <
class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat <
(i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
- (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
+ (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
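
The dropped DSTOMOD.NONE operand follows from the SIInstrInfo.td hunk above: HasOMod is now derived from the destination type, and these compare pseudos produce i1, so their _e64 forms no longer carry an output-modifier operand. The FCMP_Pattern change in VOPCInstructions.td below is the same adjustment.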
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 3203c38dae34..a7c8166ff6d2 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -23,10 +23,10 @@ using namespace llvm;
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
TIDReg(AMDGPU::NoRegister),
- ScratchRSrcReg(AMDGPU::NoRegister),
- ScratchWaveOffsetReg(AMDGPU::NoRegister),
- FrameOffsetReg(AMDGPU::NoRegister),
- StackPtrOffsetReg(AMDGPU::NoRegister),
+ ScratchRSrcReg(AMDGPU::PRIVATE_RSRC_REG),
+ ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG),
+ FrameOffsetReg(AMDGPU::FP_REG),
+ StackPtrOffsetReg(AMDGPU::SP_REG),
PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
DispatchPtrUserSGPR(AMDGPU::NoRegister),
QueuePtrUserSGPR(AMDGPU::NoRegister),
@@ -42,6 +42,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
+ WorkItemIDXVGPR(AMDGPU::NoRegister),
+ WorkItemIDYVGPR(AMDGPU::NoRegister),
+ WorkItemIDZVGPR(AMDGPU::NoRegister),
PSInputAddr(0),
PSInputEnable(0),
ReturnsVoid(true),
@@ -87,12 +90,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
ScratchWaveOffsetReg = AMDGPU::SGPR4;
FrameOffsetReg = AMDGPU::SGPR5;
StackPtrOffsetReg = AMDGPU::SGPR32;
- return;
+
+ // FIXME: Not really a system SGPR.
+ PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg;
}
CallingConv::ID CC = F->getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
- KernargSegmentPtr = true;
+ KernargSegmentPtr = !F->arg_empty();
WorkGroupIDX = true;
WorkItemIDX = true;
} else if (CC == CallingConv::AMDGPU_PS) {
@@ -101,17 +106,25 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (ST.debuggerEmitPrologue()) {
// Enable everything.
+ WorkGroupIDX = true;
WorkGroupIDY = true;
WorkGroupIDZ = true;
+ WorkItemIDX = true;
WorkItemIDY = true;
WorkItemIDZ = true;
} else {
+ if (F->hasFnAttribute("amdgpu-work-group-id-x"))
+ WorkGroupIDX = true;
+
if (F->hasFnAttribute("amdgpu-work-group-id-y"))
WorkGroupIDY = true;
if (F->hasFnAttribute("amdgpu-work-group-id-z"))
WorkGroupIDZ = true;
+ if (F->hasFnAttribute("amdgpu-work-item-id-x"))
+ WorkItemIDX = true;
+
if (F->hasFnAttribute("amdgpu-work-item-id-y"))
WorkItemIDY = true;
@@ -119,25 +132,28 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDZ = true;
}
- // X, XY, and XYZ are the only supported combinations, so make sure Y is
- // enabled if Z is.
- if (WorkItemIDZ)
- WorkItemIDY = true;
-
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
bool MaySpill = ST.isVGPRSpillingEnabled(*F);
- bool HasStackObjects = FrameInfo.hasStackObjects() || FrameInfo.hasCalls();
+ bool HasStackObjects = FrameInfo.hasStackObjects();
+
+ if (isEntryFunction()) {
+ // X, XY, and XYZ are the only supported combinations, so make sure Y is
+ // enabled if Z is.
+ if (WorkItemIDZ)
+ WorkItemIDY = true;
- if (HasStackObjects || MaySpill) {
- PrivateSegmentWaveByteOffset = true;
+ if (HasStackObjects || MaySpill) {
+ PrivateSegmentWaveByteOffset = true;
- // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
- if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
- (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
- PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5;
+ // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
+ (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
+ PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5;
+ }
}
- if (ST.isAmdCodeObjectV2(MF)) {
+ bool IsCOV2 = ST.isAmdCodeObjectV2(MF);
+ if (IsCOV2) {
if (HasStackObjects || MaySpill)
PrivateSegmentBuffer = true;
@@ -154,11 +170,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
ImplicitBufferPtr = true;
}
- // We don't need to worry about accessing spills with flat instructions.
- // TODO: On VI where we must use flat for global, we should be able to omit
- // this if it is never used for generic access.
- if (HasStackObjects && ST.hasFlatAddressSpace() && ST.isAmdHsaOS())
- FlatScratchInit = true;
+ if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr"))
+ KernargSegmentPtr = true;
+
+ if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) {
+ // TODO: This could be refined a lot. The attribute is a poor way of
+ // detecting calls that may require it before argument lowering.
+ if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch"))
+ FlatScratchInit = true;
+ }
}
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 05aa249584bf..4c7f38a09a48 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -119,6 +119,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned WorkGroupInfoSystemSGPR;
unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
+ // VGPR inputs. These are always v0, v1 and v2 for entry functions.
+ unsigned WorkItemIDXVGPR;
+ unsigned WorkItemIDYVGPR;
+ unsigned WorkItemIDZVGPR;
+
// Graphics info.
unsigned PSInputAddr;
unsigned PSInputEnable;
@@ -377,10 +382,13 @@ public:
}
void setStackPtrOffsetReg(unsigned Reg) {
- assert(Reg != AMDGPU::NoRegister && "Should never be unset");
StackPtrOffsetReg = Reg;
}
+ // Note the unset value for this is AMDGPU::SP_REG rather than
+ // NoRegister. This is mostly a workaround for MIR tests where state that
+ // can't be directly computed from the function is not preserved in serialized
+ // MIR.
unsigned getStackPtrOffsetReg() const {
return StackPtrOffsetReg;
}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ef6ad4ad0c8f..4a3fbb4593bb 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -207,7 +207,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
}
+ // We have to assume the SP is needed in case there are calls in the function,
+ // which is detected after the function is lowered. If we aren't really going
+ // to need SP, don't bother reserving it.
unsigned StackPtrReg = MFI->getStackPtrOffsetReg();
+
if (StackPtrReg != AMDGPU::NoRegister) {
reserveRegisterTuples(Reserved, StackPtrReg);
assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index fc808011cd88..54ea7805e18d 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -23,6 +23,13 @@ class SIReg <string n, bits<16> regIdx = 0> : Register<n>,
def VCC_LO : SIReg<"vcc_lo", 106>;
def VCC_HI : SIReg<"vcc_hi", 107>;
+// Pseudo-registers: Used as placeholders during isel and immediately
+// replaced, never seeing the verifier.
+def PRIVATE_RSRC_REG : SIReg<"", 0>;
+def FP_REG : SIReg<"", 0>;
+def SP_REG : SIReg<"", 0>;
+def SCRATCH_WAVE_OFFSET_REG : SIReg<"", 0>;
+
// VCC for 64-bit instructions
def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
DwarfRegAlias<VCC_LO> {
@@ -267,7 +274,8 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
- SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> {
+ SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT,
+ FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
let AllocationPriority = 7;
}
@@ -314,7 +322,8 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R
let isAllocatable = 0;
}
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> {
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32,
+ (add SGPR_128, TTMP_128)> {
let AllocationPriority = 10;
}
@@ -464,7 +473,9 @@ defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ;
defm VSrc : RegImmOperand<"VS", "VSrc">;
-def VSrc_128 : RegisterOperand<VReg_128>;
+def VSrc_128 : RegisterOperand<VReg_128> {
+ let DecoderMethod = "DecodeVS_128RegisterClass";
+}
//===----------------------------------------------------------------------===//
// VSrc_* Operands with an VGPR
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 26515b27bb77..67ad904ca972 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -539,23 +539,9 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
}
bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
-
- if (Reg0 == Reg1) {
- return true;
+ for (MCRegAliasIterator R(Reg0, TRI, true); R.isValid(); ++R) {
+ if (*R == Reg1) return true;
}
-
- unsigned SubReg0 = TRI->getSubReg(Reg0, 1);
- if (SubReg0 == 0) {
- return TRI->getSubRegIndex(Reg1, Reg0) > 0;
- }
-
- for (unsigned Idx = 2; SubReg0 > 0; ++Idx) {
- if (isRegIntersect(Reg1, SubReg0, TRI)) {
- return true;
- }
- SubReg0 = TRI->getSubReg(Reg0, Idx);
- }
-
return false;
}
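
The rewrite delegates overlap detection to MCRegAliasIterator: constructed with IncludeSelf set, it yields Reg0 itself plus every register sharing a unit with it, subsuming the old identity check, sub-register probe, and recursion in a single loop. For example (register names illustrative), querying a pair such as SGPR0_SGPR1 against SGPR1 returns true because SGPR1 appears in the pair's alias set.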
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 7b9bc71ad4c7..d5acb49b4f39 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -117,7 +117,10 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret = !if(P.HasModifiers,
[(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (node (P.Src0VT
+ !if(P.HasOMod,
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
[(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]);
}
@@ -813,9 +816,11 @@ let SubtargetPredicate = isVI in {
// Aliases to simplify matching of floating-point instructions that
// are VOP2 on SI and VOP3 on VI.
-class SI2_VI3Alias <string name, Instruction inst> : InstAlias <
+class SI2_VI3Alias <string name, VOP3_Real inst> : InstAlias <
name#" $dst, $src0, $src1",
- (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0)
+ !if(inst.Pfl.HasOMod,
+ (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0),
+ (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0))
>, PredicateControl {
let UseInstAsmMatchConverter = 0;
let AsmVariantName = AMDGPUAsmVariants.VOP3;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index a8ca593f14ed..92ed0706dc01 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -12,17 +12,21 @@
//===----------------------------------------------------------------------===//
class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
+ dag src0 = !if(P.HasOMod,
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp));
+
list<dag> ret3 = [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (node (P.Src0VT src0),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
(P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))];
list<dag> ret2 = [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (node (P.Src0VT src0),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))];
list<dag> ret1 = [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))];
+ (node (P.Src0VT src0)))];
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
!if(!eq(P.NumSrcArgs, 2), ret2,
@@ -92,6 +96,7 @@ class VOP3_Profile<VOPProfile P> : VOPProfile<P.ArgVT> {
class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
// v_div_scale_{f32|f64} do not support input modifiers.
let HasModifiers = 0;
+ let HasOMod = 0;
let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
}
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index f2de1f995726..3becf758aaa3 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -34,6 +34,9 @@ class VOP3_VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_
let isCommutable = 1 in {
def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
+def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+
def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>;
@@ -41,7 +44,6 @@ def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>
def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
@@ -50,6 +52,9 @@ def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>
def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
}
+def V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
+
def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
@@ -71,6 +76,7 @@ multiclass VOP3P_Real_vi<bits<10> op> {
}
}
+defm V_PK_MAD_I16 : VOP3P_Real_vi <0x380>;
defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>;
defm V_PK_ADD_I16 : VOP3P_Real_vi <0x382>;
defm V_PK_SUB_I16 : VOP3P_Real_vi <0x383>;
@@ -79,8 +85,10 @@ defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>;
defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>;
defm V_PK_MAX_I16 : VOP3P_Real_vi <0x387>;
defm V_PK_MIN_I16 : VOP3P_Real_vi <0x388>;
+defm V_PK_MAD_U16 : VOP3P_Real_vi <0x389>;
defm V_PK_ADD_U16 : VOP3P_Real_vi <0x38a>;
+defm V_PK_SUB_U16 : VOP3P_Real_vi <0x38b>;
defm V_PK_MAX_U16 : VOP3P_Real_vi <0x38c>;
defm V_PK_MIN_U16 : VOP3P_Real_vi <0x38d>;
defm V_PK_FMA_F16 : VOP3P_Real_vi <0x38e>;
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index f3482a22d5dc..b636fc9be431 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -148,6 +148,19 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
let SubtargetPredicate = AssemblerPredicate;
}
+class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies {
+ list<dag> ret = !if(P.HasModifiers,
+ [(set i1:$sdst,
+ (setcc (P.Src0VT
+ !if(P.HasOMod,
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+ cond))],
+ [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]);
+}
+
+
multiclass VOPC_Pseudos <string opName,
VOPC_Profile P,
PatLeaf cond = COND_NULL,
@@ -163,14 +176,7 @@ multiclass VOPC_Pseudos <string opName,
let isCommutable = 1;
}
- def _e64 : VOP3_Pseudo<opName, P,
- !if(P.HasModifiers,
- [(set i1:$sdst,
- (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
- cond))],
- [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))])>,
+ def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)> {
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = P.Schedule;
@@ -634,7 +640,7 @@ class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat <
(i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
(vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
(inst $src0_modifiers, $src0, $src1_modifiers, $src1,
- DSTCLAMP.NONE, DSTOMOD.NONE)
+ DSTCLAMP.NONE)
>;
def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>;
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 77b7952b22a8..b47538ba0349 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -136,6 +136,8 @@ class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
+
+ VOPProfile Pfl = ps.Pfl;
}
// XXX - Is there any reason to distingusih this from regular VOP3
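
Exposing the profile on VOP3_Real via the new Pfl field is what lets instruction aliases such as SI2_VI3Alias in VOP2Instructions.td, above, branch on inst.Pfl.HasOMod and emit the correct operand count for instructions that no longer take an omod operand.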