author    Dimitry Andric <dim@FreeBSD.org>    2017-07-13 19:25:18 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2017-07-13 19:25:18 +0000
commit    ca089b24d48ef6fa8da2d0bb8c25bb802c4a95c0 (patch)
tree      3a28a772df9b17aef34f49e3c727965ad28c0c93 /lib/Target/AMDGPU
parent    9df3605dea17e84f8183581f6103bd0c79e2a606 (diff)
Vendor import of llvm trunk r307894 (tag: vendor/llvm/llvm-trunk-r307894)
Notes:
    svn path=/vendor/llvm/dist/; revision=320957
    svn path=/vendor/llvm/llvm-trunk-r307894/; revision=320958; tag=vendor/llvm/llvm-trunk-r307894
Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp   |   9
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp          |   4
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.cpp            |   7
-rw-r--r--  lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp         |  13
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMacroFusion.cpp             |  64
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMacroFusion.h               |  19
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.cpp               |  58
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp           |  47
-rw-r--r--  lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp     | 166
-rw-r--r--  lib/Target/AMDGPU/CMakeLists.txt                    |   1
-rw-r--r--  lib/Target/AMDGPU/GCNIterativeScheduler.cpp         |   2
-rw-r--r--  lib/Target/AMDGPU/GCNMinRegStrategy.cpp             |   2
-rw-r--r--  lib/Target/AMDGPU/GCNRegPressure.cpp                |   2
-rw-r--r--  lib/Target/AMDGPU/GCNSchedStrategy.cpp              |   2
-rw-r--r--  lib/Target/AMDGPU/GCNSchedStrategy.h                |   2
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp |   4
-rw-r--r--  lib/Target/AMDGPU/MIMGInstructions.td               |   1
-rw-r--r--  lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp      |   2
-rw-r--r--  lib/Target/AMDGPU/R600ISelLowering.cpp              |   3
-rw-r--r--  lib/Target/AMDGPU/R600ISelLowering.h                |   3
-rw-r--r--  lib/Target/AMDGPU/R600MachineScheduler.cpp          |   2
-rw-r--r--  lib/Target/AMDGPU/SIFoldOperands.cpp                |   1
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp                | 120
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.h                  |   3
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.cpp                   |  20
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.h                     |   8
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.td                    |   2
-rw-r--r--  lib/Target/AMDGPU/SIMachineScheduler.cpp            |   2
-rw-r--r--  lib/Target/AMDGPU/SIShrinkInstructions.cpp          |  78
-rw-r--r--  lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp   |   2
-rw-r--r--  lib/Target/AMDGPU/VOP3PInstructions.td              |  28
-rw-r--r--  lib/Target/AMDGPU/VOPInstructions.td                |  18
32 files changed, 428 insertions, 267 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index 6f002860044c..ed5370826647 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -108,10 +108,11 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
DFS(Start, Checklist);
for (auto &BB : Checklist) {
BasicBlock::iterator StartIt = (!L && (BB == Load->getParent())) ?
- BasicBlock::iterator(Load) : BB->end();
- if (MDR->getPointerDependencyFrom(MemoryLocation(Ptr),
- true, StartIt, BB, Load).isClobber())
- return true;
+ BasicBlock::iterator(Load) : BB->end();
+ auto Q = MDR->getPointerDependencyFrom(MemoryLocation(Ptr), true,
+ StartIt, BB, Load);
+ if (Q.isClobber() || Q.isUnknown())
+ return true;
}
return false;
}
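
Note on the hunk above: the dependence query result is now checked for isUnknown() as well as isClobber(), so a dependence that MemoryDependenceResults cannot analyze is treated conservatively instead of being ignored. A minimal standalone sketch of that pattern (plain C++; the enum is hypothetical, not the LLVM API):

    #include <iostream>

    enum class DepResult { NonLocal, Def, Clobber, Unknown };

    // Anything we cannot prove safe counts as a clobber.
    static bool isClobbered(DepResult Q) {
      return Q == DepResult::Clobber || Q == DepResult::Unknown;
    }

    int main() {
      std::cout << isClobbered(DepResult::Unknown) << '\n'; // 1: stay conservative
      std::cout << isClobbered(DepResult::Def) << '\n';     // 0: analyzable, safe
    }
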
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index b312dbc8d14d..31ee9206ae27 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -380,7 +380,9 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
FastMathFlags FMF = FPOp->getFastMathFlags();
bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
FMF.allowReciprocal();
- if (ST->hasFP32Denormals() && !UnsafeDiv)
+
+ // With UnsafeDiv, the node will be optimized to just rcp and mul.
+ if (ST->hasFP32Denormals() || UnsafeDiv)
return false;
IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
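
A note on this hunk: the condition flips from hasFP32Denormals() && !UnsafeDiv to hasFP32Denormals() || UnsafeDiv, so the IR-level expansion is now also skipped under unsafe math, where the division is later turned into rcp and mul instead. A rough standalone illustration of that fast path (plain C++; rcp stands in for the hardware reciprocal, V_RCP_F32):

    #include <cstdio>

    static float rcp(float Y) { return 1.0f / Y; }                 // V_RCP_F32 stand-in
    static float fastFDiv(float X, float Y) { return X * rcp(Y); } // x / y -> x * (1 / y)

    int main() {
      std::printf("%f\n", fastFDiv(10.0f, 4.0f)); // 2.500000
    }
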
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 96f819fd0e68..2553cf4da0fe 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2651,8 +2651,11 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
return DAG.getZExtOrTrunc(Shl, SL, VT);
}
- case ISD::OR: if (!isOrEquivalentToAdd(DAG, LHS)) break;
- case ISD::ADD: { // Fall through from above
+ case ISD::OR:
+ if (!isOrEquivalentToAdd(DAG, LHS))
+ break;
+ LLVM_FALLTHROUGH;
+ case ISD::ADD: {
// shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0),
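
A note on this hunk: besides documenting the intentional drop into the ADD case with LLVM_FALLTHROUGH, the combine relies on an OR being add-equivalent when its operands share no set bits. A standalone check of that identity (plain C++, example values arbitrary):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t X = 0xF0, C2 = 0x3, C1 = 4;
      assert((X & C2) == 0);        // no common bits: the OR is add-equivalent
      assert((X | C2) == (X + C2)); // so the OR case may reuse the ADD combine:
      // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
      assert(((X + C2) << C1) == ((X << C1) + (C2 << C1)));
      std::printf("0x%x\n", (X | C2) << C1); // 0xf30
    }
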
diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index 846e7dff5f8c..7e0e9802c0e6 100644
--- a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -10,6 +10,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -34,9 +35,14 @@ public:
AMDGPULowerIntrinsics() : ModulePass(ID) {}
bool runOnModule(Module &M) override;
+ bool expandMemIntrinsicUses(Function &F);
StringRef getPassName() const override {
return "AMDGPU Lower Intrinsics";
}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
};
}
@@ -55,7 +61,7 @@ static bool shouldExpandOperationWithSize(Value *Size) {
return !CI || (CI->getZExtValue() > MaxStaticSize);
}
-static bool expandMemIntrinsicUses(Function &F) {
+bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) {
Intrinsic::ID ID = F.getIntrinsicID();
bool Changed = false;
@@ -67,7 +73,10 @@ static bool expandMemIntrinsicUses(Function &F) {
case Intrinsic::memcpy: {
auto *Memcpy = cast<MemCpyInst>(Inst);
if (shouldExpandOperationWithSize(Memcpy->getLength())) {
- expandMemCpyAsLoop(Memcpy);
+ Function *ParentFunc = Memcpy->getParent()->getParent();
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc);
+ expandMemCpyAsLoop(Memcpy, TTI);
Changed = true;
Memcpy->eraseFromParent();
}
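
A note on this hunk: expandMemIntrinsicUses becomes a member so the pass can fetch per-function TargetTransformInfo, which expandMemCpyAsLoop now takes, presumably to pick lowering parameters such as the copy element type. Conceptually, a memcpy above the static-size threshold becomes an explicit loop; a crude standalone picture of that (plain C++ byte loop, whereas the real emission is IR and TTI-driven):

    #include <cstddef>
    #include <cstdio>

    // What a large memcpy conceptually becomes after the expansion.
    static void memcpyAsLoop(char *Dst, const char *Src, std::size_t Len) {
      for (std::size_t I = 0; I != Len; ++I)
        Dst[I] = Src[I];
    }

    int main() {
      const char Src[] = "hello";
      char Dst[sizeof(Src)] = {};
      memcpyAsLoop(Dst, Src, sizeof(Src));
      std::puts(Dst); // hello
    }
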
diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
new file mode 100644
index 000000000000..7263ba73d155
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -0,0 +1,64 @@
+//===--- AMDGPUMacroFusion.cpp - AMDGPU Macro Fusion ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the AMDGPU implementation of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMacroFusion.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+
+#include "llvm/CodeGen/MacroFusion.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
+/// together. If FirstMI is unspecified, check whether SecondMI may be part
+/// of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const SIInstrInfo &TII = static_cast<const SIInstrInfo&>(TII_);
+
+ switch (SecondMI.getOpcode()) {
+ case AMDGPU::V_ADDC_U32_e64:
+ case AMDGPU::V_SUBB_U32_e64:
+ case AMDGPU::V_CNDMASK_B32_e64: {
+ // Try to cluster defs of condition registers to their uses. This improves
+ // the chance VCC will be available which will allow shrinking to VOP2
+ // encodings.
+ if (!FirstMI)
+ return true;
+
+ const MachineOperand *Src2 = TII.getNamedOperand(SecondMI,
+ AMDGPU::OpName::src2);
+ return FirstMI->definesRegister(Src2->getReg());
+ }
+ default:
+ return false;
+ }
+
+ return false;
+}
+
+} // end namespace
+
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation() {
+ return createMacroFusionDAGMutation(shouldScheduleAdjacent);
+}
+
+} // end namespace llvm
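
A note on this new file: the predicate is invoked in two modes, with FirstMI == nullptr asking whether SecondMI could ever be the tail of a fused pair, and with both instructions set asking whether this particular pair should be glued. A standalone mock of that contract (plain C++; opcode and register values are made up):

    #include <cstdio>

    struct Instr { int Opcode; int DefReg; int Src2Reg; };
    enum { V_ADD_I32 = 0, V_ADDC_U32_e64 = 1 };

    static bool shouldScheduleAdjacent(const Instr *First, const Instr &Second) {
      if (Second.Opcode != V_ADDC_U32_e64)
        return false;                         // only condition consumers fuse
      if (!First)
        return true;                          // Second may be a fused-pair tail
      return First->DefReg == Second.Src2Reg; // cluster condition def to its use
    }

    int main() {
      Instr Def{V_ADD_I32, /*DefReg=*/256, /*Src2Reg=*/0};
      Instr Use{V_ADDC_U32_e64, /*DefReg=*/0, /*Src2Reg=*/256};
      std::printf("%d %d\n", shouldScheduleAdjacent(nullptr, Use),
                  shouldScheduleAdjacent(&Def, Use)); // 1 1
    }
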
diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.h b/lib/Target/AMDGPU/AMDGPUMacroFusion.h
new file mode 100644
index 000000000000..844958580a65
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.h
@@ -0,0 +1,19 @@
+//===- AMDGPUMacroFusion.h - AMDGPU Macro Fusion ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+/// Note that you have to add:
+/// DAG.addMutation(createAMDGPUMacroFusionDAGMutation());
+/// to AMDGPUPassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation();
+
+} // llvm
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index be47b900c6f0..1bc5a52053ec 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -13,6 +13,14 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "AMDGPUCallLowering.h"
+#include "AMDGPUInstructionSelector.h"
+#include "AMDGPULegalizerInfo.h"
+#include "AMDGPURegisterBankInfo.h"
+#endif
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
@@ -72,6 +80,31 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
return *this;
}
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+
+struct SIGISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ const AMDGPUCallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
+ }
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
+ }
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
+};
+
+} // end anonymous namespace
+#endif
+
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const TargetMachine &TM)
: AMDGPUGenSubtargetInfo(TT, GPU, FS),
@@ -265,18 +298,21 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::r600_read_tidig_x:
IdQuery = true;
+ LLVM_FALLTHROUGH;
case Intrinsic::r600_read_local_size_x:
Dim = 0;
break;
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
IdQuery = true;
+ LLVM_FALLTHROUGH;
case Intrinsic::r600_read_local_size_y:
Dim = 1;
break;
case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
IdQuery = true;
+ LLVM_FALLTHROUGH;
case Intrinsic::r600_read_local_size_z:
Dim = 2;
break;
@@ -317,11 +353,23 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
TLInfo(TM, *this) {}
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM) :
- AMDGPUSubtarget(TT, GPU, FS, TM),
- InstrInfo(*this),
- FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
- TLInfo(TM, *this) {}
+ const TargetMachine &TM)
+ : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+ TLInfo(TM, *this) {
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
+ GISel->CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
+ GISel->Legalizer.reset(new AMDGPULegalizerInfo());
+
+ GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
+ GISel->InstSelector.reset(new AMDGPUInstructionSelector(
+ *this, *static_cast<AMDGPURegisterBankInfo *>(GISel->RegBankInfo.get())));
+#endif
+ setGISelAccessor(*GISel);
+}
void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const {
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 425fd35d47de..dc868f010d85 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -19,9 +19,7 @@
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
-#ifdef LLVM_BUILD_GLOBAL_ISEL
-#include "AMDGPURegisterBankInfo.h"
-#endif
+#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
@@ -85,7 +83,7 @@ static cl::opt<bool> EnableLoadStoreVectorizer(
static cl::opt<bool> ScalarizeGlobal(
"amdgpu-scalarize-global-loads",
cl::desc("Enable global load scalarization"),
- cl::init(false),
+ cl::init(true),
cl::Hidden);
// Option to run internalize pass.
@@ -176,6 +174,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
return DAG;
}
@@ -389,31 +388,6 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl(
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
-#ifdef LLVM_BUILD_GLOBAL_ISEL
-namespace {
-
-struct SIGISelActualAccessor : public GISelAccessor {
- std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
- std::unique_ptr<InstructionSelector> InstSelector;
- std::unique_ptr<LegalizerInfo> Legalizer;
- std::unique_ptr<RegisterBankInfo> RegBankInfo;
- const AMDGPUCallLowering *getCallLowering() const override {
- return CallLoweringInfo.get();
- }
- const InstructionSelector *getInstructionSelector() const override {
- return InstSelector.get();
- }
- const LegalizerInfo *getLegalizerInfo() const override {
- return Legalizer.get();
- }
- const RegisterBankInfo *getRegBankInfo() const override {
- return RegBankInfo.get();
- }
-};
-
-} // end anonymous namespace
-#endif
-
GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
TargetOptions Options,
@@ -435,21 +409,6 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
// function that reside in TargetOptions.
resetTargetOptions(F);
I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
-
-#ifndef LLVM_BUILD_GLOBAL_ISEL
- GISelAccessor *GISel = new GISelAccessor();
-#else
- SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
- GISel->CallLoweringInfo.reset(
- new AMDGPUCallLowering(*I->getTargetLowering()));
- GISel->Legalizer.reset(new AMDGPULegalizerInfo());
-
- GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*I->getRegisterInfo()));
- GISel->InstSelector.reset(new AMDGPUInstructionSelector(*I,
- *static_cast<AMDGPURegisterBankInfo*>(GISel->RegBankInfo.get())));
-#endif
-
- I->setGISelAccessor(*GISel);
}
I->setScalarizeGlobalBehavior(ScalarizeGlobal);
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 7b8756050b75..e3c90f250600 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1058,17 +1058,13 @@ public:
OperandMatchResultTy parseOModOperand(OperandVector &Operands);
- void cvtId(MCInst &Inst, const OperandVector &Operands);
- void cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands);
-
- void cvtVOP3Impl(MCInst &Inst,
- const OperandVector &Operands,
- OptionalImmIndexMap &OptionalIdx);
+ void cvtVOP3(MCInst &Inst, const OperandVector &Operands,
+ OptionalImmIndexMap &OptionalIdx);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
- void cvtVOP3OMod(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
- void cvtMIMG(MCInst &Inst, const OperandVector &Operands);
+ void cvtMIMG(MCInst &Inst, const OperandVector &Operands,
+ bool IsAtomic = false);
void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
OperandMatchResultTy parseDPPCtrl(OperandVector &Operands);
@@ -3870,13 +3866,19 @@ void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
// mimg
//===----------------------------------------------------------------------===//
-void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands) {
+void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
+ bool IsAtomic) {
unsigned I = 1;
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
}
+ if (IsAtomic) {
+ // Add src, same as dst
+ ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1);
+ }
+
OptionalImmIndexMap OptionalIdx;
for (unsigned E = Operands.size(); I != E; ++I) {
@@ -3904,39 +3906,7 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands) {
}
void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) {
- unsigned I = 1;
- const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
- for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
- ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
- }
-
- // Add src, same as dst
- ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1);
-
- OptionalImmIndexMap OptionalIdx;
-
- for (unsigned E = Operands.size(); I != E; ++I) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
-
- // Add the register arguments
- if (Op.isRegOrImm()) {
- Op.addRegOrImmOperands(Inst, 1);
- continue;
- } else if (Op.isImmModifier()) {
- OptionalIdx[Op.getImmTy()] = I;
- } else {
- llvm_unreachable("unexpected operand type");
- }
- }
-
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ cvtMIMG(Inst, Operands, true);
}
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const {
@@ -4118,25 +4088,6 @@ OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands)
return MatchOperand_NoMatch;
}
-void AMDGPUAsmParser::cvtId(MCInst &Inst, const OperandVector &Operands) {
- unsigned I = 1;
- const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
- for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
- ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
- }
- for (unsigned E = Operands.size(); I != E; ++I)
- ((AMDGPUOperand &)*Operands[I]).addRegOrImmOperands(Inst, 1);
-}
-
-void AMDGPUAsmParser::cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands) {
- uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
- if (TSFlags & SIInstrFlags::VOP3) {
- cvtVOP3(Inst, Operands);
- } else {
- cvtId(Inst, Operands);
- }
-}
-
static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
// 1. This operand is input modifiers
return Desc.OpInfo[OpNum].OperandType == AMDGPU::OPERAND_INPUT_MODS
@@ -4148,91 +4099,78 @@ static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
&& Desc.getOperandConstraint(OpNum + 1, MCOI::OperandConstraint::TIED_TO) == -1;
}
-void AMDGPUAsmParser::cvtVOP3Impl(MCInst &Inst, const OperandVector &Operands,
- OptionalImmIndexMap &OptionalIdx) {
+void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
+ OptionalImmIndexMap &OptionalIdx) {
+ unsigned Opc = Inst.getOpcode();
+
unsigned I = 1;
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
}
- for (unsigned E = Operands.size(); I != E; ++I) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
- if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
- Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
- } else if (Op.isImmModifier()) {
- OptionalIdx[Op.getImmTy()] = I;
- } else if (Op.isRegOrImm()) {
- Op.addRegOrImmOperands(Inst, 1);
- } else {
- llvm_unreachable("unhandled operand type");
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1) {
+ // This instruction has src modifiers
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isImmModifier()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else if (Op.isRegOrImm()) {
+ Op.addRegOrImmOperands(Inst, 1);
+ } else {
+ llvm_unreachable("unhandled operand type");
+ }
+ }
+ } else {
+ // No src modifiers
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ if (Op.isMod()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ Op.addRegOrImmOperands(Inst, 1);
+ }
}
}
-}
-void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
- OptionalImmIndexMap OptionalIdx;
-
- cvtVOP3Impl(Inst, Operands, OptionalIdx);
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
+ }
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
+ }
// special case v_mac_{f16, f32}:
// it has src2 register operand that is tied to dst operand
// we don't allow modifiers for this operand in assembler so src2_modifiers
// should be 0
- if (Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
- Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
- Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi) {
+ if (Opc == AMDGPU::V_MAC_F32_e64_si || Opc == AMDGPU::V_MAC_F32_e64_vi ||
+ Opc == AMDGPU::V_MAC_F16_e64_vi) {
auto it = Inst.begin();
- std::advance(
- it,
- AMDGPU::getNamedOperandIdx(Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ?
- AMDGPU::V_MAC_F16_e64 :
- AMDGPU::V_MAC_F32_e64,
- AMDGPU::OpName::src2_modifiers));
+ std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers));
it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
++it;
Inst.insert(it, Inst.getOperand(0)); // src2 = dst
}
}
-void AMDGPUAsmParser::cvtVOP3OMod(MCInst &Inst, const OperandVector &Operands) {
+void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
OptionalImmIndexMap OptionalIdx;
-
- unsigned I = 1;
- const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
- for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
- ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
- }
-
- for (unsigned E = Operands.size(); I != E; ++I) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
- if (Op.isMod()) {
- OptionalIdx[Op.getImmTy()] = I;
- } else {
- Op.addRegOrImmOperands(Inst, 1);
- }
- }
-
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
+ cvtVOP3(Inst, Operands, OptionalIdx);
}
void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
OptionalImmIndexMap OptIdx;
- cvtVOP3Impl(Inst, Operands, OptIdx);
+ cvtVOP3(Inst, Operands, OptIdx);
// FIXME: This is messy. Parse the modifiers as if it was a normal VOP3
// instruction, and then figure out where to actually put the modifiers
int Opc = Inst.getOpcode();
- if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) {
- addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClampSI);
- }
-
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel);
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSelHi, -1);
@@ -4284,7 +4222,7 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
- Inst.getOperand(ModIdx).setImm(ModVal);
+ Inst.getOperand(ModIdx).setImm(Inst.getOperand(ModIdx).getImm() | ModVal);
}
}
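
A note on these hunks: for v_mac_{f16,f32} the parsed operand list carries no src2 (it is tied to dst), so after conversion the code splices in a zero src2_modifiers immediate and reuses operand 0 as src2, now looking the index up via the real opcode instead of remapping to the pseudo. A toy version of that splice (plain C++ over a vector; indices and values are arbitrary):

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> Ops = {7, 10, 20};       // {dst, src0, src1}
      auto It = Ops.insert(Ops.begin() + 3, 0); // no modifiers for src2
      int Dst = Ops[0];
      Ops.insert(++It, Dst);                    // src2 = dst
      for (int V : Ops)
        std::printf("%d ", V);                  // 7 10 20 0 7
      std::printf("\n");
    }
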
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 917d9cfa6905..971208c5db84 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -47,6 +47,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUIntrinsicInfo.cpp
AMDGPUISelDAGToDAG.cpp
AMDGPULowerIntrinsics.cpp
+ AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp
AMDGPUMachineCFGStructurizer.cpp
AMDGPUMachineFunction.cpp
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 8ead48067336..2e7641cda375 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -17,7 +17,7 @@
using namespace llvm;
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
namespace llvm {
std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index d378df674be9..0657f67b217d 100644
--- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -15,7 +15,7 @@
using namespace llvm;
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
namespace {
class GCNMinRegScheduler {
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 390a8286c76a..1d02c7fdffbf 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -16,7 +16,7 @@
using namespace llvm;
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 8ec46665daf5..155b400ba022 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -20,7 +20,7 @@
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/Support/MathExtras.h"
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
using namespace llvm;
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index 3ed3cd5b3b1c..060d2ca72d93 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -66,7 +66,7 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive {
const SIMachineFunctionInfo &MFI;
- // Occupancy target at the begining of function scheduling cycle.
+ // Occupancy target at the beginning of function scheduling cycle.
unsigned StartingOccupancy;
// Minimal real occupancy recorder for the function.
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 2b408ff10caa..a50e3eb8d9ce 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -32,7 +32,7 @@ public:
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsResolved) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override {
@@ -100,7 +100,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsPCRel) const {
+ bool IsResolved) const {
Value = adjustFixupValue(Fixup, Value, &Asm.getContext());
if (!Value)
return; // Doesn't change encoding.
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index a515eecc222a..06e2c11b0193 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -26,6 +26,7 @@ class MIMG_Helper <dag outs, dag ins, string asm,
let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
let AsmMatchConverter = "cvtMIMG";
let usesCustomInserter = 1;
+ let SchedRW = [WriteVMEM];
}
class MIMG_NoSampler_Helper <bits<7> op, string asm,
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 6993e8a62a9c..00cbd24b84fb 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -555,7 +555,7 @@ public:
CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
} else
CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
-
+ LLVM_FALLTHROUGH;
case AMDGPU::CF_ALU:
I = MI;
AluClauses.push_back(MakeALUClause(MBB, I));
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 215791f4f92d..69a63b6941ef 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1618,7 +1618,8 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
return VT.changeVectorElementTypeToInteger();
}
-bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const {
+bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
+ const SelectionDAG &DAG) const {
// Local and Private addresses do not handle vectors. Limit to i32
if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) {
return (MemVT.getSizeInBits() <= 32);
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index d6a0876a6ee7..2a774693f02b 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -44,7 +44,8 @@ public:
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
EVT VT) const override;
- bool canMergeStoresTo(unsigned AS, EVT MemVT) const override;
+ bool canMergeStoresTo(unsigned AS, EVT MemVT,
+ const SelectionDAG &DAG) const override;
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
unsigned Align,
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp
index 47fda1c8fa82..a7e540f9d14d 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -22,7 +22,7 @@
using namespace llvm;
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index f391f67a241f..3af242d9ea66 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -137,6 +137,7 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
= TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
}
+ return false;
}
default:
return false;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index d39b345bdf03..2ba570b9ebbb 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -547,7 +547,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.align = 0;
const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
- Info.vol = !Vol || !Vol->isNullValue();
+ Info.vol = !Vol || !Vol->isZero();
Info.readMem = true;
Info.writeMem = true;
return true;
@@ -713,7 +713,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
}
-bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const {
+bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
+ const SelectionDAG &DAG) const {
if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
return (MemVT.getSizeInBits() <= 4 * 32);
} else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
@@ -2374,20 +2375,16 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
- switch (IID) {
- case Intrinsic::amdgcn_cvt_pkrtz: {
+ if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
SDLoc SL(N);
SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
Src0, Src1);
-
Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
return;
}
- default:
- break;
- }
+ break;
}
case ISD::SELECT: {
SDLoc SL(N);
@@ -3736,7 +3733,9 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
EVT VT = Op.getValueType();
- bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
+ const SDNodeFlags Flags = Op->getFlags();
+ bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
+ Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal();
if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
return SDValue();
@@ -3771,15 +3770,11 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
}
}
- const SDNodeFlags Flags = Op->getFlags();
-
- if (Unsafe || Flags.hasAllowReciprocal()) {
+ if (Unsafe) {
// Turn into multiply by the reciprocal.
// x / y -> x * (1.0 / y)
- SDNodeFlags NewFlags;
- NewFlags.setUnsafeAlgebra(true);
SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
- return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, NewFlags);
+ return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
}
return SDValue();
@@ -4622,15 +4617,99 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
return SDValue();
}
+static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
+ if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
+ return true;
+
+ return DAG.isKnownNeverNaN(Op);
+}
+
+static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
+ unsigned MaxDepth=5) {
+ // If source is a result of another standard FP operation it is already in
+ // canonical form.
+
+ switch (Op.getOpcode()) {
+ default:
+ break;
+
+ // These will flush denorms if required.
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FSQRT:
+ case ISD::FCEIL:
+ case ISD::FFLOOR:
+ case ISD::FMA:
+ case ISD::FMAD:
+
+ case ISD::FCANONICALIZE:
+ return true;
+
+ case ISD::FP_ROUND:
+ return Op.getValueType().getScalarType() != MVT::f16 ||
+ ST->hasFP16Denormals();
+
+ case ISD::FP_EXTEND:
+ return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
+ ST->hasFP16Denormals();
+
+ case ISD::FP16_TO_FP:
+ case ISD::FP_TO_FP16:
+ return ST->hasFP16Denormals();
+
+ // FNEG and FABS can/will be lowered to or combined as bit operations, so
+ // their inputs need to be checked recursively.
+ case ISD::FNEG:
+ case ISD::FABS:
+ return (MaxDepth > 0) &&
+ isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1);
+
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FSINCOS:
+ return Op.getValueType().getScalarType() != MVT::f16;
+
+ // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
+ // For such targets we need to check their input recursively.
+ // TODO: on GFX9+ we could return true without checking, provided no-nan
+ // mode is set, since canonicalization is also used to quiet sNaNs.
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNAN:
+ case ISD::FMAXNAN:
+
+ return (MaxDepth > 0) &&
+ isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) &&
+ isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1);
+
+ case ISD::ConstantFP: {
+ auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
+ return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
+ }
+ }
+ return false;
+}
+
// Constant fold canonicalize.
SDValue SITargetLowering::performFCanonicalizeCombine(
SDNode *N,
DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
- if (!CFP)
+
+ if (!CFP) {
+ SDValue N0 = N->getOperand(0);
+
+ bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+
+ if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
+ isCanonicalized(N0, getSubtarget()))
+ return N0;
+
return SDValue();
+ }
- SelectionDAG &DAG = DCI.DAG;
const APFloat &C = CFP->getValueAPF();
// Flush denormals to 0 if not enabled.
@@ -4723,13 +4802,6 @@ SDValue SITargetLowering::performIntMed3ImmCombine(
return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
}
-static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
- if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
- return true;
-
- return DAG.isKnownNeverNaN(Op);
-}
-
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Op0,
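
A note on these hunks: the combine now returns the input of fcanonicalize unchanged when isCanonicalized proves it is already canonical, additionally requiring a known-never-sNaN input unless the IEEE bit is enabled. The ConstantFP leaf case reduces to "neither a denormal nor a signaling NaN"; a standalone bit-level version of that test for binary32 (plain C++, mirroring the APFloat checks):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static bool isSignalingNaN(float F) {
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits));
      return (Bits & 0x7f800000u) == 0x7f800000u && // exponent all ones
             (Bits & 0x007fffffu) != 0 &&           // nonzero mantissa: a NaN
             (Bits & 0x00400000u) == 0;             // quiet bit clear: signaling
    }

    static bool isDenormal(float F) {
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits));
      return (Bits & 0x7f800000u) == 0 &&           // exponent zero,
             (Bits & 0x007fffffu) != 0;             // mantissa nonzero
    }

    static bool isCanonicalConstant(float F) {
      return !isDenormal(F) && !isSignalingNaN(F);
    }

    int main() {
      std::printf("%d\n", isCanonicalConstant(1.0f));   // 1
      std::printf("%d\n", isCanonicalConstant(1e-40f)); // 0: denormal
    }
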
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 24f88e632d38..83392a7ab1b2 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -153,7 +153,8 @@ public:
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
- bool canMergeStoresTo(unsigned AS, EVT MemVT) const override;
+ bool canMergeStoresTo(unsigned AS, EVT MemVT,
+ const SelectionDAG &DAG) const override;
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
unsigned Align,
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index b6784ec14e9f..160f8837d49c 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2022,10 +2022,12 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
return nullptr;
case AMDGPU::V_MAC_F16_e64:
IsF16 = true;
+ LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e64:
break;
case AMDGPU::V_MAC_F16_e32:
IsF16 = true;
+ LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e32: {
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::src0);
@@ -4320,6 +4322,24 @@ SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const
return new GCNHazardRecognizer(MF);
}
+std::pair<unsigned, unsigned>
+SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ { MO_GOTPCREL, "amdgpu-gotprel" },
+ { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
+ { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
+ { MO_REL32_LO, "amdgpu-rel32-lo" },
+ { MO_REL32_HI, "amdgpu-rel32-hi" }
+ };
+
+ return makeArrayRef(TargetFlags);
+}
+
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
MI.modifiesRegister(AMDGPU::EXEC, &RI);
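
A note on this hunk: decomposeMachineOperandsTargetFlags feeds MIR serialization, splitting a target flag word into a direct flag in the low MO_MASK bits (named via the string table above) and bitmask flags in the rest; this file lists no bitmask flags, so that half stays zero. A worked standalone example of the split (plain C++; MO_GOTPCREL = 1 as in SIInstrInfo.h):

    #include <cstdio>
    #include <utility>

    constexpr unsigned MO_MASK = 0x7;
    constexpr unsigned MO_GOTPCREL = 1;

    static std::pair<unsigned, unsigned> decompose(unsigned TF) {
      return {TF & MO_MASK, TF & ~MO_MASK}; // {direct flag, bitmask flags}
    }

    int main() {
      auto P = decompose(MO_GOTPCREL);
      std::printf("direct=%u bitmask=%u\n", P.first, P.second); // direct=1 bitmask=0
    }
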
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 74b48c761808..d00c0d4a7f4e 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -100,6 +100,8 @@ protected:
public:
enum TargetOperandFlags {
+ MO_MASK = 0x7,
+
MO_NONE = 0,
// MO_GOTPCREL -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL.
MO_GOTPCREL = 1,
@@ -781,9 +783,15 @@ public:
void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry,
MachineBasicBlock *LoopEnd) const;
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
ArrayRef<std::pair<int, const char *>>
getSerializableTargetIndices() const override;
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+
ScheduleHazardRecognizer *
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAG *DAG) const override;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 4a81fb3b463a..ffb01363e131 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1502,6 +1502,8 @@ def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>;
def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>;
def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>;
+def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>;
+
def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>;
def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>;
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index bb17dbbdfbd6..34886c48f461 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -38,7 +38,7 @@
using namespace llvm;
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 96a18544f02a..874fbadca7f3 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -110,10 +110,8 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
}
const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- const MachineOperand *Src1Mod =
- TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
-
- if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0)))
+ if (Src1 && (!isVGPR(Src1, TRI, MRI) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
return false;
// We don't need to check src0, all input types are legal, so just make sure
@@ -122,58 +120,64 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
return false;
// Check output modifiers
- if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
- return false;
-
- return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
+ return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) &&
+ !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
}
/// \brief This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
-/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
-/// and will only fold literal constants if we are still in SSA.
-static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
+/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
+static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
MachineRegisterInfo &MRI, bool TryToCommute = true) {
-
- if (!MRI.isSSA())
- return;
-
assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
- // Only one literal constant is allowed per instruction, so if src0 is a
- // literal constant then we can't do any folding.
- if (TII->isLiteralConstant(MI, Src0Idx))
- return;
-
// Try to fold Src0
MachineOperand &Src0 = MI.getOperand(Src0Idx);
- if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
+ if (Src0.isReg()) {
unsigned Reg = Src0.getReg();
- MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
- if (Def && Def->isMoveImmediate()) {
- MachineOperand &MovSrc = Def->getOperand(1);
- bool ConstantFolded = false;
-
- if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
- isUInt<32>(MovSrc.getImm()))) {
- Src0.ChangeToImmediate(MovSrc.getImm());
- ConstantFolded = true;
- }
- if (ConstantFolded) {
- if (MRI.use_empty(Reg))
+ if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
+ MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+ if (Def && Def->isMoveImmediate()) {
+ MachineOperand &MovSrc = Def->getOperand(1);
+ bool ConstantFolded = false;
+
+ if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
+ isUInt<32>(MovSrc.getImm()))) {
+ // It's possible to have only one component of a super-reg defined by
+ // a single mov, so we need to clear any subregister flag.
+ Src0.setSubReg(0);
+ Src0.ChangeToImmediate(MovSrc.getImm());
+ ConstantFolded = true;
+ } else if (MovSrc.isFI()) {
+ Src0.setSubReg(0);
+ Src0.ChangeToFrameIndex(MovSrc.getIndex());
+ ConstantFolded = true;
+ }
+
+ if (ConstantFolded) {
+ assert(MRI.use_empty(Reg));
Def->eraseFromParent();
- ++NumLiteralConstantsFolded;
- return;
+ ++NumLiteralConstantsFolded;
+ return true;
+ }
}
}
}
// We have failed to fold src0, so commute the instruction and try again.
- if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
- foldImmediates(MI, TII, MRI, false);
+ if (TryToCommute && MI.isCommutable()) {
+ if (TII->commuteInstruction(MI)) {
+ if (foldImmediates(MI, TII, MRI, false))
+ return true;
+ // Commute back.
+ TII->commuteInstruction(MI);
+ }
+ }
+
+ return false;
}
// Copy MachineOperand with all flags except setting it as implicit.
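
A note on this hunk: foldImmediates now reports success, and on the commuted path it undoes the commute when the fold still fails, leaving the instruction exactly as it was. A standalone mock of that try/undo control flow (plain C++; the fold predicate is a stand-in):

    #include <cstdio>
    #include <utility>

    struct Inst { int Src0, Src1; bool Commutable; };

    static bool tryFoldSrc0(Inst &I) { return I.Src0 == 42; } // stand-in predicate

    static bool foldImmediates(Inst &I, bool TryToCommute = true) {
      if (tryFoldSrc0(I))
        return true;
      if (TryToCommute && I.Commutable) {
        std::swap(I.Src0, I.Src1);    // commuteInstruction
        if (foldImmediates(I, false))
          return true;
        std::swap(I.Src0, I.Src1);    // fold failed even commuted: commute back
      }
      return false;
    }

    int main() {
      Inst I{7, 42, true};
      bool Folded = foldImmediates(I);
      std::printf("%d Src0=%d Src1=%d\n", Folded, I.Src0, I.Src1); // 1 Src0=42 Src1=7
    }
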
diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
index 9908fc003ce7..92fb762ebd73 100644
--- a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -16,7 +16,7 @@
using namespace llvm;
-/// \brief The target which suports all AMD GPUs. This will eventually
+/// \brief The target which supports all AMD GPUs. This will eventually
/// be deprecated and there will be a R600 target and a GCN target.
Target &llvm::getTheAMDGPUTarget() {
static Target TheAMDGPUTarget;
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index 96d343099132..f2de1f995726 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -16,12 +16,21 @@ class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag>
!if(P.HasModifiers, getVOP3PModPat<P, node>.ret, getVOP3Pat<P, node>.ret)
>;
-// Non-packed instructions that use the VOP3P encoding. i.e. where
-// omod/abs are used.
+// Non-packed instructions that use the VOP3P encoding.
+// VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed.
class VOP3_VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
- VOP3P_Pseudo<OpName, P,
- !if(P.HasModifiers, getVOP3ModPat<P, node>.ret, getVOP3Pat<P, node>.ret)
->;
+ VOP3P_Pseudo<OpName, P> {
+ let InOperandList =
+ (ins
+ FP32InputMods:$src0_modifiers, VCSrc_f32:$src0,
+ FP32InputMods:$src1_modifiers, VCSrc_f32:$src1,
+ FP32InputMods:$src2_modifiers, VCSrc_f32:$src2,
+ clampmod:$clamp,
+ op_sel:$op_sel,
+ op_sel_hi:$op_sel_hi);
+ let AsmOperands =
+ " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
+}
let isCommutable = 1 in {
def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
@@ -46,9 +55,12 @@ def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I1
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
// XXX - Commutable?
-def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
-def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
+// These are VOP3a-like opcodes which accept no omod.
+// Size of src arguments (16/32) is controlled by op_sel.
+// For 16-bit src arguments their location (hi/lo) is controlled by op_sel_hi.
+def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_V2F16_V2F16_V2F16>>;
+def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
+def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
multiclass VOP3P_Real_vi<bits<10> op> {
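
A note on this hunk: per the new comments, each 32-bit source of the mix opcodes may hold a full f32 or a packed f16 pair, with op_sel choosing the operand size and op_sel_hi choosing which half a 16-bit source is read from. A standalone sketch of that half-select (plain C++ on raw bits):

    #include <cstdint>
    #include <cstdio>

    // Pick the low or high 16-bit half of a 32-bit register.
    static uint16_t selectHalf(uint32_t Reg, bool Hi) {
      return Hi ? uint16_t(Reg >> 16) : uint16_t(Reg & 0xffffu);
    }

    int main() {
      uint32_t Reg = 0xBEEF1234u;
      std::printf("%04x %04x\n", selectHalf(Reg, false), selectHalf(Reg, true));
      // 1234 beef
    }
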
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index e386f21c2ba4..77b7952b22a8 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -51,12 +51,8 @@ class VOP3Common <dag outs, dag ins, string asm = "",
let VOP3 = 1;
- let AsmMatchConverter =
- !if(!eq(VOP3Only,1),
- "cvtVOP3",
- !if(!eq(HasMods,1), "cvtVOP3_2_mod", ""));
-
let AsmVariantName = AMDGPUAsmVariants.VOP3;
+ let AsmMatchConverter = !if(!eq(HasMods,1), "cvtVOP3", "");
let isCodeGenOnly = 0;
@@ -106,13 +102,11 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let AsmVariantName = AMDGPUAsmVariants.VOP3;
let AsmMatchConverter =
- !if(!eq(VOP3Only,1),
- !if(!and(P.IsPacked, isVOP3P), "cvtVOP3P", "cvtVOP3"),
- !if(!eq(P.HasModifiers, 1),
- "cvtVOP3_2_mod",
- !if(!eq(P.HasOMod, 1), "cvtVOP3OMod", "")
- )
- );
+ !if(!and(P.IsPacked, isVOP3P),
+ "cvtVOP3P",
+ !if(!or(P.HasModifiers, P.HasOMod),
+ "cvtVOP3",
+ ""));
VOPProfile Pfl = P;
}