Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 62
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 29
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 35
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 205
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 171
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 24
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 204
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 61
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULibFunc.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPTNote.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 30
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 24
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 71
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/MIMGInstructions.td | 106
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ISelLowering.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/R600InstrInfo.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/R600Subtarget.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.h | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 418
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 149
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td | 130
-rw-r--r--  llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 137
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 55
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 53
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIModeRegister.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp | 46
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/SOPInstructions.td | 15
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 23
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 36
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 41
70 files changed, 1481 insertions, 916 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index e606f0e8fc3c..806c0b18637a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -610,12 +610,6 @@ def FeatureDsSrc2Insts : SubtargetFeature<"ds-src2-insts",
"Has ds_*_src2 instructions"
>;
-def FeatureRegisterBanking : SubtargetFeature<"register-banking",
- "HasRegisterBanking",
- "true",
- "Has register banking"
->;
-
def FeatureVOP3Literal : SubtargetFeature<"vop3-literal",
"HasVOP3Literal",
"true",
@@ -826,7 +820,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts,
- FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking,
+ FeatureNoSdstCMPX, FeatureVscnt,
FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureGFX10A16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
index 22be014813b0..5ba9b2cd187e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -26,7 +26,7 @@ class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> {
const DataLayout &DL;
public:
- explicit AMDGPUAAResult(const DataLayout &DL) : AAResultBase(), DL(DL) {}
+ explicit AMDGPUAAResult(const DataLayout &DL) : DL(DL) {}
AMDGPUAAResult(AMDGPUAAResult &&Arg)
: AAResultBase(std::move(Arg)), DL(Arg.DL) {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 2f1e7823f65c..cd084fd5440a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -192,8 +192,20 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- if (!SPReg)
- SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);
+ if (!SPReg) {
+ const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>();
+ if (ST.enableFlatScratch()) {
+ // The stack is accessed unswizzled, so we can use a regular copy.
+ SPReg = MIRBuilder.buildCopy(PtrTy,
+ MFI->getStackPtrOffsetReg()).getReg(0);
+ } else {
+ // The address we produce here, without knowing the use context, is going
+ // to be interpreted as a vector address, so we need to convert to a
+ // swizzled address.
+ SPReg = MIRBuilder.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {PtrTy},
+ {MFI->getStackPtrOffsetReg()}).getReg(0);
+ }
+ }
auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);
@@ -615,6 +627,13 @@ bool AMDGPUCallLowering::lowerFormalArguments(
CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
+ // FIXME: This probably isn't defined for mesa
+ if (Info->hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
+ Register FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+ MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
+ CCInfo.AllocateReg(FlatScratchInitReg);
+ }
+
SmallVector<ArgInfo, 32> SplitArgs;
unsigned Idx = 0;
unsigned PSInputNum = 0;
@@ -879,13 +898,17 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
Register InputReg;
if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
NeedWorkItemIDX) {
- InputReg = MRI.createGenericVirtualRegister(S32);
- LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
- std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
+ if (ST.getMaxWorkitemID(MF.getFunction(), 0) != 0) {
+ InputReg = MRI.createGenericVirtualRegister(S32);
+ LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
+ std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
+ } else {
+ InputReg = MIRBuilder.buildConstant(S32, 0).getReg(0);
+ }
}
if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
- NeedWorkItemIDY) {
+ NeedWorkItemIDY && ST.getMaxWorkitemID(MF.getFunction(), 1) != 0) {
Register Y = MRI.createGenericVirtualRegister(S32);
LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
std::get<2>(WorkitemIDY));
@@ -895,7 +918,7 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
}
if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
- NeedWorkItemIDZ) {
+ NeedWorkItemIDZ && ST.getMaxWorkitemID(MF.getFunction(), 2) != 0) {
Register Z = MRI.createGenericVirtualRegister(S32);
LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
std::get<2>(WorkitemIDZ));
@@ -904,16 +927,24 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
}
- if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
+ if (!InputReg &&
+ (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
InputReg = MRI.createGenericVirtualRegister(S32);
-
- // Workitem ids are already packed, any of present incoming arguments will
- // carry all required fields.
- ArgDescriptor IncomingArg = ArgDescriptor::createArg(
- IncomingArgX ? *IncomingArgX :
+ if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
+ // We're in a situation where the outgoing function requires the workitem
+ // ID, but the calling function does not have it (e.g a graphics function
+ // calling a C calling convention function). This is illegal, but we need
+ // to produce something.
+ MIRBuilder.buildUndef(InputReg);
+ } else {
+ // Workitem ids are already packed, any of present incoming arguments will
+ // carry all required fields.
+ ArgDescriptor IncomingArg = ArgDescriptor::createArg(
+ IncomingArgX ? *IncomingArgX :
IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
- LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
- &AMDGPU::VGPR_32RegClass, S32);
+ LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
+ &AMDGPU::VGPR_32RegClass, S32);
+ }
}
if (OutgoingArg->isRegister()) {
@@ -1314,6 +1345,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
}
+ Info.IsTailCall = CanTailCallOpt;
if (CanTailCallOpt)
return lowerTailCall(MIRBuilder, Info, OutArgs);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index a55729586b8d..1920684d8f1f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -150,13 +150,13 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// \returns The minimum number of bits needed to store the value of \Op as an
/// unsigned integer. Truncating to this size and then zero-extending to
- /// ScalarSize will not change the value.
- unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
+ /// the original will not change the value.
+ unsigned numBitsUnsigned(Value *Op) const;
/// \returns The minimum number of bits needed to store the value of \Op as a
/// signed integer. Truncating to this size and then sign-extending to
- /// ScalarSize will not change the value.
- unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
+ /// the original size will not change the value.
+ unsigned numBitsSigned(Value *Op) const;
/// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
/// SelectionDAG has an issue where an and asserting the bits are known
@@ -445,17 +445,12 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
return true;
}
-unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
- unsigned ScalarSize) const {
- KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
- return ScalarSize - Known.countMinLeadingZeros();
+unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op) const {
+ return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
}
-unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
- unsigned ScalarSize) const {
- // In order for this to be a signed 24-bit value, bit 23, must
- // be a sign bit.
- return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC) + 1;
+unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op) const {
+ return ComputeMaxSignificantBits(Op, *DL, 0, AC);
}
static void extractValues(IRBuilder<> &Builder,
@@ -532,12 +527,12 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
unsigned LHSBits = 0, RHSBits = 0;
bool IsSigned = false;
- if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS, Size)) <= 24 &&
- (RHSBits = numBitsUnsigned(RHS, Size)) <= 24) {
+ if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
+ (RHSBits = numBitsUnsigned(RHS)) <= 24) {
IsSigned = false;
- } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS, Size)) <= 24 &&
- (RHSBits = numBitsSigned(RHS, Size)) <= 24) {
+ } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
+ (RHSBits = numBitsSigned(RHS)) <= 24) {
IsSigned = true;
} else
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 699c6c479455..3ac7c45b3275 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -331,8 +331,7 @@ void MetadataStreamerV2::emitKernelArg(const Argument &Arg) {
if (auto PtrTy = dyn_cast<PointerType>(Arg.getType())) {
if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
// FIXME: Should report this for all address spaces
- PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(),
- PtrTy->getElementType());
+ PointeeAlign = Arg.getParamAlign().valueOrOne();
}
}
@@ -731,10 +730,8 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
// FIXME: Need to distinguish in memory alignment from pointer alignment.
if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
- if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(),
- PtrTy->getElementType());
- }
+ if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
+ PointeeAlign = Arg.getParamAlign().valueOrOne();
}
// There's no distinction between byval aggregates and raw aggregates.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 54177564afbc..b9d0655feef7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -51,7 +51,7 @@ unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
// In order for this to be a signed 24-bit value, bit 23, must
// be a sign bit.
- return DAG.ComputeMinSignedBits(Op);
+ return DAG.ComputeMaxSignificantBits(Op);
}
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
@@ -360,6 +360,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f16, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
@@ -1408,6 +1410,11 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
Start != 1)
return Op;
+ if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
+ (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
+ (Start == 0 || Start == 4))
+ return Op;
+
DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
VT.getVectorNumElements());
@@ -4626,11 +4633,12 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
RHSKnown = RHSKnown.trunc(24);
if (Opc == AMDGPUISD::MUL_I24) {
- unsigned LHSValBits = 24 - LHSKnown.countMinSignBits();
- unsigned RHSValBits = 24 - RHSKnown.countMinSignBits();
- unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
- if (MaxValBits >= 32)
+ unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
+ unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
+ unsigned MaxValBits = LHSValBits + RHSValBits;
+ if (MaxValBits > 32)
break;
+ unsigned SignBits = 32 - MaxValBits + 1;
bool LHSNegative = LHSKnown.isNegative();
bool LHSNonNegative = LHSKnown.isNonNegative();
bool LHSPositive = LHSKnown.isStrictlyPositive();
@@ -4639,16 +4647,16 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
bool RHSPositive = RHSKnown.isStrictlyPositive();
if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
- Known.Zero.setHighBits(32 - MaxValBits);
+ Known.Zero.setHighBits(SignBits);
else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
- Known.One.setHighBits(32 - MaxValBits);
+ Known.One.setHighBits(SignBits);
} else {
- unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros();
- unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros();
- unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
+ unsigned LHSValBits = LHSKnown.countMaxActiveBits();
+ unsigned RHSValBits = RHSKnown.countMaxActiveBits();
+ unsigned MaxValBits = LHSValBits + RHSValBits;
if (MaxValBits >= 32)
break;
- Known.Zero.setHighBits(32 - MaxValBits);
+ Known.Zero.setBitsFrom(MaxValBits);
}
break;
}
@@ -4904,7 +4912,8 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
}
}
-bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtactLegal(
+bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
unsigned Opc, LLT Ty1, LLT Ty2) const {
- return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
+ return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
+ Ty2 == LLT::scalar(32);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index daaca8737c5d..b41506157b68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -335,8 +335,8 @@ public:
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
- bool isConstantUnsignedBitfieldExtactLegal(unsigned Opc, LLT Ty1,
- LLT Ty2) const override;
+ bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1,
+ LLT Ty2) const override;
};
namespace AMDGPUISD {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index db84b8766924..4f1d700bcd84 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -58,24 +58,37 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
// Check if a value can be converted to a 16-bit value without losing
// precision.
-static bool canSafelyConvertTo16Bit(Value &V) {
+// The value is expected to be either a float (IsFloat = true) or an unsigned
+// integer (IsFloat = false).
+static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
Type *VTy = V.getType();
if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
// The value is already 16-bit, so we don't want to convert to 16-bit again!
return false;
}
- if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
- // We need to check that if we cast the index down to a half, we do not lose
- // precision.
- APFloat FloatValue(ConstFloat->getValueAPF());
- bool LosesInfo = true;
- FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
- return !LosesInfo;
+ if (IsFloat) {
+ if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
+ // We need to check that if we cast the index down to a half, we do not
+ // lose precision.
+ APFloat FloatValue(ConstFloat->getValueAPF());
+ bool LosesInfo = true;
+ FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
+ &LosesInfo);
+ return !LosesInfo;
+ }
+ } else {
+ if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
+ // We need to check that if we cast the index down to an i16, we do not
+ // lose precision.
+ APInt IntValue(ConstInt->getValue());
+ return IntValue.getActiveBits() <= 16;
+ }
}
+
Value *CastSrc;
- if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
- match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
- match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
+ bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
+ : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
+ if (IsExt) {
Type *CastSrcTy = CastSrc->getType();
if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
return true;
@@ -97,13 +110,116 @@ static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
llvm_unreachable("Should never be called!");
}
+/// Applies Function(II.Args, II.ArgTys) and replaces the intrinsic call with
+/// the modified arguments.
+static Optional<Instruction *> modifyIntrinsicCall(
+ IntrinsicInst &II, unsigned NewIntr, InstCombiner &IC,
+ std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
+ Func) {
+ SmallVector<Type *, 4> ArgTys;
+ if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
+ return None;
+
+ SmallVector<Value *, 8> Args(II.args());
+
+ // Modify arguments and types
+ Func(Args, ArgTys);
+
+ Function *I = Intrinsic::getDeclaration(II.getModule(), NewIntr, ArgTys);
+
+ CallInst *NewCall = IC.Builder.CreateCall(I, Args);
+ NewCall->takeName(&II);
+ NewCall->copyMetadata(II);
+ if (isa<FPMathOperator>(NewCall))
+ NewCall->copyFastMathFlags(&II);
+
+ // Erase and replace uses
+ if (!II.getType()->isVoidTy())
+ IC.replaceInstUsesWith(II, NewCall);
+ return IC.eraseInstFromFunction(II);
+}
+
static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
IntrinsicInst &II, InstCombiner &IC) {
+ // Optimize _L to _LZ when _L is zero
+ if (const auto *LZMappingInfo =
+ AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
+ if (auto *ConstantLod =
+ dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
+ if (ConstantLod->isZero() || ConstantLod->isNegative()) {
+ const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
+ AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
+ ImageDimIntr->Dim);
+ return modifyIntrinsicCall(
+ II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
+ Args.erase(Args.begin() + ImageDimIntr->LodIndex);
+ });
+ }
+ }
+ }
+
+ // Optimize _mip away, when 'lod' is zero
+ if (const auto *MIPMappingInfo =
+ AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
+ if (auto *ConstantMip =
+ dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
+ if (ConstantMip->isZero()) {
+ const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
+ AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
+ ImageDimIntr->Dim);
+ return modifyIntrinsicCall(
+ II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
+ Args.erase(Args.begin() + ImageDimIntr->MipIndex);
+ });
+ }
+ }
+ }
+
+ // Optimize _bias away when 'bias' is zero
+ if (const auto *BiasMappingInfo =
+ AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
+ if (auto *ConstantBias =
+ dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
+ if (ConstantBias->isZero()) {
+ const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
+ AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
+ ImageDimIntr->Dim);
+ return modifyIntrinsicCall(
+ II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
+ Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
+ ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
+ });
+ }
+ }
+ }
+
+ // Optimize _offset away when 'offset' is zero
+ if (const auto *OffsetMappingInfo =
+ AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
+ if (auto *ConstantOffset =
+ dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
+ if (ConstantOffset->isZero()) {
+ const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
+ AMDGPU::getImageDimIntrinsicByBaseOpcode(
+ OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
+ return modifyIntrinsicCall(
+ II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
+ Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
+ });
+ }
+ }
+ }
+
+ // Try to use A16 or G16
if (!ST->hasA16() && !ST->hasG16())
return None;
+ // Address is interpreted as float if the instruction has a sampler or as
+ // unsigned int if there is no sampler.
+ bool HasSampler =
+ AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
bool FloatCoord = false;
// true means derivatives can be converted to 16 bit, coordinates not
bool OnlyDerivatives = false;
@@ -112,7 +228,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
Value *Coord = II.getOperand(OperandIndex);
// If the values are not derived from 16-bit values, we cannot optimize.
- if (!canSafelyConvertTo16Bit(*Coord)) {
+ if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
if (OperandIndex < ImageDimIntr->CoordStart ||
ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
return None;
@@ -127,43 +243,50 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
FloatCoord = Coord->getType()->isFloatingPointTy();
}
- if (OnlyDerivatives) {
- if (!ST->hasG16())
- return None;
- } else {
- if (!ST->hasA16())
- OnlyDerivatives = true; // Only supports G16
+ if (!OnlyDerivatives && !ST->hasA16())
+ OnlyDerivatives = true; // Only supports G16
+
+ // Check if there is a bias parameter and if it can be converted to f16
+ if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
+ Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
+ assert(HasSampler &&
+ "Only image instructions with a sampler can have a bias");
+ if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
+ OnlyDerivatives = true;
}
+ if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
+ ImageDimIntr->CoordStart))
+ return None;
+
Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
: Type::getInt16Ty(II.getContext());
- SmallVector<Type *, 4> ArgTys;
- if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
- return None;
-
- ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
- if (!OnlyDerivatives)
- ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
- Function *I =
- Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);
+ return modifyIntrinsicCall(
+ II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
+ ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
+ if (!OnlyDerivatives) {
+ ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
- SmallVector<Value *, 8> Args(II.args());
+ // Change the bias type
+ if (ImageDimIntr->NumBiasArgs != 0)
+ ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
+ }
- unsigned EndIndex =
- OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
- for (unsigned OperandIndex = ImageDimIntr->GradientStart;
- OperandIndex < EndIndex; OperandIndex++) {
- Args[OperandIndex] =
- convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
- }
+ unsigned EndIndex =
+ OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
+ for (unsigned OperandIndex = ImageDimIntr->GradientStart;
+ OperandIndex < EndIndex; OperandIndex++) {
+ Args[OperandIndex] =
+ convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
+ }
- CallInst *NewCall = IC.Builder.CreateCall(I, Args);
- NewCall->takeName(&II);
- NewCall->copyMetadata(II);
- if (isa<FPMathOperator>(NewCall))
- NewCall->copyFastMathFlags(&II);
- return IC.replaceInstUsesWith(II, NewCall);
+ // Convert the bias
+ if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
+ Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
+ Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
+ }
+ });
}
bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index b1263618c5db..e7ee36447682 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -20,9 +20,6 @@
namespace llvm {
class GCNSubtarget;
-class MachineFunction;
-class MachineInstr;
-class MachineInstrBuilder;
class MachineMemOperand;
class AMDGPUInstrInfo {
@@ -52,6 +49,9 @@ struct ImageDimIntrinsicInfo {
unsigned BaseOpcode;
MIMGDim Dim;
+ uint8_t NumOffsetArgs;
+ uint8_t NumBiasArgs;
+ uint8_t NumZCompareArgs;
uint8_t NumGradients;
uint8_t NumDmask;
uint8_t NumData;
@@ -60,6 +60,9 @@ struct ImageDimIntrinsicInfo {
uint8_t DMaskIndex;
uint8_t VAddrStart;
+ uint8_t OffsetIndex;
+ uint8_t BiasIndex;
+ uint8_t ZCompareIndex;
uint8_t GradientStart;
uint8_t CoordStart;
uint8_t LodIndex;
@@ -71,6 +74,7 @@ struct ImageDimIntrinsicInfo {
uint8_t TexFailCtrlIndex;
uint8_t CachePolicyIndex;
+ uint8_t BiasTyArg;
uint8_t GradientTyArg;
uint8_t CoordTyArg;
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index e16bead81b65..b7d0f0580cda 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -46,8 +46,7 @@ static cl::opt<bool> AllowRiskySelect(
AMDGPUInstructionSelector::AMDGPUInstructionSelector(
const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
const AMDGPUTargetMachine &TM)
- : InstructionSelector(), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
+ : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
STI(STI),
EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
@@ -1103,7 +1102,18 @@ bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
const DebugLoc &DL = I.getDebugLoc();
Register SrcReg = I.getOperand(2).getReg();
unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
+
auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
+ if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(Pred))) {
+ MachineInstr *ICmp =
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
+
+ if (!RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
+ *TRI.getBoolRC(), *MRI))
+ return false;
+ I.eraseFromParent();
+ return true;
+ }
int Opcode = getV_CMPOpcode(Pred, Size);
if (Opcode == -1)
@@ -1234,7 +1244,7 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
// Get the return address reg and mark it as an implicit live-in
Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
- AMDGPU::SReg_64RegClass);
+ AMDGPU::SReg_64RegClass, DL);
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
.addReg(LiveIn);
I.eraseFromParent();
@@ -1494,9 +1504,9 @@ static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
if (TexFailCtrl)
IsTexFail = true;
- TFE = (TexFailCtrl & 0x1) ? 1 : 0;
+ TFE = (TexFailCtrl & 0x1) ? true : false;
TexFailCtrl &= ~(uint64_t)0x1;
- LWE = (TexFailCtrl & 0x2) ? 1 : 0;
+ LWE = (TexFailCtrl & 0x2) ? true : false;
TexFailCtrl &= ~(uint64_t)0x2;
return TexFailCtrl == 0;
@@ -1511,10 +1521,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
- const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
- AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
- const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
- AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
@@ -1523,7 +1529,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
Register VDataIn, VDataOut;
LLT VDataTy;
int NumVDataDwords = -1;
- bool IsD16 = false;
+ bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
+ MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
bool Unorm;
if (!BaseOpcode->Sampler)
@@ -1572,16 +1579,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
- // One memoperand is mandatory, except for getresinfo.
- // FIXME: Check this in verifier.
- if (!MI.memoperands_empty()) {
- const MachineMemOperand *MMO = *MI.memoperands_begin();
-
- // Infer d16 from the memory size, as the register type will be mangled by
- // unpacked subtargets, or by TFE.
- IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
- }
-
if (BaseOpcode->Store) {
VDataIn = MI.getOperand(1).getReg();
VDataTy = MRI->getType(VDataIn);
@@ -1596,26 +1593,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
}
}
- // Optimize _L to _LZ when _L is zero
- if (LZMappingInfo) {
- // The legalizer replaced the register with an immediate 0 if we need to
- // change the opcode.
- const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
- if (Lod.isImm()) {
- assert(Lod.getImm() == 0);
- IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
- }
- }
-
- // Optimize _mip away, when 'lod' is zero
- if (MIPMappingInfo) {
- const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex);
- if (Lod.isImm()) {
- assert(Lod.getImm() == 0);
- IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
- }
- }
-
// Set G16 opcode
if (IsG16 && !IsA16) {
const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
@@ -2562,6 +2539,8 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
Register MaskReg = I.getOperand(2).getReg();
LLT Ty = MRI->getType(DstReg);
LLT MaskTy = MRI->getType(MaskReg);
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
@@ -2570,6 +2549,24 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
if (DstRB != SrcRB) // Should only happen for hand written MIR.
return false;
+ // Try to avoid emitting a bit operation when we only need to touch half of
+ // the 64-bit pointer.
+ APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
+ const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
+ const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
+
+ const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
+ const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
+
+ if (!IsVGPR && Ty.getSizeInBits() == 64 &&
+ !CanCopyLow32 && !CanCopyHi32) {
+ auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
+ .addReg(SrcReg)
+ .addReg(MaskReg);
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ }
+
unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
const TargetRegisterClass &RegRC
= IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
@@ -2586,8 +2583,6 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
!RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
return false;
- MachineBasicBlock *BB = I.getParent();
- const DebugLoc &DL = I.getDebugLoc();
if (Ty.getSizeInBits() == 32) {
assert(MaskTy.getSizeInBits() == 32 &&
"ptrmask should have been narrowed during legalize");
@@ -2610,13 +2605,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
Register MaskedLo, MaskedHi;
- // Try to avoid emitting a bit operation when we only need to touch half of
- // the 64-bit pointer.
- APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
-
- const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
- const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
- if ((MaskOnes & MaskLo32) == MaskLo32) {
+ if (CanCopyLow32) {
// If all the bits in the low half are 1, we only need a copy for it.
MaskedLo = LoReg;
} else {
@@ -2631,7 +2620,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
.addReg(MaskLo);
}
- if ((MaskOnes & MaskHi32) == MaskHi32) {
+ if (CanCopyHi32) {
// If all the bits in the high half are 1, we only need a copy for it.
MaskedHi = HiReg;
} else {
@@ -3123,6 +3112,33 @@ bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
return true;
}
+bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (IsVALU) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
+ .addImm(Subtarget->getWavefrontSizeLog2())
+ .addReg(SrcReg);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
+ .addReg(SrcReg)
+ .addImm(Subtarget->getWavefrontSizeLog2());
+ }
+
+ const TargetRegisterClass &RC =
+ IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
+ if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
+ return false;
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
if (I.isPHI())
return selectPHI(I);
@@ -3236,7 +3252,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_SHUFFLE_VECTOR:
return selectG_SHUFFLE_VECTOR(I);
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
- case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
const AMDGPU::ImageDimIntrinsicInfo *Intr
= AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
assert(Intr && "not an image intrinsic with image pseudo");
@@ -3252,6 +3270,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case AMDGPU::G_SI_CALL:
I.setDesc(TII.get(AMDGPU::SI_CALL));
return true;
+ case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
+ return selectWaveAddress(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -3896,20 +3916,59 @@ bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits;
}
+// Return the wave level SGPR base address if this is a wave address.
+static Register getWaveAddress(const MachineInstr *Def) {
+ return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
+ ? Def->getOperand(1).getReg()
+ : Register();
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
MachineOperand &Root) const {
- MachineInstr *MI = Root.getParent();
- MachineBasicBlock *MBB = MI->getParent();
+ Register Reg = Root.getReg();
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+
+ const MachineInstr *Def = MRI->getVRegDef(Reg);
+ if (Register WaveBase = getWaveAddress(Def)) {
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(Info->getScratchRSrcReg());
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ MIB.addReg(WaveBase);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
+ }};
+ }
int64_t Offset = 0;
+
+ // FIXME: Copy check is a hack
+ Register BasePtr;
+ if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) {
+ if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
+ return {};
+ const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr);
+ Register WaveBase = getWaveAddress(BasePtrDef);
+ if (!WaveBase)
+ return {};
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(Info->getScratchRSrcReg());
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ MIB.addReg(WaveBase);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
+ }};
+ }
+
if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
return {};
- const MachineFunction *MF = MBB->getParent();
- const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
-
return {{
[=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 26996e42af53..42095332d11a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -30,7 +30,6 @@ namespace AMDGPU {
struct ImageDimIntrinsicInfo;
}
-class AMDGPUInstrInfo;
class AMDGPURegisterBankInfo;
class AMDGPUTargetMachine;
class BlockFrequencyInfo;
@@ -42,7 +41,6 @@ class MachineOperand;
class MachineRegisterInfo;
class RegisterBank;
class SIInstrInfo;
-class SIMachineFunctionInfo;
class SIRegisterInfo;
class TargetRegisterClass;
@@ -147,6 +145,7 @@ private:
bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp,
MachineOperand &DataOp) const;
bool selectBVHIntrinsic(MachineInstr &I) const;
+ bool selectWaveAddress(MachineInstr &I) const;
std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
bool AllowAbs = true) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 0528b552f475..7d3dbfd7e851 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -18,6 +18,7 @@ class AddressSpacesImpl {
int Local = 3;
int Constant = 4;
int Private = 5;
+ int Constant32Bit = 6;
}
def AddrSpaces : AddressSpacesImpl;
@@ -405,18 +406,23 @@ class Aligned<int Bytes> {
int MinAlignment = Bytes;
}
-class StoreHi16<SDPatternOperator op> : PatFrag <
+class StoreHi16<SDPatternOperator op, ValueType vt> : PatFrag <
(ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr)> {
let IsStore = 1;
+ let MemoryVT = vt;
}
-def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant ]>;
-def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global, AddrSpaces.Constant ]>;
+def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant,
+ AddrSpaces.Constant32Bit ]>;
+def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global,
+ AddrSpaces.Constant,
+ AddrSpaces.Constant32Bit ]>;
def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global ]>;
-def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat,
- AddrSpaces.Global,
- AddrSpaces.Constant ]>;
+def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat,
+ AddrSpaces.Global,
+ AddrSpaces.Constant,
+ AddrSpaces.Constant32Bit ]>;
def StoreAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, AddrSpaces.Global ]>;
def LoadAddress_private : AddressSpaceList<[ AddrSpaces.Private ]>;
@@ -522,9 +528,9 @@ def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr),
let MemoryVT = i16;
}
-def store_hi16_#as : StoreHi16 <truncstorei16>;
-def truncstorei8_hi16_#as : StoreHi16<truncstorei8>;
-def truncstorei16_hi16_#as : StoreHi16<truncstorei16>;
+def store_hi16_#as : StoreHi16 <truncstorei16, i16>;
+def truncstorei8_hi16_#as : StoreHi16<truncstorei8, i8>;
+def truncstorei16_hi16_#as : StoreHi16<truncstorei16, i16>;
defm atomic_store_#as : binary_atomic_op<atomic_store>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 5046daaed977..04c6f67ed339 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -272,8 +272,8 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
unsigned RegSize = Ty.getSizeInBits();
- unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
- unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
+ uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
+ uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
unsigned AS = Query.Types[1].getAddressSpace();
// All of these need to be custom lowered to cast the pointer operand.
@@ -380,7 +380,7 @@ static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
/// access up to the alignment. Note this case when the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
- unsigned AlignInBits, unsigned AddrSpace,
+ uint64_t AlignInBits, unsigned AddrSpace,
unsigned Opcode) {
unsigned SizeInBits = MemoryTy.getSizeInBits();
// We don't want to widen cases that are naturally legal.
@@ -929,10 +929,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_CTPOP)
.legalFor({{S32, S32}, {S32, S64}})
.clampScalar(0, S32, S32)
+ .widenScalarToNextPow2(1, 32)
.clampScalar(1, S32, S64)
.scalarize(0)
- .widenScalarToNextPow2(0, 32)
- .widenScalarToNextPow2(1, 32);
+ .widenScalarToNextPow2(0, 32);
+
// The hardware instructions return a different result on 0 than the generic
// instructions expect. The hardware produces -1, but these produce the
@@ -1172,7 +1173,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (MemSize > MaxSize)
return std::make_pair(0, LLT::scalar(MaxSize));
- unsigned Align = Query.MMODescrs[0].AlignInBits;
+ uint64_t Align = Query.MMODescrs[0].AlignInBits;
return std::make_pair(0, LLT::scalar(Align));
})
.fewerElementsIf(
@@ -1295,6 +1296,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasAtomicFaddInsts())
Atomic.legalFor({{S32, GlobalPtr}});
+ if (ST.hasGFX90AInsts()) {
+ // These are legal with some caveats, and should have undergone expansion in
+ // the IR in most situations
+ // TODO: Move atomic expansion into legalizer
+ // TODO: Also supports <2 x f16>
+ Atomic.legalFor({
+ {S32, GlobalPtr},
+ {S64, GlobalPtr},
+ {S64, FlatPtr}
+ });
+ }
+
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
// demarshalling
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
@@ -1345,8 +1358,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}, changeTo(1, S16));
Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
Shifts.clampScalar(1, S32, S32);
- Shifts.clampScalar(0, S16, S64);
Shifts.widenScalarToNextPow2(0, 16);
+ Shifts.clampScalar(0, S16, S64);
getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
.minScalar(0, S16)
@@ -1357,8 +1370,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// expansion for the shifted type will produce much worse code if it hasn't
// been truncated already.
Shifts.clampScalar(1, S32, S32);
- Shifts.clampScalar(0, S32, S64);
Shifts.widenScalarToNextPow2(0, 32);
+ Shifts.clampScalar(0, S32, S64);
getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
.minScalar(0, S32)
@@ -1812,6 +1825,27 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
+/// Return true if the value is a known valid address, such that a null check is
+/// not necessary.
+static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
+ const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
+ MachineInstr *Def = MRI.getVRegDef(Val);
+ switch (Def->getOpcode()) {
+ case AMDGPU::G_FRAME_INDEX:
+ case AMDGPU::G_GLOBAL_VALUE:
+ case AMDGPU::G_BLOCK_ADDR:
+ return true;
+ case AMDGPU::G_CONSTANT: {
+ const ConstantInt *CI = Def->getOperand(1).getCImm();
+ return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
+ }
+ default:
+ return false;
+ }
+
+ return false;
+}
+
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -1862,6 +1896,14 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS);
+
+ if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+ // Extract low 32-bits of the pointer.
+ B.buildExtract(Dst, Src, 0);
+ MI.eraseFromParent();
+ return true;
+ }
+
unsigned NullVal = TM.getNullPointerValue(DestAS);
auto SegmentNull = B.buildConstant(DstTy, NullVal);
@@ -1884,24 +1926,29 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (!ST.hasFlatAddressSpace())
return false;
- auto SegmentNull =
- B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
- auto FlatNull =
- B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
-
Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
if (!ApertureReg.isValid())
return false;
- auto CmpRes =
- B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
-
// Coerce the type of the low half of the result so we can use merge_values.
Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
+
+ if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+ B.buildCopy(Dst, BuildPtr);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
+ auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
+
+ auto CmpRes =
+ B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
+
B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
MI.eraseFromParent();
@@ -1959,6 +2006,7 @@ bool AMDGPULegalizerInfo::legalizeFceil(
// TODO: Should this propagate fast-math-flags?
B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
+ MI.eraseFromParent();
return true;
}
@@ -2213,10 +2261,12 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
LLT EltTy = VecTy.getElementType();
assert(EltTy == MRI.getType(Dst));
- if (IdxVal < VecTy.getNumElements())
- B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
- else
+ if (IdxVal < VecTy.getNumElements()) {
+ auto Unmerge = B.buildUnmerge(EltTy, Vec);
+ B.buildCopy(Dst, Unmerge.getReg(IdxVal));
+ } else {
B.buildUndef(Dst);
+ }
MI.eraseFromParent();
return true;
@@ -2245,11 +2295,20 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
LLT VecTy = MRI.getType(Vec);
LLT EltTy = VecTy.getElementType();
assert(EltTy == MRI.getType(Ins));
+ (void)Ins;
- if (IdxVal < VecTy.getNumElements())
- B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
- else
+ unsigned NumElts = VecTy.getNumElements();
+ if (IdxVal < NumElts) {
+ SmallVector<Register, 8> SrcRegs;
+ for (unsigned i = 0; i < NumElts; ++i)
+ SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
+ B.buildUnmerge(SrcRegs, Vec);
+
+ SrcRegs[IdxVal] = MI.getOperand(2).getReg();
+ B.buildMerge(Dst, SrcRegs);
+ } else {
B.buildUndef(Dst);
+ }
MI.eraseFromParent();
return true;
@@ -2502,7 +2561,7 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
const LLT MemTy = MMO->getMemoryType();
const Align MemAlign = MMO->getAlign();
const unsigned MemSize = MemTy.getSizeInBits();
- const unsigned AlignInBits = 8 * MemAlign.value();
+ const uint64_t AlignInBits = 8 * MemAlign.value();
// Widen non-power-of-2 loads to the alignment if needed
if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
@@ -2832,8 +2891,8 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
assert(DstReg.isVirtual() && "Virtual register expected");
- Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
- ArgTy);
+ Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
+ *ArgRC, B.getDebugLoc(), ArgTy);
if (Arg->isMasked()) {
// TODO: Should we try to emit this once in the entry block?
const LLT S32 = LLT::scalar(32);
@@ -2842,6 +2901,8 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
Register AndMaskSrc = LiveIn;
+ // TODO: Avoid clearing the high bits if we know workitem id y/z are always
+ // 0.
if (Shift != 0) {
auto ShiftAmt = B.buildConstant(S32, Shift);
AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
@@ -4106,7 +4167,6 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
- case Intrinsic::amdgcn_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
@@ -4213,15 +4273,18 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
if ((I < Intr->GradientStart) ||
(I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
(I >= Intr->CoordStart && !IsA16)) {
- // Handle any gradient or coordinate operands that should not be packed
if ((I < Intr->GradientStart) && IsA16 &&
(B.getMRI()->getType(AddrReg) == S16)) {
+ assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
// Special handling of bias when A16 is on. Bias is of type half but
// occupies full 32-bit.
PackedAddrs.push_back(
B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
.getReg(0));
} else {
+ assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
+ "Bias needs to be converted to 16 bit in A16 mode");
+ // Handle any gradient or coordinate operands that should not be packed
AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
PackedAddrs.push_back(AddrReg);
}
@@ -4320,6 +4383,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
const LLT V2S16 = LLT::fixed_vector(2, 16);
unsigned DMask = 0;
+ Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
+ LLT Ty = MRI->getType(VData);
// Check for 16 bit addresses and pack if true.
LLT GradTy =
@@ -4328,6 +4393,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
const bool IsG16 = GradTy == S16;
const bool IsA16 = AddrTy == S16;
+ const bool IsD16 = Ty.getScalarType() == S16;
int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
@@ -4347,8 +4413,11 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
Observer.changingInstr(MI);
auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
- unsigned NewOpcode = NumDefs == 0 ?
- AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
+ const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
+ : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
+ const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
+ : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
+ unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
// Track that we legalized this
MI.setDesc(B.getTII().get(NewOpcode));
@@ -4381,44 +4450,6 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
- // Optimize _L to _LZ when _L is zero
- if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
- AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
- const ConstantFP *ConstantLod;
-
- if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
- m_GFCst(ConstantLod))) {
- if (ConstantLod->isZero() || ConstantLod->isNegative()) {
- // Set new opcode to _lz variant of _l, and change the intrinsic ID.
- const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
- AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
- Intr->Dim);
-
- // The starting indexes should remain in the same place.
- --CorrectedNumVAddrs;
-
- MI.getOperand(MI.getNumExplicitDefs())
- .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
- MI.RemoveOperand(ArgOffset + Intr->LodIndex);
- Intr = NewImageDimIntr;
- }
- }
- }
-
- // Optimize _mip away, when 'lod' is zero
- if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
- int64_t ConstantLod;
- if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
- m_ICst(ConstantLod))) {
- if (ConstantLod == 0) {
- // TODO: Change intrinsic opcode and remove operand instead or replacing
- // it with 0, as the _L to _LZ handling is done above.
- MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
- --CorrectedNumVAddrs;
- }
- }
- }
-
// Rewrite the addressing register layout before doing anything else.
if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
// 16 bit gradients are supported, but are tied to the A16 control
@@ -4494,9 +4525,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (BaseOpcode->Store) { // No TFE for stores?
// TODO: Handle dmask trim
- Register VData = MI.getOperand(1).getReg();
- LLT Ty = MRI->getType(VData);
- if (!Ty.isVector() || Ty.getElementType() != S16)
+ if (!Ty.isVector() || !IsD16)
return true;
Register RepackedReg = handleD16VData(B, *MRI, VData, true);
@@ -4508,9 +4537,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
}
Register DstReg = MI.getOperand(0).getReg();
- LLT Ty = MRI->getType(DstReg);
const LLT EltTy = Ty.getScalarType();
- const bool IsD16 = Ty.getScalarType() == S16;
const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
// Confirm that the return type is large enough for the dmask specified
@@ -4918,6 +4945,12 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
return true;
}
+static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
+ B.buildConstant(MI.getOperand(0).getReg(), C);
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
@@ -5021,12 +5054,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_implicitarg_ptr:
return legalizeImplicitArgPtr(MI, MRI, B);
case Intrinsic::amdgcn_workitem_id_x:
+ if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
+ return replaceWithConstant(B, MI, 0);
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKITEM_ID_X);
case Intrinsic::amdgcn_workitem_id_y:
+ if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
+ return replaceWithConstant(B, MI, 0);
+
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
case Intrinsic::amdgcn_workitem_id_z:
+ if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
+ return replaceWithConstant(B, MI, 0);
+
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
case Intrinsic::amdgcn_workgroup_id_x:
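The three workitem-ID cases above share one early-out: when the subtarget can prove that the maximum workitem ID in a dimension is zero (for instance from a known work-group size of 1 in that dimension, which is an assumption about how getMaxWorkitemID derives its answer), the intrinsic folds to a constant 0 via the new replaceWithConstant helper instead of reading the preloaded argument. A minimal standalone sketch of that decision, with maxWorkitemID standing in for ST.getMaxWorkitemID:

    #include <cassert>
    #include <iostream>

    // Hypothetical stand-in for GCNSubtarget::getMaxWorkitemID(F, Dim): largest
    // workitem ID possible in dimension Dim, derived here from a required
    // work-group size (all sizes assumed >= 1).
    unsigned maxWorkitemID(const unsigned ReqdWorkGroupSize[3], unsigned Dim) {
      assert(Dim < 3 && ReqdWorkGroupSize[Dim] >= 1);
      return ReqdWorkGroupSize[Dim] - 1;
    }

    // Models the legalizer choice: fold amdgcn.workitem.id.<dim> to 0 when the
    // ID can only ever be 0, otherwise fall back to the preloaded argument.
    bool foldsToZero(const unsigned ReqdWorkGroupSize[3], unsigned Dim) {
      return maxWorkitemID(ReqdWorkGroupSize, Dim) == 0;
    }

    int main() {
      const unsigned WG[3] = {64, 1, 1}; // e.g. a (64, 1, 1) work-group size
      std::cout << foldsToZero(WG, 0) << foldsToZero(WG, 1) << foldsToZero(WG, 2)
                << '\n'; // prints 011: Y and Z fold to constant 0
    }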
@@ -5105,16 +5146,29 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_struct_buffer_atomic_inc:
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
- case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
- case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
- case Intrinsic::amdgcn_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
return legalizeBufferAtomic(MI, B, IntrID);
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
+ Function &F = B.getMF().getFunction();
+ DiagnosticInfoUnsupported NoFpRet(
+ F, "return versions of fp atomics not supported", B.getDebugLoc(),
+ DS_Error);
+ F.getContext().diagnose(NoFpRet);
+ B.buildUndef(DstReg);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ return legalizeBufferAtomic(MI, B, IntrID);
+ }
case Intrinsic::amdgcn_atomic_inc:
return legalizeAtomicIncDec(MI, B, true);
case Intrinsic::amdgcn_atomic_dec:
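The new buffer_atomic_fadd handling above keeps the no-return form available everywhere the intrinsic is legal, but using the returned value requires gfx90a; otherwise the compiler diagnoses "return versions of fp atomics not supported" and substitutes undef. A simplified model of that decision (an illustrative sketch, not the legalizer interface):

    // Outcome of legalizing raw/struct buffer_atomic_fadd.
    enum class FAddLowering { NoReturnAtomic, ReturningAtomic, UndefAfterError };

    FAddLowering lowerBufferAtomicFAdd(bool ResultIsUsed, bool HasGFX90AInsts) {
      if (!ResultIsUsed)
        return FAddLowering::NoReturnAtomic;   // fine on all supporting targets
      if (HasGFX90AInsts)
        return FAddLowering::ReturningAtomic;  // gfx90a can return the old value
      // Diagnose and replace the dead result with undef, as in the hunk above.
      return FAddLowering::UndefAfterError;
    }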
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 7faf0436f995..964a41d3d740 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -21,7 +21,6 @@
namespace llvm {
class GCNTargetMachine;
-class LLVMContext;
class GCNSubtarget;
class MachineIRBuilder;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 49cf6db5197f..c28427758ac7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -58,9 +58,6 @@ private:
// "FuncName" exists. It may create a new function prototype in pre-link mode.
FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);
- // Replace a normal function with its native version.
- bool replaceWithNative(CallInst *CI, const FuncInfo &FInfo);
-
bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);
bool TDOFold(CallInst *CI, const FuncInfo &FInfo);
@@ -90,24 +87,6 @@ private:
double& Res1, Constant *copr0, Constant *copr1, Constant *copr2);
bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);
- // exp
- bool fold_exp(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
- // exp2
- bool fold_exp2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
- // exp10
- bool fold_exp10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
- // log
- bool fold_log(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
- // log2
- bool fold_log2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
- // log10
- bool fold_log10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
// sqrt
bool fold_sqrt(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
@@ -623,7 +602,8 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
Function *Callee = CI->getCalledFunction();
// Ignore indirect calls.
- if (Callee == 0) return false;
+ if (Callee == nullptr)
+ return false;
BasicBlock *BB = CI->getParent();
LLVMContext &Context = CI->getParent()->getContext();
@@ -778,27 +758,6 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
return false;
}
-bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) {
- Module *M = CI->getModule();
- if (getArgType(FInfo) != AMDGPULibFunc::F32 ||
- FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
- !HasNative(FInfo.getId()))
- return false;
-
- AMDGPULibFunc nf = FInfo;
- nf.setPrefix(AMDGPULibFunc::NATIVE);
- if (FunctionCallee FPExpr = getFunction(M, nf)) {
- LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> ");
-
- CI->setCalledFunction(FPExpr);
-
- LLVM_DEBUG(dbgs() << *CI << '\n');
-
- return true;
- }
- return false;
-}
-
// [native_]half_recip(c) ==> 1.0/c
bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B,
const FuncInfo &FInfo) {
@@ -1402,8 +1361,8 @@ AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
Function *UCallee = UI->getCalledFunction();
Type *RetType = UCallee->getReturnType();
B.SetInsertPoint(&*ItNew);
- AllocaInst *Alloc = B.CreateAlloca(RetType, 0,
- std::string(prefix) + UI->getName());
+ AllocaInst *Alloc =
+ B.CreateAlloca(RetType, nullptr, std::string(prefix) + UI->getName());
Alloc->setAlignment(
Align(UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType)));
return Alloc;
@@ -1724,7 +1683,8 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
// Ignore indirect calls.
Function *Callee = CI->getCalledFunction();
- if (Callee == 0) continue;
+ if (Callee == nullptr)
+ continue;
LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
dbgs().flush());
@@ -1757,7 +1717,7 @@ PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
// Ignore indirect calls.
Function *Callee = CI->getCalledFunction();
- if (Callee == 0)
+ if (Callee == nullptr)
continue;
LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
@@ -1783,9 +1743,10 @@ bool AMDGPUUseNativeCalls::runOnFunction(Function &F) {
// Ignore indirect calls.
Function *Callee = CI->getCalledFunction();
- if (Callee == 0) continue;
+ if (Callee == nullptr)
+ continue;
- if(Simplifier.useNative(CI))
+ if (Simplifier.useNative(CI))
Changed = true;
}
}
@@ -1811,7 +1772,7 @@ PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
// Ignore indirect calls.
Function *Callee = CI->getCalledFunction();
- if (Callee == 0)
+ if (Callee == nullptr)
continue;
if (Simplifier.useNative(CI))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
index c97223b047e8..dc0ac72016f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -10,6 +10,7 @@
#define _AMDGPU_LIBFUNC_H_
#include "llvm/ADT/StringRef.h"
+#include <memory>
namespace llvm {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 0c743a77092c..593388a4d819 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -15,9 +15,8 @@
using namespace llvm;
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
- : MachineFunctionInfo(), Mode(MF.getFunction()),
- IsEntryFunction(
- AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
+ : Mode(MF.getFunction()), IsEntryFunction(AMDGPU::isEntryFunctionCC(
+ MF.getFunction().getCallingConv())),
IsModuleEntryFunction(
AMDGPU::isModuleEntryFunctionCC(MF.getFunction().getCallingConv())),
NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 10ff50040c6a..48cf46b5f871 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -15,8 +15,6 @@
namespace llvm {
-class GCNSubtarget;
-
class AMDGPUMachineFunction : public MachineFunctionInfo {
/// A map to keep track of local memory objects and their offsets within the
/// local memory space.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h b/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
index 8af7979dba8b..5cefc83e25e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -29,4 +29,4 @@ const char NoteNameV3[] = "AMDGPU";
} // End namespace ElfNote
} // End namespace AMDGPU
} // End namespace llvm
-#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUNOTETYPE_H
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index 7c4eb71882c7..f91f31508ad2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -463,7 +463,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
WhatToStore.push_back(Arg);
}
} else if (isa<FixedVectorType>(ArgType)) {
- Type *IType = NULL;
+ Type *IType = nullptr;
uint32_t EleCount = cast<FixedVectorType>(ArgType)->getNumElements();
uint32_t EleSize = ArgType->getScalarSizeInBits();
uint32_t TotalSize = EleCount * EleSize;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index f9a9fe403ff6..2d8126a49327 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -789,6 +789,17 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
Align Alignment =
DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
+
+ // HIP uses an extern unsized array in local address space for dynamically
+ // allocated shared memory. In that case, we have to disable the promotion.
+ if (GV->hasExternalLinkage() && AllocSize == 0) {
+ LocalMemLimit = 0;
+ LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
+ "local memory. Promoting to local memory "
+ "disabled.\n");
+ return false;
+ }
+
AllocatedSizes.emplace_back(AllocSize, Alignment);
}
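The hunk above keys off a zero allocation size: HIP's dynamically sized shared memory appears as an external, unsized LDS array, and any reference to it makes the remaining LDS budget unknowable, so promotion to LDS is disabled for the whole function. A small standalone model of the budget check, with names invented for illustration and alignment ignored:

    #include <cstdint>
    #include <vector>

    struct LDSGlobal {
      uint64_t AllocSize;       // DL.getTypeAllocSize(GV->getValueType())
      bool HasExternalLinkage;
    };

    // Returns the LDS budget left for promotion, or 0 if any referenced global
    // is an extern, unsized array (dynamic shared memory) -- mirroring the new
    // early return in hasSufficientLocalMem().
    uint64_t remainingLDSBudget(uint64_t LocalMemLimit,
                                const std::vector<LDSGlobal> &Used) {
      uint64_t Reserved = 0;
      for (const LDSGlobal &GV : Used) {
        if (GV.HasExternalLinkage && GV.AllocSize == 0)
          return 0; // dynamically allocated shared memory: disable promotion
        Reserved += GV.AllocSize;
      }
      return Reserved > LocalMemLimit ? 0 : LocalMemLimit - Reserved;
    }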
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 3ce67a733c10..0df6f4d45b06 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -36,6 +36,7 @@ protected:
MachineIRBuilder &B;
MachineFunction &MF;
MachineRegisterInfo &MRI;
+ const GCNSubtarget &Subtarget;
const RegisterBankInfo &RBI;
const TargetRegisterInfo &TRI;
const SIInstrInfo &TII;
@@ -44,9 +45,9 @@ protected:
public:
AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
: B(B), MF(B.getMF()), MRI(*B.getMRI()),
- RBI(*MF.getSubtarget().getRegBankInfo()),
- TRI(*MF.getSubtarget().getRegisterInfo()),
- TII(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()), Helper(Helper){};
+ Subtarget(MF.getSubtarget<GCNSubtarget>()),
+ RBI(*Subtarget.getRegBankInfo()), TRI(*Subtarget.getRegisterInfo()),
+ TII(*Subtarget.getInstrInfo()), Helper(Helper){};
bool isVgprRegBank(Register Reg);
Register getAsVgpr(Register Reg);
@@ -193,7 +194,10 @@ bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3(
MachineInstr &MI, Med3MatchInfo &MatchInfo) {
Register Dst = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(Dst);
- if (Ty != LLT::scalar(16) && Ty != LLT::scalar(32))
+
+ // med3 for f16 is only available on gfx9+, and not available for v2f16.
+ if ((Ty != LLT::scalar(16) || !Subtarget.hasMed3_16()) &&
+ Ty != LLT::scalar(32))
return false;
auto OpcodeTriple = getMinMaxPair(MI.getOpcode());
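The added type check restricts the min/max-to-med3 combine to 32-bit scalars everywhere and to 16-bit scalars only when the subtarget provides a 16-bit med3 (gfx9+); packed v2f16 is never matched. As a standalone predicate, with types reduced to bit widths and hasMed3_16 to a plain flag:

    #include <iostream>

    // Models the legality test in matchFPMinMaxToMed3.
    bool canUseMed3(unsigned ScalarBits, bool IsVector, bool HasMed3_16) {
      if (IsVector)             // no med3 for packed v2f16 operations
        return false;
      if (ScalarBits == 32)
        return true;
      return ScalarBits == 16 && HasMed3_16; // f16 med3 needs gfx9+
    }

    int main() {
      std::cout << canUseMed3(32, false, false)  // 1
                << canUseMed3(16, false, false)  // 0: pre-gfx9
                << canUseMed3(16, false, true)   // 1
                << canUseMed3(16, true, true)    // 0: v2f16
                << '\n';
    }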
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c60012bcfe2e..de2dccef804a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -718,8 +718,11 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
const unsigned WaveAndOpc = Subtarget.isWave32() ?
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
- const unsigned MovTermOpc = Subtarget.isWave32() ?
- AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
+ const unsigned MovExecOpc =
+ Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ const unsigned MovExecTermOpc =
+ Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
+
const unsigned XorTermOpc = Subtarget.isWave32() ?
AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
@@ -996,12 +999,12 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
// Save the EXEC mask before the loop.
- BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
+ BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
.addReg(ExecReg);
// Restore the EXEC mask after the loop.
B.setMBB(*RestoreExecBB);
- B.buildInstr(MovTermOpc)
+ B.buildInstr(MovExecTermOpc)
.addDef(ExecReg)
.addReg(SaveExecReg);
@@ -2953,7 +2956,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
break;
}
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
- case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
const AMDGPU::RsrcIntrinsic *RSrcIntrin
= AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
assert(RSrcIntrin && RSrcIntrin->IsImage);
@@ -3691,6 +3696,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
break;
}
+ case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
+ // This case is weird because we expect a physical register in the source,
+ // but need to set a bank anyway.
+ //
+ // We could select the result to SGPR or VGPR, but for the one current use
+ // it's more practical to always use VGPR.
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ break;
+ }
case AMDGPU::G_INSERT: {
unsigned BankID = getMappingType(MRI, MI);
unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
@@ -4078,7 +4093,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mqsad_pk_u16_u8:
case Intrinsic::amdgcn_mqsad_u32_u8:
case Intrinsic::amdgcn_cvt_pk_u8_f32:
- case Intrinsic::amdgcn_alignbit:
case Intrinsic::amdgcn_alignbyte:
case Intrinsic::amdgcn_perm:
case Intrinsic::amdgcn_fdot2:
@@ -4276,7 +4290,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
- case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
auto IntrID = MI.getIntrinsicID();
const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index 45f7c2f369bd..1c6c63dd5b25 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -353,7 +353,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
// off any return attributes, e.g. zeroext doesn't make sense with a struct.
NewFunc->stealArgumentListFrom(F);
- AttrBuilder RetAttrs;
+ AttributeMask RetAttrs;
RetAttrs.addAttribute(Attribute::SExt);
RetAttrs.addAttribute(Attribute::ZExt);
RetAttrs.addAttribute(Attribute::NoAlias);
@@ -433,7 +433,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
PointerType *ArgType = cast<PointerType>(Arg.getType());
- auto *EltTy = ArgType->getElementType();
+ auto *EltTy = ArgType->getPointerElementType();
const auto Align =
DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index cd05797fdbdb..e82f9232b114 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -269,7 +269,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasGetWaveIdInst(false),
HasSMemTimeInst(false),
HasShaderCyclesRegister(false),
- HasRegisterBanking(false),
HasVOP3Literal(false),
HasNoDataDepHazard(false),
FlatAddressSpace(false),
@@ -772,11 +771,11 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
}
unsigned
-GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
+GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
- if (HasFlatScratchInit || HasArchitectedFlatScratch) {
+ if (HasFlatScratch || HasArchitectedFlatScratch) {
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
@@ -794,20 +793,11 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
}
unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
- // The logic to detect if the function has
- // flat scratch init is slightly different than how
- // SIMachineFunctionInfo constructor derives.
- // We don't use amdgpu-calls, amdgpu-stack-objects
- // attributes and isAmdHsaOrMesa here as it doesn't really matter.
- // TODO: Outline this derivation logic and have just
- // one common function in the backend to avoid duplication.
- bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
- bool FunctionHasFlatScratchInit = false;
- if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
- enableFlatScratch()) {
- FunctionHasFlatScratchInit = true;
- }
- return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
+ // In principle we do not need to reserve the SGPR pair used for flat_scratch
+ // if we know flat instructions do not access the stack anywhere in the
+ // program. For now assume it's needed if we have flat instructions.
+ const bool KernelUsesFlatScratch = hasFlatAddressSpace();
+ return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}
unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
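The simplification above drops the per-function flat-scratch-init derivation and reserves the flat_scratch SGPRs whenever the subtarget has flat instructions at all. The per-generation counts from getBaseReservedNumSGPRs are easy to tabulate; a sketch of the reservation follows, with the XNACK-only case taken from the unchanged tail of the function and therefore an assumption here:

    enum class Gen { SEA_ISLANDS, VOLCANIC_ISLANDS, GFX9, GFX10 };

    // SGPRs reserved for VCC, FLAT_SCRATCH and XNACK, by generation, modeled on
    // getBaseReservedNumSGPRs().
    unsigned reservedSGPRs(Gen G, bool NeedsFlatScratch, bool XNackEnabled) {
      if (G >= Gen::GFX10)
        return 2;                  // VCC only; FLAT_SCRATCH and XNACK left SGPRs
      if (NeedsFlatScratch) {
        if (G >= Gen::VOLCANIC_ISLANDS)
          return 6;                // FLAT_SCRATCH, XNACK, VCC
        if (G == Gen::SEA_ISLANDS)
          return 4;                // FLAT_SCRATCH, VCC
      }
      return XNackEnabled ? 4 : 2; // XNACK + VCC, or just VCC
    }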
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 88ed4b2b7a24..7f1b94be4ffe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -212,7 +212,19 @@ public:
/// Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset(const Function &F) const {
- return isAmdHsaOrMesa(F) ? 0 : 36;
+ switch (TargetTriple.getOS()) {
+ case Triple::AMDHSA:
+ case Triple::AMDPAL:
+ case Triple::Mesa3D:
+ return 0;
+ case Triple::UnknownOS:
+ default:
+ // For legacy reasons unknown/other is treated as a different version of
+ // mesa.
+ return 36;
+ }
+
+ llvm_unreachable("invalid triple OS");
}
/// \returns Maximum number of work groups per compute unit supported by the
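The switch above replaces the old isAmdHsaOrMesa() test: the HSA, PAL and Mesa3D ABIs place the first explicit kernel argument at offset 0, while unknown/other OSes keep the legacy 36-byte offset of the old Mesa path. A trivial standalone equivalent, keyed on the OS name instead of the Triple enum:

    #include <string>

    // Models getExplicitKernelArgOffset(): byte offset of the first explicit
    // kernel argument in the kernarg buffer, keyed on the target OS.
    unsigned explicitKernelArgOffset(const std::string &OS) {
      if (OS == "amdhsa" || OS == "amdpal" || OS == "mesa3d")
        return 0;
      // For legacy reasons unknown/other is treated as a different version of
      // mesa, with the explicit arguments starting 36 bytes in.
      return 36;
    }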
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 226646a96953..dd3676f3b707 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -21,8 +21,6 @@
namespace llvm {
-class ScheduleDAGMILive;
-
//===----------------------------------------------------------------------===//
// AMDGPU Target Machine (R600+)
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 09c5eb192e1f..a8df7789c8a1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -844,15 +844,8 @@ bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
TLI->ComputeConstraintToUse(TC, SDValue());
- Register AssignedReg;
- const TargetRegisterClass *RC;
- std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
- TRI, TC.ConstraintCode, TC.ConstraintVT);
- if (AssignedReg) {
- // FIXME: This is a workaround for getRegForInlineAsmConstraint
- // returning VS_32
- RC = TRI->getPhysRegClass(AssignedReg);
- }
+ const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
+ TRI, TC.ConstraintCode, TC.ConstraintVT).second;
// For AGPR constraints null is returned on subtargets without AGPRs, so
// assume divergent for null.
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 2bb59086f391..c1c88d9a7462 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -62,7 +62,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
public:
AMDGPUOperand(KindTy Kind_, const AMDGPUAsmParser *AsmParser_)
- : MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {}
+ : Kind(Kind_), AsmParser(AsmParser_) {}
using Ptr = std::unique_ptr<AMDGPUOperand>;
@@ -1548,6 +1548,7 @@ private:
bool validateVccOperand(unsigned Reg) const;
bool validateVOPLiteral(const MCInst &Inst, const OperandVector &Operands);
bool validateMAIAccWrite(const MCInst &Inst, const OperandVector &Operands);
+ bool validateMFMA(const MCInst &Inst, const OperandVector &Operands);
bool validateAGPRLdSt(const MCInst &Inst) const;
bool validateVGPRAlign(const MCInst &Inst) const;
bool validateGWS(const MCInst &Inst, const OperandVector &Operands);
@@ -3613,6 +3614,40 @@ bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst,
return true;
}
+bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst,
+ const OperandVector &Operands) {
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::IsMAI) == 0)
+ return true;
+
+ const int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (Src2Idx == -1)
+ return true;
+
+ const MCOperand &Src2 = Inst.getOperand(Src2Idx);
+ if (!Src2.isReg())
+ return true;
+
+ MCRegister Src2Reg = Src2.getReg();
+ MCRegister DstReg = Inst.getOperand(0).getReg();
+ if (Src2Reg == DstReg)
+ return true;
+
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ if (TRI->getRegClass(Desc.OpInfo[0].RegClass).getSizeInBits() <= 128)
+ return true;
+
+ if (isRegIntersect(Src2Reg, DstReg, TRI)) {
+ Error(getRegLoc(mc2PseudoReg(Src2Reg), Operands),
+ "source 2 operand must not partially overlap with dst");
+ return false;
+ }
+
+ return true;
+}
+
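validateMFMA above rejects an MFMA whose src2 (accumulator input) overlaps but does not exactly match the destination register tuple, and only when the destination is wider than 128 bits. Modeling register tuples as [start, start+size) index ranges makes the check easy to see; this is a sketch, not the MCRegisterInfo query the parser actually uses:

    #include <iostream>

    struct RegTuple { unsigned Start; unsigned Size; }; // consecutive 32-bit regs

    // True if the tuples share at least one register but are not identical --
    // the case validateMFMA() diagnoses for wide MFMA destinations.
    bool partiallyOverlaps(RegTuple Src2, RegTuple Dst) {
      if (Src2.Start == Dst.Start && Src2.Size == Dst.Size)
        return false;                                  // src2 == dst is allowed
      unsigned S2End = Src2.Start + Src2.Size, DEnd = Dst.Start + Dst.Size;
      return Src2.Start < DEnd && Dst.Start < S2End;   // ranges intersect
    }

    int main() {
      std::cout << partiallyOverlaps({0, 16}, {0, 16})   // 0: identical, OK
                << partiallyOverlaps({8, 16}, {0, 16})   // 1: partial overlap
                << partiallyOverlaps({32, 16}, {0, 16})  // 0: disjoint, OK
                << '\n';
    }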
bool AMDGPUAsmParser::validateDivScale(const MCInst &Inst) {
switch (Inst.getOpcode()) {
default:
@@ -4297,6 +4332,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
if (!validateMAIAccWrite(Inst, Operands)) {
return false;
}
+ if (!validateMFMA(Inst, Operands)) {
+ return false;
+ }
if (!validateCoherencyBits(Inst, Operands, IDLoc)) {
return false;
}
@@ -4568,7 +4606,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
uint64_t AccumOffset = 0;
SMRange SGPRRange;
uint64_t NextFreeSGPR = 0;
- unsigned UserSGPRCount = 0;
+
+ // Count the number of user SGPRs implied from the enabled feature bits.
+ unsigned ImpliedUserSGPRCount = 0;
+
+ // Track if the asm explicitly contains the directive for the user SGPR
+ // count.
+ Optional<unsigned> ExplicitUserSGPRCount;
bool ReserveVCC = true;
bool ReserveFlatScr = true;
Optional<bool> EnableWavefrontSize32;
@@ -4617,6 +4661,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (!isUInt<sizeof(KD.kernarg_size) * CHAR_BIT>(Val))
return OutOfRangeError(ValRange);
KD.kernarg_size = Val;
+ } else if (ID == ".amdhsa_user_sgpr_count") {
+ ExplicitUserSGPRCount = Val;
} else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") {
if (hasArchitectedFlatScratch())
return Error(IDRange.Start,
@@ -4626,31 +4672,31 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
Val, ValRange);
if (Val)
- UserSGPRCount += 4;
+ ImpliedUserSGPRCount += 4;
} else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val,
ValRange);
if (Val)
- UserSGPRCount += 2;
+ ImpliedUserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_queue_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val,
ValRange);
if (Val)
- UserSGPRCount += 2;
+ ImpliedUserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
Val, ValRange);
if (Val)
- UserSGPRCount += 2;
+ ImpliedUserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_dispatch_id") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val,
ValRange);
if (Val)
- UserSGPRCount += 2;
+ ImpliedUserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
if (hasArchitectedFlatScratch())
return Error(IDRange.Start,
@@ -4660,13 +4706,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
ValRange);
if (Val)
- UserSGPRCount += 2;
+ ImpliedUserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_private_segment_size") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
Val, ValRange);
if (Val)
- UserSGPRCount += 1;
+ ImpliedUserSGPRCount += 1;
} else if (ID == ".amdhsa_wavefront_size32") {
if (IVersion.Major < 10)
return Error(IDRange.Start, "directive requires gfx10+", IDRange);
@@ -4850,6 +4896,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT,
SGPRBlocks);
+ if (ExplicitUserSGPRCount && ImpliedUserSGPRCount > *ExplicitUserSGPRCount)
+ return TokError("amdgpu_user_sgpr_count smaller than than implied by "
+ "enabled user SGPRs");
+
+ unsigned UserSGPRCount =
+ ExplicitUserSGPRCount ? *ExplicitUserSGPRCount : ImpliedUserSGPRCount;
+
if (!isUInt<COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH>(UserSGPRCount))
return TokError("too many user SGPRs enabled");
AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT,
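The new .amdhsa_user_sgpr_count handling above separates the count implied by the enabled user-SGPR features from an explicitly written directive value, and errors out if the explicit value is too small to cover the implied registers. The arithmetic is simple enough to model standalone; the feature widths follow the hunk (4 SGPRs for the private segment buffer, 2 each for dispatch ptr, queue ptr, kernarg segment ptr, dispatch id and flat scratch init, 1 for private segment size):

    #include <optional>
    #include <stdexcept>

    struct UserSGPRFeatures {
      bool PrivateSegmentBuffer, DispatchPtr, QueuePtr, KernargSegmentPtr,
           DispatchId, FlatScratchInit, PrivateSegmentSize;
    };

    // Number of user SGPRs implied by the enabled features (mirrors the
    // ImpliedUserSGPRCount accumulation in ParseDirectiveAMDHSAKernel).
    unsigned impliedUserSGPRCount(const UserSGPRFeatures &F) {
      unsigned N = 0;
      N += F.PrivateSegmentBuffer ? 4 : 0;
      N += F.DispatchPtr ? 2 : 0;
      N += F.QueuePtr ? 2 : 0;
      N += F.KernargSegmentPtr ? 2 : 0;
      N += F.DispatchId ? 2 : 0;
      N += F.FlatScratchInit ? 2 : 0;
      N += F.PrivateSegmentSize ? 1 : 0;
      return N;
    }

    // Final count written into COMPUTE_PGM_RSRC2_USER_SGPR_COUNT: the explicit
    // directive wins if present, but may not be smaller than what is implied.
    unsigned finalUserSGPRCount(const UserSGPRFeatures &F,
                                std::optional<unsigned> Explicit) {
      unsigned Implied = impliedUserSGPRCount(F);
      if (Explicit && Implied > *Explicit)
        throw std::runtime_error(
            ".amdhsa_user_sgpr_count smaller than implied by enabled user SGPRs");
      return Explicit ? *Explicit : Implied;
    }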
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 104b5160b985..c4043177b618 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -89,7 +89,6 @@ class DS_Real <DS_Pseudo ps> :
!if(!or(ps.has_data0, ps.has_gws_data0), data0{9}, 0));
}
-
// DS Pseudo instructions
class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c7ec5308e6d0..c530d3cb49f0 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -915,7 +915,7 @@ class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueT
class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : GCNPat <
(vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
- (inst $vaddr, $data, $offset)
+ (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 0f8dd0b3bf58..c0592f6f3c7a 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -95,7 +95,9 @@ static bool isDGEMM(unsigned Opcode) {
return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
- Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
+ Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64 ||
+ Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64 ||
+ Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64;
}
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
@@ -1438,7 +1440,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
if (!Use.isReg())
continue;
- unsigned Reg = Use.getReg();
+ Register Reg = Use.getReg();
bool FullReg;
const MachineInstr *MI1;
@@ -1477,6 +1479,8 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
switch (Opc1) {
case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
+ case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
+ case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
if (!isXDL(ST, *MI))
NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
break;
@@ -1509,6 +1513,8 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
switch (Opc1) {
case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
+ case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
+ case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
break;
case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 162121c2c525..716bc027a894 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -25,7 +25,6 @@ class MachineFunction;
class MachineInstr;
class MachineOperand;
class MachineRegisterInfo;
-class ScheduleDAG;
class SIInstrInfo;
class SIRegisterInfo;
class GCNSubtarget;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 82c09378acac..fb106d98c162 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -27,7 +27,7 @@ void llvm::printLivesAt(SlotIndex SI,
<< *LIS.getInstructionFromIndex(SI);
unsigned Num = 0;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- const unsigned Reg = Register::index2VirtReg(I);
+ const Register Reg = Register::index2VirtReg(I);
if (!LIS.hasInterval(Reg))
continue;
const auto &LI = LIS.getInterval(Reg);
@@ -487,7 +487,7 @@ void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
const MachineRegisterInfo &MRI) {
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- unsigned Reg = Register::index2VirtReg(I);
+ Register Reg = Register::index2VirtReg(I);
auto It = LiveRegs.find(Reg);
if (It != LiveRegs.end() && It->second.any())
OS << ' ' << printVRegOrUnit(Reg, TRI) << ':'
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 53d6ff0aa731..a6e42ad3dfca 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -140,4 +140,4 @@ public:
} // End namespace llvm
-#endif // GCNSCHEDSTRATEGY_H
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index d8bc0b2df2bd..0cd2cfa2f0e7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -153,7 +153,6 @@ protected:
bool HasGetWaveIdInst;
bool HasSMemTimeInst;
bool HasShaderCyclesRegister;
- bool HasRegisterBanking;
bool HasVOP3Literal;
bool HasNoDataDepHazard;
bool FlatAddressSpace;
@@ -723,10 +722,6 @@ public:
return HasShaderCyclesRegister;
}
- bool hasRegisterBanking() const {
- return HasRegisterBanking;
- }
-
bool hasVOP3Literal() const {
return HasVOP3Literal;
}
@@ -1029,7 +1024,7 @@ public:
/// \returns Reserved number of SGPRs. This is common
/// utility function called by MachineFunction and
/// Function variants of getReservedNumSGPRs.
- unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const;
+ unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
/// \returns Reserved number of SGPRs for given machine function \p MF.
unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index b68b4b12e750..76663b563150 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1397,21 +1397,26 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
unsigned Vmcnt, Expcnt, Lgkmcnt;
decodeWaitcnt(ISA, SImm16, Vmcnt, Expcnt, Lgkmcnt);
+ bool IsDefaultVmcnt = Vmcnt == getVmcntBitMask(ISA);
+ bool IsDefaultExpcnt = Expcnt == getExpcntBitMask(ISA);
+ bool IsDefaultLgkmcnt = Lgkmcnt == getLgkmcntBitMask(ISA);
+ bool PrintAll = IsDefaultVmcnt && IsDefaultExpcnt && IsDefaultLgkmcnt;
+
bool NeedSpace = false;
- if (Vmcnt != getVmcntBitMask(ISA)) {
+ if (!IsDefaultVmcnt || PrintAll) {
O << "vmcnt(" << Vmcnt << ')';
NeedSpace = true;
}
- if (Expcnt != getExpcntBitMask(ISA)) {
+ if (!IsDefaultExpcnt || PrintAll) {
if (NeedSpace)
O << ' ';
O << "expcnt(" << Expcnt << ')';
NeedSpace = true;
}
- if (Lgkmcnt != getLgkmcntBitMask(ISA)) {
+ if (!IsDefaultLgkmcnt || PrintAll) {
if (NeedSpace)
O << ' ';
O << "lgkmcnt(" << Lgkmcnt << ')';
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 7708579a4491..ded3fb7ab8d9 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -15,8 +15,7 @@
using namespace llvm;
AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT,
- const MCTargetOptions &Options)
- : MCAsmInfoELF() {
+ const MCTargetOptions &Options) {
CodePointerSize = (TT.getArch() == Triple::amdgcn) ? 8 : 4;
StackGrowsUp = true;
HasSingleParameterDotFile = false;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 9a9a2c973f44..9578bdb0bad0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -319,6 +319,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
<< KD.private_segment_fixed_size << '\n';
OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n';
+ PRINT_FIELD(OS, ".amdhsa_user_sgpr_count", KD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_USER_SGPR_COUNT);
+
if (!hasArchitectedFlatScratch(STI))
PRINT_FIELD(
OS, ".amdhsa_user_sgpr_private_segment_buffer", KD,
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 6dd886367302..cf03fd682143 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -131,6 +131,38 @@ def MIMGMIPMappingTable : GenericTable {
let PrimaryKeyName = "getMIMGMIPMappingInfo";
}
+class MIMGBiasMapping<MIMGBaseOpcode bias, MIMGBaseOpcode nobias> {
+ MIMGBaseOpcode Bias = bias;
+ MIMGBaseOpcode NoBias = nobias;
+}
+
+def MIMGBiasMappingTable : GenericTable {
+ let FilterClass = "MIMGBiasMapping";
+ let CppTypeName = "MIMGBiasMappingInfo";
+ let Fields = ["Bias", "NoBias"];
+ string TypeOf_Bias = "MIMGBaseOpcode";
+ string TypeOf_NoBias = "MIMGBaseOpcode";
+
+ let PrimaryKey = ["Bias"];
+ let PrimaryKeyName = "getMIMGBiasMappingInfo";
+}
+
+class MIMGOffsetMapping<MIMGBaseOpcode offset, MIMGBaseOpcode nooffset> {
+ MIMGBaseOpcode Offset = offset;
+ MIMGBaseOpcode NoOffset = nooffset;
+}
+
+def MIMGOffsetMappingTable : GenericTable {
+ let FilterClass = "MIMGOffsetMapping";
+ let CppTypeName = "MIMGOffsetMappingInfo";
+ let Fields = ["Offset", "NoOffset"];
+ string TypeOf_Offset = "MIMGBaseOpcode";
+ string TypeOf_NoOffset = "MIMGBaseOpcode";
+
+ let PrimaryKey = ["Offset"];
+ let PrimaryKeyName = "getMIMGOffsetMappingInfo";
+}
+
class MIMGG16Mapping<MIMGBaseOpcode g, MIMGBaseOpcode g16> {
MIMGBaseOpcode G = g;
MIMGBaseOpcode G16 = g16;
@@ -1070,6 +1102,9 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
AMDGPUDimProps Dim = I.P.Dim;
AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>;
+ bits<8> NumOffsetArgs = DimEval.NumOffsetArgs;
+ bits<8> NumBiasArgs = DimEval.NumBiasArgs;
+ bits<8> NumZCompareArgs = DimEval.NumZCompareArgs;
bits<8> NumGradients = DimEval.NumGradientArgs;
bits<8> NumDmask = DimEval.NumDmaskArgs;
bits<8> NumData = DimEval.NumDataArgs;
@@ -1078,6 +1113,9 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
bits<8> DMaskIndex = DimEval.DmaskArgIndex;
bits<8> VAddrStart = DimEval.VAddrArgIndex;
+ bits<8> OffsetIndex = DimEval.OffsetArgIndex;
+ bits<8> BiasIndex = DimEval.BiasArgIndex;
+ bits<8> ZCompareIndex = DimEval.ZCompareArgIndex;
bits<8> GradientStart = DimEval.GradientArgIndex;
bits<8> CoordStart = DimEval.CoordArgIndex;
bits<8> LodIndex = DimEval.LodArgIndex;
@@ -1089,6 +1127,8 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
bits<8> TexFailCtrlIndex = DimEval.TexFailCtrlArgIndex;
bits<8> CachePolicyIndex = DimEval.CachePolicyArgIndex;
+ bits<8> BiasTyArg = !add(I.P.NumRetAndDataAnyTypes,
+ !if(!eq(NumOffsetArgs, 0), 0, I.P.ExtraAddrArgs[0].Type.isAny));
bits<8> GradientTyArg = !add(I.P.NumRetAndDataAnyTypes,
!foldl(0, I.P.ExtraAddrArgs, cnt, arg, !add(cnt, arg.Type.isAny)));
bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0));
@@ -1096,10 +1136,10 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
def ImageDimIntrinsicTable : GenericTable {
let FilterClass = "ImageDimIntrinsicInfo";
- let Fields = ["Intr", "BaseOpcode", "Dim", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs",
- "DMaskIndex", "VAddrStart", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd",
+ let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs",
+ "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd",
"RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex",
- "GradientTyArg", "CoordTyArg"];
+ "BiasTyArg", "GradientTyArg", "CoordTyArg"];
string TypeOf_BaseOpcode = "MIMGBaseOpcode";
string TypeOf_Dim = "MIMGDim";
@@ -1132,6 +1172,66 @@ def : MIMGLZMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_LZ_O>;
def : MIMGMIPMapping<IMAGE_LOAD_MIP, IMAGE_LOAD>;
def : MIMGMIPMapping<IMAGE_STORE_MIP, IMAGE_STORE>;
+// Bias to NoBias Optimization Mapping
+def : MIMGBiasMapping<IMAGE_SAMPLE_B, IMAGE_SAMPLE>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_B_CL, IMAGE_SAMPLE_CL>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B, IMAGE_SAMPLE_C>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_CL, IMAGE_SAMPLE_C_CL>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_B_O, IMAGE_SAMPLE_O>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_B_CL_O, IMAGE_SAMPLE_CL_O>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_O, IMAGE_SAMPLE_C_O>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_CL_O, IMAGE_SAMPLE_C_CL_O>;
+def : MIMGBiasMapping<IMAGE_GATHER4_B, IMAGE_GATHER4>;
+def : MIMGBiasMapping<IMAGE_GATHER4_B_CL, IMAGE_GATHER4_CL>;
+def : MIMGBiasMapping<IMAGE_GATHER4_C_B, IMAGE_GATHER4_C>;
+def : MIMGBiasMapping<IMAGE_GATHER4_C_B_CL, IMAGE_GATHER4_C_CL>;
+def : MIMGBiasMapping<IMAGE_GATHER4_B_O, IMAGE_GATHER4_O>;
+def : MIMGBiasMapping<IMAGE_GATHER4_B_CL_O, IMAGE_GATHER4_CL_O>;
+def : MIMGBiasMapping<IMAGE_GATHER4_C_B_O, IMAGE_GATHER4_C_O>;
+def : MIMGBiasMapping<IMAGE_GATHER4_C_B_CL_O, IMAGE_GATHER4_C_CL_O>;
+
+// Offset to NoOffset Optimization Mapping
+def : MIMGOffsetMapping<IMAGE_SAMPLE_O, IMAGE_SAMPLE>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CL_O, IMAGE_SAMPLE_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_D_O, IMAGE_SAMPLE_D>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_D_CL_O, IMAGE_SAMPLE_D_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_D_O_G16, IMAGE_SAMPLE_D_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_D_CL_O_G16, IMAGE_SAMPLE_D_CL_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_L_O, IMAGE_SAMPLE_L>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_B_O, IMAGE_SAMPLE_B>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_B_CL_O, IMAGE_SAMPLE_B_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_LZ_O, IMAGE_SAMPLE_LZ>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_O, IMAGE_SAMPLE_C>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CL_O, IMAGE_SAMPLE_C_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_O, IMAGE_SAMPLE_C_D>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_CL_O, IMAGE_SAMPLE_C_D_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_O_G16, IMAGE_SAMPLE_C_D_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_CL_O_G16, IMAGE_SAMPLE_C_D_CL_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_L_O, IMAGE_SAMPLE_C_L>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_B_CL_O, IMAGE_SAMPLE_C_B_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_B_O, IMAGE_SAMPLE_C_B>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_LZ_O, IMAGE_SAMPLE_C_LZ>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_O, IMAGE_GATHER4>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_CL_O, IMAGE_GATHER4_CL>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_L_O, IMAGE_GATHER4_L>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_B_O, IMAGE_GATHER4_B>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_B_CL_O, IMAGE_GATHER4_B_CL>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_LZ_O, IMAGE_GATHER4_LZ>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_C_O, IMAGE_GATHER4_C>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_C_CL_O, IMAGE_GATHER4_C_CL>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_L>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_C_B_O, IMAGE_GATHER4_C_B>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_C_B_CL_O, IMAGE_GATHER4_C_B_CL>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_C_LZ_O, IMAGE_GATHER4_C_LZ>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_O, IMAGE_SAMPLE_CD>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_CL_O, IMAGE_SAMPLE_CD_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_O, IMAGE_SAMPLE_C_CD>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_CL_O, IMAGE_SAMPLE_C_CD_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_O_G16, IMAGE_SAMPLE_CD_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_CL_O_G16, IMAGE_SAMPLE_CD_CL_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_O_G16, IMAGE_SAMPLE_C_CD_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_CL_O_G16, IMAGE_SAMPLE_C_CD_CL_G16>;
+
// G to G16 Optimization Mapping
def : MIMGG16Mapping<IMAGE_SAMPLE_D, IMAGE_SAMPLE_D_G16>;
def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL, IMAGE_SAMPLE_D_CL_G16>;
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index f9a9a6127322..1e75a0432ec3 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -19,7 +19,6 @@
namespace llvm {
-class R600InstrInfo;
class R600Subtarget;
class R600TargetLowering final : public AMDGPUTargetLowering {
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h
index fc567f1a1fca..bc8a4786df77 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h
@@ -29,7 +29,6 @@ enum : uint64_t {
};
}
-class AMDGPUTargetMachine;
class DFAPacketizer;
class MachineFunction;
class MachineInstr;
diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.h b/llvm/lib/Target/AMDGPU/R600Subtarget.h
index 94403b88f21a..92d559b1f8e6 100644
--- a/llvm/lib/Target/AMDGPU/R600Subtarget.h
+++ b/llvm/lib/Target/AMDGPU/R600Subtarget.h
@@ -21,12 +21,6 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
-namespace llvm {
-
-class MCInstrInfo;
-
-} // namespace llvm
-
#define GET_SUBTARGETINFO_HEADER
#include "R600GenSubtargetInfo.inc"
diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 397b2f873515..b81fac36fc95 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -245,6 +245,12 @@ Value *SIAnnotateControlFlow::handleLoopCondition(
return CallInst::Create(IfBreak, Args, "", Insert);
}
+ if (isa<Argument>(Cond)) {
+ Instruction *Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime();
+ Value *Args[] = { Cond, Broken };
+ return CallInst::Create(IfBreak, Args, "", Insert);
+ }
+
llvm_unreachable("Unhandled loop condition!");
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 580e4bc417a4..107ee5ed5532 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -379,6 +379,8 @@ enum Id { // HwRegCode, (6) [5:0]
ID_FLAT_SCR_LO = 20,
ID_FLAT_SCR_HI = 21,
ID_XNACK_MASK = 22,
+ ID_HW_ID1 = 23,
+ ID_HW_ID2 = 24,
ID_POPS_PACKER = 25,
ID_SHADER_CYCLES = 29,
ID_SYMBOLIC_FIRST_GFX1030_ = ID_SHADER_CYCLES,
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 1f93284fc7ee..33954e11d6c6 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -300,6 +300,13 @@ static bool updateOperand(FoldCandidate &Fold,
assert(!Fold.needsShrink() && "not handled");
if (Fold.isImm()) {
+ if (Old.isTied()) {
+ int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
+ if (NewMFMAOpc == -1)
+ return false;
+ MI->setDesc(TII.get(NewMFMAOpc));
+ MI->untieRegOperand(0);
+ }
Old.ChangeToImmediate(Fold.ImmToFold);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index d4fe74ecb96e..6078f4a0577a 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1195,7 +1195,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
}
} else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
TII->isLoadFromStackSlot(MI, FrameIndex))
- NonVGPRSpillFIs.set(FrameIndex);
+ if (!MFI.isFixedObjectIndex(FrameIndex))
+ NonVGPRSpillFIs.set(FrameIndex);
}
}
@@ -1320,16 +1321,14 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
const BitVector AllSavedRegs = SavedRegs;
SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
- // If clearing VGPRs changed the mask, we will have some CSR VGPR spills.
- const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs;
-
// We have to anticipate introducing CSR VGPR spills or spill of caller
// save VGPR reserved for SGPR spills as we now always create stack entry
- // for it, if we don't have any stack objects already, since we require
- // an FP if there is a call and stack.
+ // for it, if we don't have any stack objects already, since we require an
+ // FP if there is a call and stack. We will allocate a VGPR for SGPR spills
+ // if there are any SGPR spills, whether they are CSR spills or otherwise.
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
const bool WillHaveFP =
- FrameInfo.hasCalls() && (HaveAnyCSRVGPR || MFI->VGPRReservedForSGPRSpill);
+ FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
// FP will be specially managed like SP.
if (WillHaveFP || hasFP(MF))
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 56fbb875ffd9..7949dcfa6632 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -13,11 +13,6 @@
namespace llvm {
-class SIInstrInfo;
-class SIMachineFunctionInfo;
-class SIRegisterInfo;
-class GCNSubtarget;
-
class SIFrameLowering final : public AMDGPUFrameLowering {
public:
SIFrameLowering(StackDirection D, Align StackAl, int LAO,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9f138136e6e9..561866b5a398 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -45,10 +45,6 @@ static cl::opt<bool> DisableLoopAlignment(
cl::desc("Do not align and prefetch loops"),
cl::init(false));
-static cl::opt<bool> VGPRReserveforSGPRSpill(
- "amdgpu-reserve-vgpr-for-sgpr-spill",
- cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true));
-
static cl::opt<bool> UseDivergentRegisterIndexing(
"amdgpu-use-divergent-register-indexing",
cl::Hidden,
@@ -138,6 +134,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
+ addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
+ addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
@@ -273,7 +271,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
- MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) {
+ MVT::v8i16, MVT::v8f16, MVT::v16i64, MVT::v16f64,
+ MVT::v32i32, MVT::v32f32 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -615,7 +614,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (STI.hasMadF16())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
- for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
+ for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
+ MVT::v8f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -677,6 +677,21 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v4f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
+ setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
+
+ setOperationAction(ISD::STORE, MVT::v4i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
+ setOperationAction(ISD::STORE, MVT::v4f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
+
+ setOperationAction(ISD::STORE, MVT::v8i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
+ setOperationAction(ISD::STORE, MVT::v8f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
+
setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
@@ -686,6 +701,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Expand);
+
if (!Subtarget->hasVOP3PInsts()) {
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
@@ -703,9 +722,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::v8f16, Custom);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::v8f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
+ setOperationAction(ISD::FMINNUM, MVT::v8f16, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::v8f16, Expand);
+
+ for (MVT Vec16 : { MVT::v8i16, MVT::v8f16 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Expand);
+ }
}
if (Subtarget->hasVOP3PInsts()) {
@@ -739,34 +769,42 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
- setOperationAction(ISD::SHL, MVT::v4i16, Custom);
- setOperationAction(ISD::SRA, MVT::v4i16, Custom);
- setOperationAction(ISD::SRL, MVT::v4i16, Custom);
- setOperationAction(ISD::ADD, MVT::v4i16, Custom);
- setOperationAction(ISD::SUB, MVT::v4i16, Custom);
- setOperationAction(ISD::MUL, MVT::v4i16, Custom);
+ for (MVT VT : { MVT::v4i16, MVT::v8i16 }) {
+ // Split vector operations.
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
- setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
- setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
- setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
- setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
+ setOperationAction(ISD::SMIN, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, Custom);
+ setOperationAction(ISD::UMIN, VT, Custom);
+ setOperationAction(ISD::UMAX, VT, Custom);
- setOperationAction(ISD::UADDSAT, MVT::v4i16, Custom);
- setOperationAction(ISD::SADDSAT, MVT::v4i16, Custom);
- setOperationAction(ISD::USUBSAT, MVT::v4i16, Custom);
- setOperationAction(ISD::SSUBSAT, MVT::v4i16, Custom);
+ setOperationAction(ISD::UADDSAT, VT, Custom);
+ setOperationAction(ISD::SADDSAT, VT, Custom);
+ setOperationAction(ISD::USUBSAT, VT, Custom);
+ setOperationAction(ISD::SSUBSAT, VT, Custom);
+ }
- setOperationAction(ISD::FADD, MVT::v4f16, Custom);
- setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
- setOperationAction(ISD::FMA, MVT::v4f16, Custom);
+ for (MVT VT : { MVT::v4f16, MVT::v8f16 }) {
+ // Split vector operations.
+ setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::FMUL, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Custom);
+ setOperationAction(ISD::FCANONICALIZE, VT, Custom);
+ }
setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
- setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
@@ -803,7 +841,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FABS, MVT::v2f16, Custom);
}
- for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
+ for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
+ MVT::v8i16, MVT::v8f16 }) {
setOperationAction(ISD::SELECT, VT, Custom);
}
@@ -2776,6 +2815,7 @@ void SITargetLowering::passSpecialInputs(
SelectionDAG &DAG = CLI.DAG;
const SDLoc &DL = CLI.DL;
+ const Function &F = DAG.getMachineFunction().getFunction();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
@@ -2887,11 +2927,16 @@ void SITargetLowering::passSpecialInputs(
// If incoming ids are not packed we need to pack them.
if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
- NeedWorkItemIDX)
- InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
+ NeedWorkItemIDX) {
+ if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
+ InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
+ } else {
+ InputReg = DAG.getConstant(0, DL, MVT::i32);
+ }
+ }
if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
- NeedWorkItemIDY) {
+ NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
DAG.getShiftAmountConstant(10, MVT::i32, SL));
@@ -2900,7 +2945,7 @@ void SITargetLowering::passSpecialInputs(
}
if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
- NeedWorkItemIDZ) {
+ NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
DAG.getShiftAmountConstant(20, MVT::i32, SL));
@@ -2909,13 +2954,21 @@ void SITargetLowering::passSpecialInputs(
}
if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
- // Workitem ids are already packed, any of present incoming arguments
- // will carry all required fields.
- ArgDescriptor IncomingArg = ArgDescriptor::createArg(
- IncomingArgX ? *IncomingArgX :
- IncomingArgY ? *IncomingArgY :
- *IncomingArgZ, ~0u);
- InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
+ if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
+ // We're in a situation where the outgoing function requires the workitem
+      // ID, but the calling function does not have it (e.g. a graphics function
+ // calling a C calling convention function). This is illegal, but we need
+ // to produce something.
+ InputReg = DAG.getUNDEF(MVT::i32);
+ } else {
+      // Workitem ids are already packed; any present incoming argument will
+      // carry all required fields.
+ ArgDescriptor IncomingArg = ArgDescriptor::createArg(
+ IncomingArgX ? *IncomingArgX :
+ IncomingArgY ? *IncomingArgY :
+ *IncomingArgZ, ~0u);
+ InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
+ }
}
if (OutgoingArg->isRegister()) {
@@ -4600,7 +4653,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
- VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);
+ VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8f32 ||
+ VT == MVT::v16f32 || VT == MVT::v32f32);
SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4621,21 +4675,26 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
- assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
- VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
+ VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v8f32 ||
+ VT == MVT::v16f32 || VT == MVT::v32f32);
SDValue Lo0, Hi0;
- std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
+ SDValue Op0 = Op.getOperand(0);
+ std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
+ ? DAG.SplitVectorOperand(Op.getNode(), 0)
+ : std::make_pair(Op0, Op0);
SDValue Lo1, Hi1;
std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
SDValue Lo2, Hi2;
std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
SDLoc SL(Op);
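+  // Derive the result types from VT: operand 0 may be a scalar (e.g. the
+  // condition of a SELECT), so Lo0/Hi0 cannot be used to infer the half type.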
+ auto ResVT = DAG.GetSplitDestVTs(VT);
- SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2,
+ SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
Op->getFlags());
- SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2,
+ SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
Op->getFlags());
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
@@ -5297,7 +5356,7 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
if (IsIEEEMode)
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
- if (VT == MVT::v4f16)
+ if (VT == MVT::v4f16 || VT == MVT::v8f16)
return splitBinaryVectorOp(Op, DAG);
return Op;
}
@@ -5501,6 +5560,22 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
MachineMemOperand::MOInvariant);
}
+/// Return true if the value is a known valid address, such that a null check is
+/// not necessary.
+static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
+ const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
+ if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
+ isa<BasicBlockSDNode>(Val))
+ return true;
+
+ if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
+ return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
+
+ // TODO: Search through arithmetic, handle arguments and loads
+ // marked nonnull.
+ return false;
+}
+
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SelectionDAG &DAG) const {
SDLoc SL(Op);
@@ -5508,48 +5583,64 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SDValue Src = ASC->getOperand(0);
SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
+ unsigned SrcAS = ASC->getSrcAddressSpace();
const AMDGPUTargetMachine &TM =
static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
// flat -> local/private
- if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+ if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
unsigned DestAS = ASC->getDestAddressSpace();
if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
+ SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+
+ if (isKnownNonNull(Src, DAG, TM, SrcAS))
+ return Ptr;
+
unsigned NullVal = TM.getNullPointerValue(DestAS);
SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
- SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
- return DAG.getNode(ISD::SELECT, SL, MVT::i32,
- NonNull, Ptr, SegmentNullPtr);
+ return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
+ SegmentNullPtr);
}
}
// local/private -> flat
if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
- unsigned SrcAS = ASC->getSrcAddressSpace();
-
if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
+
+ SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
+ SDValue CvtPtr =
+ DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+ CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+
+ if (isKnownNonNull(Src, DAG, TM, SrcAS))
+ return CvtPtr;
+
unsigned NullVal = TM.getNullPointerValue(SrcAS);
SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
SDValue NonNull
= DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
- SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
- SDValue CvtPtr
- = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
-
- return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
- DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
+ return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
FlatNullPtr);
}
}
+ if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
+ Op.getValueType() == MVT::i64) {
+ const SIMachineFunctionInfo *Info =
+ DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
+ SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+ }
+
if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
Src.getValueType() == MVT::i64)
return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
@@ -5676,7 +5767,6 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
EVT VecVT = Vec.getValueType();
unsigned VecSize = VecVT.getSizeInBits();
EVT EltVT = VecVT.getVectorElementType();
- assert(VecSize <= 64);
DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
@@ -5687,6 +5777,28 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;
+ if (VecSize == 128) {
+ SDValue Lo, Hi;
+ EVT LoVT, HiVT;
+ SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
+ Lo =
+ DAG.getBitcast(LoVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64,
+ V2, DAG.getConstant(0, SL, MVT::i32)));
+ Hi =
+ DAG.getBitcast(HiVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64,
+ V2, DAG.getConstant(1, SL, MVT::i32)));
+ EVT IdxVT = Idx.getValueType();
+ unsigned NElem = VecVT.getVectorNumElements();
+ assert(isPowerOf2_32(NElem));
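+    // Pick the 64-bit half that holds the requested element, then extract
+    // within that half using the index masked into its range.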
+ SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
+ SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
+ SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
+ }
+
+ assert(VecSize <= 64);
+
unsigned EltSize = EltVT.getSizeInBits();
assert(isPowerOf2_32(EltSize));
@@ -5769,20 +5881,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
SDLoc SL(Op);
EVT VT = Op.getValueType();
- if (VT == MVT::v4i16 || VT == MVT::v4f16) {
- EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
+ if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8i16 || VT == MVT::v8f16) {
+ EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
+ VT.getVectorNumElements() / 2);
+ MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
// Turn into pair of packed build_vectors.
// TODO: Special case for constants that can be materialized with s_mov_b64.
- SDValue Lo = DAG.getBuildVector(HalfVT, SL,
- { Op.getOperand(0), Op.getOperand(1) });
- SDValue Hi = DAG.getBuildVector(HalfVT, SL,
- { Op.getOperand(2), Op.getOperand(3) });
+ SmallVector<SDValue, 4> LoOps, HiOps;
+ for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
+ LoOps.push_back(Op.getOperand(I));
+ HiOps.push_back(Op.getOperand(I + E));
+ }
+ SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
+ SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
- SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
- SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
+ SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
+ SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
- SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
+ SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
+ { CastLo, CastHi });
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
@@ -6155,10 +6274,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
- const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
- AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
- const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
- AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
@@ -6246,28 +6361,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
SmallVector<SDValue, 4> VAddrs;
- // Optimize _L to _LZ when _L is zero
- if (LZMappingInfo) {
- if (auto *ConstantLod = dyn_cast<ConstantFPSDNode>(
- Op.getOperand(ArgOffset + Intr->LodIndex))) {
- if (ConstantLod->isZero() || ConstantLod->isNegative()) {
- IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
- VAddrEnd--; // remove 'lod'
- }
- }
- }
-
- // Optimize _mip away, when 'lod' is zero
- if (MIPMappingInfo) {
- if (auto *ConstantLod = dyn_cast<ConstantSDNode>(
- Op.getOperand(ArgOffset + Intr->MipIndex))) {
- if (ConstantLod->isZero()) {
- IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
- VAddrEnd--; // remove 'mip'
- }
- }
- }
-
// Check for 16 bit addresses or derivatives and pack if true.
MVT VAddrVT =
Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
@@ -6283,12 +6376,18 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
// Push back extra arguments.
for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
+ assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
// Special handling of bias when A16 is on. Bias is of type half but
// occupies full 32-bit.
- SDValue bias = DAG.getBuildVector( MVT::v2f16, DL, {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
- VAddrs.push_back(bias);
- } else
+ SDValue Bias = DAG.getBuildVector(
+ MVT::v2f16, DL,
+ {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
+ VAddrs.push_back(Bias);
+ } else {
+ assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
+ "Bias needs to be converted to 16 bit in A16 mode");
VAddrs.push_back(Op.getOperand(ArgOffset + I));
+ }
}
if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
@@ -6731,14 +6830,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
case Intrinsic::amdgcn_workitem_id_x:
+ if (Subtarget->getMaxWorkitemID(MF.getFunction(), 0) == 0)
+ return DAG.getConstant(0, DL, MVT::i32);
+
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDX);
case Intrinsic::amdgcn_workitem_id_y:
+ if (Subtarget->getMaxWorkitemID(MF.getFunction(), 1) == 0)
+ return DAG.getConstant(0, DL, MVT::i32);
+
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDY);
case Intrinsic::amdgcn_workitem_id_z:
+ if (Subtarget->getMaxWorkitemID(MF.getFunction(), 2) == 0)
+ return DAG.getConstant(0, DL, MVT::i32);
+
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDZ);
@@ -6899,9 +7007,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getConstant(1, SL, MVT::i32));
return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
}
- case Intrinsic::amdgcn_alignbit:
- return DAG.getNode(ISD::FSHR, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_perm:
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
@@ -8408,21 +8513,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
+ if (VT.getSizeInBits() == 128)
+ return splitTernaryVectorOp(Op, DAG);
+
assert(VT.getSizeInBits() == 64);
SDLoc DL(Op);
SDValue Cond = Op.getOperand(0);
- if (Subtarget->hasScalarCompareEq64() && Op->getOperand(0)->hasOneUse() &&
- !Op->isDivergent()) {
- if (VT == MVT::i64)
- return Op;
- SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(1));
- SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(2));
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getSelect(DL, MVT::i64, Cond, LHS, RHS));
- }
-
SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
SDValue One = DAG.getConstant(1, DL, MVT::i32);
@@ -9550,6 +9648,9 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
SDValue SITargetLowering::performXorCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
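+  // Reassociate first so the uniform part of the expression can stay scalar.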
+ if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
+ return RV;
+
EVT VT = N->getValueType(0);
if (VT != MVT::i64)
return SDValue();
@@ -10462,6 +10563,9 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
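+  // Leave base-plus-constant-offset expressions alone; they are more useful
+  // folded into memory addressing modes than reassociated.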
+ if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
+ return SDValue();
+
unsigned Opc = N->getOpcode();
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -10483,12 +10587,6 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
if (Op1->isDivergent())
std::swap(Op1, Op2);
- // If either operand is constant this will conflict with
- // DAGCombiner::ReassociateOps().
- if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
- DAG.isConstantIntBuildVectorOrConstantInt(Op1))
- return SDValue();
-
SDLoc SL(N);
SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
return DAG.getNode(Opc, SL, VT, Add1, Op2);
@@ -11130,7 +11228,9 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
- Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
+ Node->getConstantOperandVal(LWEIdx))
+ ? true
+ : false;
unsigned TFCLane = 0;
bool HasChain = Node->getNumValues() > 1;
@@ -11719,25 +11819,51 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
return std::make_pair(0U, RC);
}
- if (Constraint.size() > 1) {
- if (Constraint[1] == 'v') {
+ if (Constraint.startswith("{") && Constraint.endswith("}")) {
+ StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
+ if (RegName.consume_front("v")) {
RC = &AMDGPU::VGPR_32RegClass;
- } else if (Constraint[1] == 's') {
+ } else if (RegName.consume_front("s")) {
RC = &AMDGPU::SGPR_32RegClass;
- } else if (Constraint[1] == 'a') {
+ } else if (RegName.consume_front("a")) {
RC = &AMDGPU::AGPR_32RegClass;
}
if (RC) {
uint32_t Idx;
- bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
- if (!Failed && Idx < RC->getNumRegs())
- return std::make_pair(RC->getRegister(Idx), RC);
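+      // A constraint like "{v[8:11]}" names a register range; parse the
+      // bounds and pick a register class wide enough for the whole tuple.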
+ if (RegName.consume_front("[")) {
+ uint32_t End;
+ bool Failed = RegName.consumeInteger(10, Idx);
+ Failed |= !RegName.consume_front(":");
+ Failed |= RegName.consumeInteger(10, End);
+ Failed |= !RegName.consume_back("]");
+ if (!Failed) {
+ uint32_t Width = (End - Idx + 1) * 32;
+ MCRegister Reg = RC->getRegister(Idx);
+ if (SIRegisterInfo::isVGPRClass(RC))
+ RC = TRI->getVGPRClassForBitWidth(Width);
+ else if (SIRegisterInfo::isSGPRClass(RC))
+ RC = TRI->getSGPRClassForBitWidth(Width);
+ else if (SIRegisterInfo::isAGPRClass(RC))
+ RC = TRI->getAGPRClassForBitWidth(Width);
+ if (RC) {
+ Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
+ return std::make_pair(Reg, RC);
+ }
+ }
+ } else {
+ bool Failed = RegName.getAsInteger(10, Idx);
+ if (!Failed && Idx < RC->getNumRegs())
+ return std::make_pair(RC->getRegister(Idx), RC);
+ }
}
}
- // FIXME: Returns VS_32 for physical SGPR constraints
- return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ if (Ret.first)
+ Ret.second = TRI->getPhysRegClass(Ret.first);
+
+ return Ret;
}
static bool isImmConstraint(StringRef Constraint) {
@@ -11975,13 +12101,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
}
TargetLoweringBase::finalizeLowering(MF);
-
- // Allocate a VGPR for future SGPR Spill if
- // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used
- // FIXME: We won't need this hack if we split SGPR allocation from VGPR
- if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() &&
- !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction())
- Info->reserveVGPRforSGPRSpills(MF);
}
void SITargetLowering::computeKnownBitsForFrameIndex(
@@ -12441,17 +12560,10 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
for (auto &TC : TargetConstraints) {
if (TC.Type == InlineAsm::isOutput) {
ComputeConstraintToUse(TC, SDValue());
- unsigned AssignedReg;
- const TargetRegisterClass *RC;
- std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
- SIRI, TC.ConstraintCode, TC.ConstraintVT);
- if (RC) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
- return true;
- else if (SIRI->isSGPRClass(RC))
- return true;
- }
+ const TargetRegisterClass *RC = getRegForInlineAsmConstraint(
+ SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
+ if (RC && SIRI->isSGPRClass(RC))
+ return true;
}
}
}
@@ -12475,3 +12587,27 @@ SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
Cost.first += (Size + 255) / 256;
return Cost;
}
+
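+// Returns true if some user of N is a memory node using N as its base
+// pointer operand.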
+bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
+ SDNode::use_iterator I = N->use_begin(), E = N->use_end();
+ for (; I != E; ++I) {
+ if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
+ if (getBasePtrIndex(M) == I.getOperandNo())
+ return true;
+ }
+ }
+ return false;
+}
+
+bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
+ SDValue N1) const {
+ if (!N0.hasOneUse())
+ return false;
+  // Take care of the opportunity to keep N0 uniform
+ if (N0->isDivergent() || !N1->isDivergent())
+ return true;
+ // Check if we have a good chance to form the memory access pattern with the
+ // base and offset
+ return (DAG.isBaseWithConstantOffset(N0) &&
+ hasMemSDNodeUser(*N0->use_begin()));
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1315cc15dd02..bf81e082b478 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -449,6 +449,11 @@ public:
bool isSDNodeSourceOfDivergence(const SDNode *N,
FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override;
+ bool hasMemSDNodeUser(SDNode *N) const;
+
+ bool isReassocProfitable(SelectionDAG &DAG, SDValue N0,
+ SDValue N1) const override;
+
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
unsigned MaxDepth = 5) const;
bool isCanonicalized(Register Reg, MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6fbe5d45ce0a..f8a10bc8ef6f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -863,7 +863,7 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
Wait.ExpCnt = ~0u;
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI << "New Instr: " << *WaitcntInstr
+ << "Old Instr: " << *MI << "New Instr: " << *WaitcntInstr
<< '\n');
} else {
WaitcntInstr->eraseFromParent();
@@ -886,7 +886,7 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
Wait.VsCnt = ~0u;
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI
+ << "Old Instr: " << *MI
<< "New Instr: " << *WaitcntVsCntInstr << '\n');
} else {
WaitcntVsCntInstr->eraseFromParent();
@@ -1382,7 +1382,6 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
for (auto T : inst_counter_types()) {
// Merge event flags for this counter
- const bool OldOutOfOrder = counterOutOfOrder(T);
const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
if (OtherEvents & ~OldEvents)
@@ -1425,7 +1424,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
}
}
- if (RegStrictDom && !OldOutOfOrder)
+ if (RegStrictDom)
StrictDom = true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1755b93538ce..0a2f9381e71f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -130,10 +130,24 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
return false;
}
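+// Returns true if the result of the instruction depends on the value of exec
+// itself (e.g. compares and readfirstlane), not just on which lanes execute.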
+static bool readsExecAsData(const MachineInstr &MI) {
+ if (MI.isCompare())
+ return true;
+
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case AMDGPU::V_READFIRSTLANE_B32:
+ return true;
+ }
+
+ return false;
+}
+
bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
// Any implicit use of exec by VALU is not a real register read.
return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
- isVALU(*MO.getParent());
+ isVALU(*MO.getParent()) && !readsExecAsData(*MO.getParent());
}
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
@@ -3184,10 +3198,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
+ int NewMFMAOpc = -1;
switch (Opc) {
default:
- return nullptr;
+ NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
+ if (NewMFMAOpc == -1)
+ return nullptr;
+ break;
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F16_e64:
IsF16 = true;
@@ -3216,6 +3234,19 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
}
}
+ MachineInstrBuilder MIB;
+ MachineBasicBlock &MBB = *MI.getParent();
+
+ if (NewMFMAOpc != -1) {
+ MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
+ updateLiveVariables(LV, MI, *MIB);
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *MIB);
+ return MIB;
+ }
+
const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
const MachineOperand *Src0Mods =
@@ -3226,8 +3257,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
- MachineInstrBuilder MIB;
- MachineBasicBlock &MBB = *MI.getParent();
if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 &&
// If we have an SGPR input, we will violate the constant bus restriction.
@@ -4520,6 +4549,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
+ const MachineOperand &SrcOp = MI.getOperand(1);
+ if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
+ ErrInfo = "pseudo expects only physical SGPRs";
+ return false;
+ }
+ }
+
return true;
}
@@ -6122,11 +6159,8 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
continue;
case AMDGPU::S_CSELECT_B32:
- lowerSelect32(Worklist, Inst, MDT);
- Inst.eraseFromParent();
- continue;
case AMDGPU::S_CSELECT_B64:
- splitSelect64(Worklist, Inst, MDT);
+ lowerSelect(Worklist, Inst, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_CMP_EQ_I32:
@@ -6304,8 +6338,8 @@ SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
return std::make_pair(false, nullptr);
}
-void SIInstrInfo::lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT) const {
+void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6380,95 +6414,6 @@ void SIInstrInfo::lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::splitSelect64(SetVectorType &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT) const {
- // Split S_CSELECT_B64 into a pair of S_CSELECT_B32 and lower them
- // further.
- const DebugLoc &DL = Inst.getDebugLoc();
- MachineBasicBlock::iterator MII = Inst;
- MachineBasicBlock &MBB = *Inst.getParent();
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-
- // Get the original operands.
- MachineOperand &Dest = Inst.getOperand(0);
- MachineOperand &Src0 = Inst.getOperand(1);
- MachineOperand &Src1 = Inst.getOperand(2);
- MachineOperand &Cond = Inst.getOperand(3);
-
- Register SCCSource = Cond.getReg();
- bool IsSCC = (SCCSource == AMDGPU::SCC);
-
- // If this is a trivial select where the condition is effectively not SCC
- // (SCCSource is a source of copy to SCC), then the select is semantically
- // equivalent to copying SCCSource. Hence, there is no need to create
- // V_CNDMASK, we can just use that and bail out.
- if (!IsSCC && (Src0.isImm() && Src0.getImm() == -1) &&
- (Src1.isImm() && Src1.getImm() == 0)) {
- MRI.replaceRegWith(Dest.getReg(), SCCSource);
- return;
- }
-
- // Prepare the split destination.
- Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
- Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- // Split the source operands.
- const TargetRegisterClass *Src0RC = nullptr;
- const TargetRegisterClass *Src0SubRC = nullptr;
- if (Src0.isReg()) {
- Src0RC = MRI.getRegClass(Src0.getReg());
- Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
- }
- const TargetRegisterClass *Src1RC = nullptr;
- const TargetRegisterClass *Src1SubRC = nullptr;
- if (Src1.isReg()) {
- Src1RC = MRI.getRegClass(Src1.getReg());
- Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
- }
- // Split lo.
- MachineOperand SrcReg0Sub0 =
- buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
- MachineOperand SrcReg1Sub0 =
- buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
- // Split hi.
- MachineOperand SrcReg0Sub1 =
- buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
- MachineOperand SrcReg1Sub1 =
- buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
- // Select the lo part.
- MachineInstr *LoHalf =
- BuildMI(MBB, MII, DL, get(AMDGPU::S_CSELECT_B32), DestSub0)
- .add(SrcReg0Sub0)
- .add(SrcReg1Sub0);
- // Replace the condition operand with the original one.
- LoHalf->getOperand(3).setReg(SCCSource);
- Worklist.insert(LoHalf);
- // Select the hi part.
- MachineInstr *HiHalf =
- BuildMI(MBB, MII, DL, get(AMDGPU::S_CSELECT_B32), DestSub1)
- .add(SrcReg0Sub1)
- .add(SrcReg1Sub1);
- // Replace the condition operand with the original one.
- HiHalf->getOperand(3).setReg(SCCSource);
- Worklist.insert(HiHalf);
- // Merge them back to the original 64-bit one.
- BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
- .addReg(DestSub0)
- .addImm(AMDGPU::sub0)
- .addReg(DestSub1)
- .addImm(AMDGPU::sub1);
- MRI.replaceRegWith(Dest.getReg(), FullDestReg);
-
- // Try to legalize the operands in case we need to swap the order to keep
- // it valid.
- legalizeOperands(*LoHalf, MDT);
- legalizeOperands(*HiHalf, MDT);
-
- // Move all users of this moved value.
- addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
-}
-
void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -7820,6 +7765,12 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
}
}
+ if (isMAI(Opcode)) {
+ int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
+ if (MFMAOp != -1)
+ Opcode = MFMAOp;
+ }
+
int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
// -1 means that Opcode is already a native instruction.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index dd9ea2b53ca2..e551d6c7223f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -78,11 +78,8 @@ private:
moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
- void lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT = nullptr) const;
-
- void splitSelect64(SetVectorType &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT = nullptr) const;
+ void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT = nullptr) const;
void lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const;
@@ -1249,6 +1246,10 @@ namespace AMDGPU {
LLVM_READONLY
int getFlatScratchInstSVfromSS(uint16_t Opcode);
+  /// \returns the earlyclobber version of a MAC MFMA if it exists.
+ LLVM_READONLY
+ int getMFMAEarlyClobberOp(uint16_t Opcode);
+
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index dda92d3d25ff..713a08907e99 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2588,6 +2588,14 @@ def getFlatScratchInstSVfromSS : InstrMapping {
let ValueCols = [["SV"]];
}
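+// Maps a MAC-form MFMA opcode to its equivalent earlyclobber (non-MAC) form.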
+def getMFMAEarlyClobberOp : InstrMapping {
+ let FilterClass = "MFMATable";
+ let RowFields = ["FMAOp"];
+ let ColFields = ["IsMac"];
+ let KeyCol = ["1"];
+ let ValueCols = [["0"]];
+}
+
include "SIInstructions.td"
include "DSInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 636337ede000..7be63ae6964b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1011,7 +1011,7 @@ def : GCNPat <
}
def : GCNPat <
- (i32 (ctpop i32:$popcnt)),
+ (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)),
(V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0))
>;
@@ -1020,6 +1020,14 @@ def : GCNPat <
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
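+// 64-bit ctpop: V_BCNT adds the popcount of its first operand to its second,
+// so chain the two halves and leave the upper 32 bits of the result zero.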
+def : GCNPat <
+ (i64 (DivergentUnaryFrag<ctpop> i64:$src)),
+ (REG_SEQUENCE VReg_64,
+ (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub1)),
+ (i32 (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0)))), sub0,
+ (i32 (V_MOV_B32_e32 (i32 0))), sub1)
+>;
+
/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting **********/
/********** ============================================ **********/
@@ -1184,6 +1192,26 @@ def : Pat <
(v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
>;
+def : Pat <
+ (extract_subvector v8i16:$vec, (i32 0)),
+ (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub0_sub1))
+>;
+
+def : Pat <
+ (extract_subvector v8i16:$vec, (i32 4)),
+ (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub2_sub3))
+>;
+
+def : Pat <
+ (extract_subvector v8f16:$vec, (i32 0)),
+ (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub0_sub1))
+>;
+
+def : Pat <
+ (extract_subvector v8f16:$vec, (i32 4)),
+ (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3))
+>;
+
foreach Index = 0-31 in {
def Extract_Element_v32i32_#Index : Extract_Element <
i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -1279,6 +1307,26 @@ def : BitConvert <v2i64, v2f64, VReg_128>;
def : BitConvert <v2f64, v2i64, VReg_128>;
def : BitConvert <v4f32, v2i64, VReg_128>;
def : BitConvert <v2i64, v4f32, VReg_128>;
+def : BitConvert <v8i16, v4i32, SReg_128>;
+def : BitConvert <v4i32, v8i16, SReg_128>;
+def : BitConvert <v8f16, v4f32, VReg_128>;
+def : BitConvert <v8f16, v4i32, VReg_128>;
+def : BitConvert <v4f32, v8f16, VReg_128>;
+def : BitConvert <v4i32, v8f16, VReg_128>;
+def : BitConvert <v8i16, v8f16, VReg_128>;
+def : BitConvert <v8f16, v8i16, VReg_128>;
+def : BitConvert <v4f32, v8i16, VReg_128>;
+def : BitConvert <v8i16, v4f32, VReg_128>;
+def : BitConvert <v8i16, v8f16, SReg_128>;
+def : BitConvert <v8i16, v2i64, SReg_128>;
+def : BitConvert <v8i16, v2f64, SReg_128>;
+def : BitConvert <v8f16, v2i64, SReg_128>;
+def : BitConvert <v8f16, v2f64, SReg_128>;
+def : BitConvert <v8f16, v8i16, SReg_128>;
+def : BitConvert <v2i64, v8i16, SReg_128>;
+def : BitConvert <v2f64, v8i16, SReg_128>;
+def : BitConvert <v2i64, v8f16, SReg_128>;
+def : BitConvert <v2f64, v8f16, SReg_128>;
// 160-bit bitcast
def : BitConvert <v5i32, v5f32, SReg_160>;
@@ -1762,44 +1810,44 @@ def BFIImm32 : PatFrag<
// (y & x) | (z & ~x)
def : AMDGPUPat <
(DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
- (V_BFI_B32_e64 $x, $y, $z)
+ (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
>;
// (y & C) | (z & ~C)
def : AMDGPUPat <
(BFIImm32 i32:$x, i32:$y, i32:$z),
- (V_BFI_B32_e64 $x, $y, $z)
+ (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
>;
// 64-bit version
def : AMDGPUPat <
(DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
- (REG_SEQUENCE SReg_64,
- (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)),
- (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0,
- (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)),
- (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1)
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
// SHA-256 Ch function
// z ^ (x & (y ^ z))
def : AMDGPUPat <
(DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
- (V_BFI_B32_e64 $x, $y, $z)
+ (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
>;
// 64-bit version
def : AMDGPUPat <
(DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
- (REG_SEQUENCE SReg_64,
- (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)),
- (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0,
- (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)),
- (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1)
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
def : AMDGPUPat <
@@ -2725,21 +2773,21 @@ def : AMDGPUPat <
def : AMDGPUPat <
(DivergentBinFrag<or> (and i32:$x, i32:$z),
(and i32:$y, (or i32:$x, i32:$z))),
- (V_BFI_B32_e64 (V_XOR_B32_e64 i32:$x, i32:$y), i32:$z, i32:$y)
+ (V_BFI_B32_e64 (V_XOR_B32_e64 VSrc_b32:$x, VSrc_b32:$y), VSrc_b32:$z, VSrc_b32:$y)
>;
def : AMDGPUPat <
(DivergentBinFrag<or> (and i64:$x, i64:$z),
(and i64:$y, (or i64:$x, i64:$z))),
- (REG_SEQUENCE SReg_64,
- (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))),
- (i32 (EXTRACT_SUBREG SReg_64:$z, sub0)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))), sub0,
- (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))),
- (i32 (EXTRACT_SUBREG SReg_64:$z, sub1)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))), sub1)
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), sub0,
+ (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), sub1)
>;
multiclass IntMed3Pat<Instruction med3Inst,
@@ -2825,6 +2873,15 @@ class AMDGPUGenericInstruction : GenericInstruction {
let Namespace = "AMDGPU";
}
+// Convert a wave address to a swizzled vector address (i.e. this is
+// for copying the stack pointer to a vector address appropriate to
+// use in the offset field of mubuf instructions).
+def G_AMDGPU_WAVE_ADDRESS : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
// Returns -1 if the input is zero.
def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
@@ -3027,6 +3084,16 @@ def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
let mayStore = 1;
}
+def G_AMDGPU_INTRIN_IMAGE_LOAD_D16 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins unknown:$intrin, variable_ops);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+
+ // FIXME: Use separate opcode for atomics.
+ let mayStore = 1;
+}
+
// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
@@ -3036,6 +3103,13 @@ def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
let mayStore = 1;
}
+def G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins unknown:$intrin, variable_ops);
+ let hasSideEffects = 0;
+ let mayStore = 1;
+}
+
def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins unknown:$intrin, variable_ops);
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index f4d9002e930e..c18637bdbc43 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -105,6 +105,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
unsigned DMask;
InstClassEnum InstClass;
unsigned CPol = 0;
+ bool IsAGPR;
bool UseST64;
int AddrIdx[MaxAddressRegs];
const MachineOperand *AddrReg[MaxAddressRegs];
@@ -158,8 +159,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
return true;
}
- void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
- const GCNSubtarget &STM);
+ void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
};
struct BaseRegisters {
@@ -484,15 +484,16 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
}
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
- const SIInstrInfo &TII,
- const GCNSubtarget &STM) {
+ const SILoadStoreOptimizer &LSO) {
I = MI;
unsigned Opc = MI->getOpcode();
- InstClass = getInstClass(Opc, TII);
+ InstClass = getInstClass(Opc, *LSO.TII);
if (InstClass == UNKNOWN)
return;
+ IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
+
switch (InstClass) {
case DS_READ:
EltSize =
@@ -505,7 +506,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
: 4;
break;
case S_BUFFER_LOAD_IMM:
- EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4);
+ EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
break;
default:
EltSize = 4;
@@ -513,7 +514,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
}
if (InstClass == MIMG) {
- DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
+ DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
// Offset is not considered for MIMG instructions.
Offset = 0;
} else {
@@ -522,17 +523,17 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
}
if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
- Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
+ Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
- Width = getOpcodeWidth(*I, TII);
+ Width = getOpcodeWidth(*I, *LSO.TII);
if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
Offset &= 0xffff;
} else if (InstClass != MIMG) {
- CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
+ CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
}
- AddressRegs Regs = getRegs(Opc, TII);
+ AddressRegs Regs = getRegs(Opc, *LSO.TII);
NumAddresses = 0;
for (unsigned J = 0; J < Regs.NumVAddrs; J++)
@@ -910,19 +911,10 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
}
const unsigned InstSubclass = getInstSubclass(Opc, *TII);
- // Do not merge VMEM buffer instructions with "swizzled" bit set.
- int Swizzled =
- AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
- if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
- return false;
-
DenseSet<Register> RegDefsToMove;
DenseSet<Register> PhysRegUsesToMove;
addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
- const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
- bool IsAGPR = TRI->hasAGPRs(DataRC);
-
MachineBasicBlock::iterator E = std::next(Paired.I);
MachineBasicBlock::iterator MBBI = std::next(CI.I);
MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
@@ -971,15 +963,6 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
continue;
}
- // Don't merge volatiles.
- if (MBBI->hasOrderedMemoryRef())
- return false;
-
- int Swizzled =
- AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
- if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
- return false;
-
// Handle a case like
// DS_WRITE_B32 addr, v, idx0
// w = DS_READ_B32 addr, idx0
@@ -991,17 +974,6 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
continue;
if (&*MBBI == &*Paired.I) {
- if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
- return false;
- // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
- // operands. However we are reporting that ds_write2 shall have
- // only VGPR data so that machine copy propagation does not
- // create an illegal instruction with a VGPR and AGPR sources.
- // Consequenctially if we create such instruction the verifier
- // will complain.
- if (IsAGPR && CI.InstClass == DS_WRITE)
- return false;
-
// We need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
@@ -1542,49 +1514,36 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
const CombineInfo &Paired) {
-
- assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero");
-
bool ReverseOrder;
if (CI.InstClass == MIMG) {
assert(
(countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
"No overlaps");
ReverseOrder = CI.DMask > Paired.DMask;
- } else
+ } else {
ReverseOrder = CI.Offset > Paired.Offset;
+ }
unsigned Idx0;
unsigned Idx1;
- if (CI.Width + Paired.Width > 4) {
- assert(CI.Width == 4 && Paired.Width == 4);
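+  // Idxs[Start][Width - 1] is the subregister index covering Width 32-bit
+  // registers starting at register Start.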
+ static const unsigned Idxs[5][4] = {
+ {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
+ {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
+ {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
+ {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
+ {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
+ };
- if (ReverseOrder) {
- Idx1 = AMDGPU::sub0_sub1_sub2_sub3;
- Idx0 = AMDGPU::sub4_sub5_sub6_sub7;
- } else {
- Idx0 = AMDGPU::sub0_sub1_sub2_sub3;
- Idx1 = AMDGPU::sub4_sub5_sub6_sub7;
- }
+ assert(CI.Width >= 1 && CI.Width <= 4);
+ assert(Paired.Width >= 1 && Paired.Width <= 4);
+
+ if (ReverseOrder) {
+ Idx1 = Idxs[0][Paired.Width - 1];
+ Idx0 = Idxs[Paired.Width][CI.Width - 1];
} else {
- static const unsigned Idxs[4][4] = {
- {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
- {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
- {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
- {AMDGPU::sub3, 0, 0, 0},
- };
-
- assert(CI.Width >= 1 && CI.Width <= 3);
- assert(Paired.Width >= 1 && Paired.Width <= 3);
-
- if (ReverseOrder) {
- Idx1 = Idxs[0][Paired.Width - 1];
- Idx0 = Idxs[Paired.Width][CI.Width - 1];
- } else {
- Idx0 = Idxs[0][CI.Width - 1];
- Idx1 = Idxs[CI.Width][Paired.Width - 1];
- }
+ Idx0 = Idxs[0][CI.Width - 1];
+ Idx1 = Idxs[CI.Width][Paired.Width - 1];
}
return std::make_pair(Idx0, Idx1);
@@ -1847,7 +1806,8 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
return false;
- if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
+ if (MI.mayLoad() &&
+ TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
return false;
if (AnchorList.count(&MI))
@@ -1988,6 +1948,7 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
std::list<std::list<CombineInfo> > &MergeableInsts) const {
for (std::list<CombineInfo> &AddrList : MergeableInsts) {
if (AddrList.front().InstClass == CI.InstClass &&
+ AddrList.front().IsAGPR == CI.IsAGPR &&
AddrList.front().hasSameBaseAddress(*CI.I)) {
AddrList.emplace_back(CI);
return;
@@ -2030,13 +1991,29 @@ SILoadStoreOptimizer::collectMergeableInsts(
if (InstClass == UNKNOWN)
continue;
+ // Do not merge VMEM buffer instructions with "swizzled" bit set.
+ int Swizzled =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
+ if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
+ continue;
+
CombineInfo CI;
- CI.setMI(MI, *TII, *STM);
+ CI.setMI(MI, *this);
CI.Order = Order++;
if (!CI.hasMergeableAddress(*MRI))
continue;
+ if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
+ // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
+ // operands. However we are reporting that ds_write2 shall have
+ // only VGPR data so that machine copy propagation does not
+      //        create an illegal instruction with VGPR and AGPR sources.
+      //        Consequently, if we create such an instruction the verifier
+ // will complain.
+ continue;
+ }
+
LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
addInstToMergeableList(CI, MergeableInsts);
@@ -2144,54 +2121,54 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
case DS_READ: {
MachineBasicBlock::iterator NewMI =
mergeRead2Pair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
break;
}
case DS_WRITE: {
MachineBasicBlock::iterator NewMI =
mergeWrite2Pair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
break;
}
case S_BUFFER_LOAD_IMM: {
MachineBasicBlock::iterator NewMI =
mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
break;
}
case BUFFER_LOAD: {
MachineBasicBlock::iterator NewMI =
mergeBufferLoadPair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case BUFFER_STORE: {
MachineBasicBlock::iterator NewMI =
mergeBufferStorePair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case MIMG: {
MachineBasicBlock::iterator NewMI =
mergeImagePair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case TBUFFER_LOAD: {
MachineBasicBlock::iterator NewMI =
mergeTBufferLoadPair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case TBUFFER_STORE: {
MachineBasicBlock::iterator NewMI =
mergeTBufferStorePair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 3168bcd53eda..e1018bdfde46 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -56,6 +56,7 @@
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -90,6 +91,8 @@ private:
unsigned OrSaveExecOpc;
unsigned Exec;
+ bool EnableOptimizeEndCf = false;
+
bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);
void emitIf(MachineInstr &MI);
@@ -579,10 +582,10 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
void SILowerControlFlow::optimizeEndCf() {
// If the only instruction immediately following this END_CF is an another
// END_CF in the only successor we can avoid emitting exec mask restore here.
- if (!RemoveRedundantEndcf)
+ if (!EnableOptimizeEndCf)
return;
- for (MachineInstr *MI : LoweredEndCf) {
+ for (MachineInstr *MI : reverse(LoweredEndCf)) {
MachineBasicBlock &MBB = *MI->getParent();
auto Next =
skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator()));
@@ -807,6 +810,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
+ EnableOptimizeEndCf =
+ RemoveRedundantEndcf && MF.getTarget().getOptLevel() > CodeGenOpt::None;
// This doesn't actually need LiveIntervals, but we can preserve them.
LIS = getAnalysisIfAvailable<LiveIntervals>();
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 55196fe334e6..0fbdbef6fcce 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -127,7 +127,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
// FIXME: Just emit the readlane/writelane directly
if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
for (const CalleeSavedInfo &CI : reverse(CSI)) {
- unsigned Reg = CI.getReg();
+ Register Reg = CI.getReg();
const TargetRegisterClass *RC =
TRI->getMinimalPhysRegClass(Reg, MVT::i32);
@@ -239,50 +239,6 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
return false;
}
-// Find lowest available VGPR and use it as VGPR reserved for SGPR spills.
-static bool lowerShiftReservedVGPR(MachineFunction &MF,
- const GCNSubtarget &ST) {
- SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- const Register PreReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill;
- // Early out if pre-reservation of a VGPR for SGPR spilling is disabled.
- if (!PreReservedVGPR)
- return false;
-
- // If there are no free lower VGPRs available, default to using the
- // pre-reserved register instead.
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- Register LowestAvailableVGPR =
- TRI->findUnusedRegister(MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF);
- if (!LowestAvailableVGPR)
- LowestAvailableVGPR = PreReservedVGPR;
-
- MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- // Create a stack object for a possible spill in the function prologue.
- // Note Non-CSR VGPR also need this as we may overwrite inactive lanes.
- Optional<int> FI = FrameInfo.CreateSpillStackObject(4, Align(4));
-
- // Find saved info about the pre-reserved register.
- const auto *ReservedVGPRInfoItr =
- llvm::find_if(FuncInfo->getSGPRSpillVGPRs(),
- [PreReservedVGPR](const auto &SpillRegInfo) {
- return SpillRegInfo.VGPR == PreReservedVGPR;
- });
-
- assert(ReservedVGPRInfoItr != FuncInfo->getSGPRSpillVGPRs().end());
- auto Index =
- std::distance(FuncInfo->getSGPRSpillVGPRs().begin(), ReservedVGPRInfoItr);
-
- FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, Index);
-
- for (MachineBasicBlock &MBB : MF) {
- assert(LowestAvailableVGPR.isValid() && "Did not find an available VGPR");
- MBB.addLiveIn(LowestAvailableVGPR);
- MBB.sortUniqueLiveIns();
- }
-
- return true;
-}
-
bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
@@ -304,11 +260,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
if (!MFI.hasStackObjects() && !HasCSRs) {
SaveBlocks.clear();
RestoreBlocks.clear();
- if (FuncInfo->VGPRReservedForSGPRSpill) {
- // Free the reserved VGPR for later possible use by frame lowering.
- FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
- MRI.freezeReservedRegs(MF);
- }
return false;
}
@@ -326,8 +277,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
// This operates under the assumption that only other SGPR spills are users
// of the frame index.
- lowerShiftReservedVGPR(MF, ST);
-
// To track the spill frame indices handled in this pass.
BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
@@ -375,8 +324,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
FuncInfo->removeDeadFrameIndices(MFI);
MadeChange = true;
- } else if (FuncInfo->VGPRReservedForSGPRSpill) {
- FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
}
SaveBlocks.clear();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 3ce368ef4db9..cca8565c9ff9 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -118,10 +118,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
WorkItemIDX = true;
- if (!F.hasFnAttribute("amdgpu-no-workitem-id-y"))
+ if (!F.hasFnAttribute("amdgpu-no-workitem-id-y") &&
+ ST.getMaxWorkitemID(F, 1) != 0)
WorkItemIDY = true;
- if (!F.hasFnAttribute("amdgpu-no-workitem-id-z"))
+ if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") &&
+ ST.getMaxWorkitemID(F, 2) != 0)
WorkItemIDZ = true;
if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
@@ -274,7 +276,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned WaveSize = ST.getWavefrontSize();
- SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
unsigned Size = FrameInfo.getObjectSize(FI);
unsigned NumLanes = Size / 4;
@@ -291,16 +292,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
Register LaneVGPR;
unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);
- // Reserve a VGPR (when NumVGPRSpillLanes = 0, WaveSize, 2*WaveSize, ..) and
- // when one of the two conditions is true:
- // 1. One reserved VGPR being tracked by VGPRReservedForSGPRSpill is not yet
- // reserved.
- // 2. All spill lanes of reserved VGPR(s) are full and another spill lane is
- // required.
- if (FuncInfo->VGPRReservedForSGPRSpill && NumVGPRSpillLanes < WaveSize) {
- assert(FuncInfo->VGPRReservedForSGPRSpill == SpillVGPRs.back().VGPR);
- LaneVGPR = FuncInfo->VGPRReservedForSGPRSpill;
- } else if (VGPRIndex == 0) {
+ if (VGPRIndex == 0) {
LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
if (LaneVGPR == AMDGPU::NoRegister) {
// We have no VGPRs left for spilling SGPRs. Reset because we will not
@@ -308,6 +300,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
SGPRToVGPRSpills.erase(FI);
NumVGPRSpillLanes -= I;
+ // FIXME: We can run out of free registers with split allocation if
+ // IPRA is enabled and a called function already uses every VGPR.
#if 0
DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(),
"VGPRs for SGPR spilling",
@@ -340,21 +334,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
return true;
}
-/// Reserve a VGPR for spilling of SGPRs
-bool SIMachineFunctionInfo::reserveVGPRforSGPRSpills(MachineFunction &MF) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-
- Register LaneVGPR = TRI->findUnusedRegister(
- MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF, true);
- if (LaneVGPR == Register())
- return false;
- SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, None));
- FuncInfo->VGPRReservedForSGPRSpill = LaneVGPR;
- return true;
-}
-
/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either AGPR is spilled to VGPR or vice versa.
/// Returns true if a \p FI can be eliminated completely.
@@ -616,24 +595,6 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
return false;
}
-// Remove VGPR which was reserved for SGPR spills if there are no spilled SGPRs
-bool SIMachineFunctionInfo::removeVGPRForSGPRSpill(Register ReservedVGPR,
- MachineFunction &MF) {
- for (auto *i = SpillVGPRs.begin(); i < SpillVGPRs.end(); i++) {
- if (i->VGPR == ReservedVGPR) {
- SpillVGPRs.erase(i);
-
- for (MachineBasicBlock &MBB : MF) {
- MBB.removeLiveIn(ReservedVGPR);
- MBB.sortUniqueLiveIns();
- }
- this->VGPRReservedForSGPRSpill = AMDGPU::NoRegister;
- return true;
- }
- }
- return false;
-}
-
bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
if (UsesAGPRs)
return *UsesAGPRs;
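
As a sketch of the lane-allocation arithmetic in allocateSGPRSpillToVGPR(), assuming each spill VGPR provides WaveSize 32-bit lanes; SpillSlot and allocateSpillLanes() are illustrative names, not part of the patch:

#include <vector>

// Hypothetical stand-in for SGPRSpillVGPR bookkeeping: which spill VGPR and
// which lane inside it a 4-byte slice of the spilled SGPR lands in.
struct SpillSlot {
  unsigned VGPRPoolIndex;
  unsigned Lane;
};

// Each spill VGPR holds WaveSize lanes. A fresh VGPR is claimed exactly when
// the running lane counter wraps (VGPRIndex == 0 in the code above); the
// increment of NumSpillVGPRs stands in for the findUnusedRegister() call.
// Both counters persist across calls, as NumVGPRSpillLanes does in the pass.
std::vector<SpillSlot> allocateSpillLanes(unsigned NumLanes, unsigned WaveSize,
                                          unsigned &NumVGPRSpillLanes,
                                          unsigned &NumSpillVGPRs) {
  std::vector<SpillSlot> Slots;
  for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
    unsigned VGPRIndex = NumVGPRSpillLanes % WaveSize;
    if (VGPRIndex == 0)
      ++NumSpillVGPRs; // start filling a new VGPR for the next WaveSize lanes
    Slots.push_back({NumSpillVGPRs - 1, VGPRIndex});
  }
  return Slots;
}
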
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 8accbf611c5f..8e821274bb77 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -502,7 +502,6 @@ public: // FIXME
Register SGPRForBPSaveRestoreCopy;
Optional<int> BasePointerSaveIndex;
- Register VGPRReservedForSGPRSpill;
bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg);
public:
@@ -528,7 +527,6 @@ public:
void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) {
SpillVGPRs[Index].VGPR = NewVGPR;
SpillVGPRs[Index].FI = newFI;
- VGPRReservedForSGPRSpill = NewVGPR;
}
bool removeVGPRForSGPRSpill(Register ReservedVGPR, MachineFunction &MF);
@@ -556,7 +554,6 @@ public:
bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
unsigned NumLane) const;
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
- bool reserveVGPRforSGPRSpills(MachineFunction &MF);
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
void removeDeadFrameIndices(MachineFrameInfo &MFI);
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index 69eab762f05c..24a8879b5684 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -188,7 +188,7 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
unsigned Offset = countTrailingZeros<unsigned>(InstrMode.Mask);
unsigned Width = countTrailingOnes<unsigned>(InstrMode.Mask >> Offset);
unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1);
- BuildMI(MBB, MI, 0, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+ BuildMI(MBB, MI, nullptr, TII->get(AMDGPU::S_SETREG_IMM32_B32))
.addImm(Value)
.addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) |
(Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
index 6bf6c45d8cf6..e13e33ed5457 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -155,6 +155,11 @@ public:
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::IsSSA);
}
+
+ MachineFunctionProperties getClearedProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoPHIs);
+ }
};
} // end anonymous namespace
@@ -366,47 +371,42 @@ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters(
// Re-calculate the liveness of \p Reg in the THEN-region
void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion(
Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const {
-
- SmallPtrSet<MachineBasicBlock *, 16> PHIIncoming;
-
- MachineBasicBlock *ThenEntry = nullptr;
- for (auto *Succ : If->successors()) {
- if (Succ != Flow) {
- ThenEntry = Succ;
- break;
+ SetVector<MachineBasicBlock *> Blocks;
+ SmallVector<MachineBasicBlock *> WorkList({If});
+
+ // Collect all successors until we see the flow block, where we should
+ // reconverge.
+ while (!WorkList.empty()) {
+ auto *MBB = WorkList.pop_back_val();
+ for (auto *Succ : MBB->successors()) {
+ if (Succ != Flow && !Blocks.contains(Succ)) {
+ WorkList.push_back(Succ);
+ Blocks.insert(Succ);
+ }
}
}
- assert(ThenEntry && "No successor in Then region?");
LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
- df_iterator_default_set<MachineBasicBlock *, 16> Visited;
-
- for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
- if (MBB == Flow)
- break;
-
+ for (MachineBasicBlock *MBB : Blocks) {
// Clear Live bit, as we will recalculate afterwards
LLVM_DEBUG(dbgs() << "Clear AliveBlock " << printMBBReference(*MBB)
<< '\n');
OldVarInfo.AliveBlocks.reset(MBB->getNumber());
}
+ SmallPtrSet<MachineBasicBlock *, 4> PHIIncoming;
+
// Get the blocks the Reg should be alive through
for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E;
++I) {
auto *UseMI = I->getParent();
if (UseMI->isPHI() && I->readsReg()) {
- if (Visited.contains(UseMI->getParent()))
+ if (Blocks.contains(UseMI->getParent()))
PHIIncoming.insert(UseMI->getOperand(I.getOperandNo() + 1).getMBB());
}
}
- Visited.clear();
-
- for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
- if (MBB == Flow)
- break;
-
+ for (MachineBasicBlock *MBB : Blocks) {
SmallVector<MachineInstr *> Uses;
// PHI instructions have been processed before.
findNonPHIUsesInBlock(Reg, MBB, Uses);
@@ -433,7 +433,7 @@ void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion(
// Set the isKilled flag if we get new Kills in the THEN region.
for (auto *MI : OldVarInfo.Kills) {
- if (Visited.contains(MI->getParent()))
+ if (Blocks.contains(MI->getParent()))
MI->addRegisterKilled(Reg, TRI);
}
}
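
A self-contained sketch of the worklist collection introduced above, with a simplified Block type standing in for MachineBasicBlock and a vector-plus-set pair approximating SetVector:

#include <unordered_set>
#include <vector>

struct Block {
  std::vector<Block *> Succs; // stand-in for MachineBasicBlock::successors()
};

// Collect every block reachable from If without passing through Flow, i.e.
// the region that reconverges at Flow. Insertion order is preserved, as with
// SetVector. If itself is not part of the result, matching the code above.
std::vector<Block *> collectRegionBlocks(Block *If, Block *Flow) {
  std::vector<Block *> Blocks;
  std::unordered_set<Block *> Seen;
  std::vector<Block *> WorkList{If};
  while (!WorkList.empty()) {
    Block *MBB = WorkList.back();
    WorkList.pop_back();
    for (Block *Succ : MBB->Succs) {
      if (Succ != Flow && Seen.insert(Succ).second) {
        WorkList.push_back(Succ);
        Blocks.push_back(Succ);
      }
    }
  }
  return Blocks;
}
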
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 340e2b48e5cd..eb9452f4b85e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -617,7 +617,7 @@ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16
let HasSGPR = 1;
}
-def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
+def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16], 32,
(add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
@@ -784,7 +784,7 @@ multiclass SRegClass<int numRegs, int priority,
}
defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
-defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64], SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>;
defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
@@ -824,7 +824,7 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
(add VGPR_64)>;
defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
-defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>;
+defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add VGPR_128)>;
defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
@@ -846,7 +846,7 @@ multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
(add AGPR_64)>;
defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
-defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;
+defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add AGPR_128)>;
defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 77ee3c0ff0e4..46efb3c605c6 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -861,12 +861,16 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
MachineInstr *VcmpMI;
const MachineOperand &Op0 = MI.getOperand(0);
const MachineOperand &Op1 = MI.getOperand(1);
+
+ // VCC represents lanes killed.
+ Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
+
if (TRI->isVGPR(*MRI, Op0.getReg())) {
Opcode = AMDGPU::getVOPe32(Opcode);
VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
} else {
VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
- .addReg(AMDGPU::VCC, RegState::Define)
+ .addReg(VCC, RegState::Define)
.addImm(0) // src0 modifiers
.add(Op1)
.addImm(0) // src1 modifiers
@@ -874,9 +878,6 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
.addImm(0); // omod
}
- // VCC represents lanes killed.
- Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
-
MachineInstr *MaskUpdateMI =
BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
.addReg(LiveMaskReg)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1713586dcf5b..3f7837f7dbf1 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -246,10 +246,10 @@ let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
- [(set i32:$sdst, (ctpop i32:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]
>;
def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64",
- [(set i32:$sdst, (ctpop i64:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<ctpop> i64:$src0))]
>;
} // End Defs = [SCC]
@@ -518,10 +518,9 @@ let Uses = [SCC] in {
def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32",
[(set i32:$sdst, (SelectPat<select> i32:$src0, i32:$src1))]
>;
- def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64",
- [(set i64:$sdst, (SelectPat<select> i64:$src0, i64:$src1))]
- >;
}
+
+ def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
} // End Uses = [SCC]
let Defs = [SCC] in {
@@ -551,11 +550,11 @@ def S_XOR_B64 : SOP2_64 <"s_xor_b64",
>;
def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
- [(set i32:$sdst, (not (xor_oneuse i32:$src0, i32:$src1)))]
+ [(set i32:$sdst, (UniformUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1)))]
>;
def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
- [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
+ [(set i64:$sdst, (UniformUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1)))]
>;
def S_NAND_B32 : SOP2_32 <"s_nand_b32",
@@ -1371,7 +1370,7 @@ def : GCNPat <
>;
def : GCNPat <
- (i64 (ctpop i64:$src)),
+ (i64 (UniformUnaryFrag<ctpop> i64:$src)),
(i64 (REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
(S_MOV_B32 (i32 0)), sub1))
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 0bee9022975e..18c348d1cf89 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -79,8 +79,8 @@ const char* const IdSymbolic[] = {
"HW_REG_FLAT_SCR_LO",
"HW_REG_FLAT_SCR_HI",
"HW_REG_XNACK_MASK",
- nullptr, // HW_ID1, no predictable values
- nullptr, // HW_ID2, no predictable values
+ "HW_REG_HW_ID1",
+ "HW_REG_HW_ID2",
"HW_REG_POPS_PACKER",
nullptr,
nullptr,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index d20eaaaa65e8..1e96266eb06c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -132,6 +132,8 @@ bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI) {
#define GET_MIMGInfoTable_IMPL
#define GET_MIMGLZMappingTable_IMPL
#define GET_MIMGMIPMappingTable_IMPL
+#define GET_MIMGBiasMappingTable_IMPL
+#define GET_MIMGOffsetMappingTable_IMPL
#define GET_MIMGG16MappingTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
@@ -410,7 +412,7 @@ void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) {
}
std::string AMDGPUTargetID::toString() const {
- std::string StringRep = "";
+ std::string StringRep;
raw_string_ostream StreamRep(StringRep);
auto TargetTriple = STI.getTargetTriple();
@@ -421,7 +423,7 @@ std::string AMDGPUTargetID::toString() const {
<< TargetTriple.getOSName() << '-'
<< TargetTriple.getEnvironmentName() << '-';
- std::string Processor = "";
+ std::string Processor;
// TODO: The following else statement is present here because we used various
// alias names for GPUs up until GFX9 (e.g. 'fiji' is the same as 'gfx803').
// Remove once all aliases are removed from GCNProcessors.td.
@@ -432,7 +434,7 @@ std::string AMDGPUTargetID::toString() const {
Twine(Version.Stepping))
.str();
- std::string Features = "";
+ std::string Features;
if (Optional<uint8_t> HsaAbiVersion = getHsaAbiVersion(&STI)) {
switch (*HsaAbiVersion) {
case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
@@ -1018,9 +1020,18 @@ static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) {
}
bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) {
- return
- ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
- IdSymbolic[Id] && (Id != ID_XNACK_MASK || !AMDGPU::isGFX10_BEncoding(STI));
+ switch (Id) {
+ case ID_HW_ID:
+ return isSI(STI) || isCI(STI) || isVI(STI) || isGFX9(STI);
+ case ID_HW_ID1:
+ case ID_HW_ID2:
+ return isGFX10Plus(STI);
+ case ID_XNACK_MASK:
+ return isGFX10(STI) && !AMDGPU::isGFX10_BEncoding(STI);
+ default:
+ return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
+ IdSymbolic[Id];
+ }
}
bool isValidHwreg(int64_t Id) {
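
For illustration, a standalone sketch of the shape of the new per-register validity check; GenInfo, the placeholder ID values, and isValidHwregSketch() are hypothetical, not the AMDGPU definitions from SIDefines.h:

#include <cstdint>

// Assumed summary of the subtarget queries used above; the real code asks the
// MCSubtargetInfo via isSI()/isCI()/isVI()/isGFX9()/isGFX10()/etc.
struct GenInfo {
  bool IsPreGFX10; // SI, CI, VI or GFX9
  bool IsGFX10;
  bool IsGFX10Plus;
  bool IsGFX10_BEncoding;
};

// Placeholder IDs, not the real hwreg encodings.
enum HwregId : int64_t { ID_HW_ID = 1, ID_HW_ID1, ID_HW_ID2, ID_XNACK_MASK };

bool isValidHwregSketch(int64_t Id, const GenInfo &G, bool HasSymbolicName,
                        int64_t FirstId, int64_t LastId) {
  switch (Id) {
  case ID_HW_ID:                  // single HW_ID register before GFX10
    return G.IsPreGFX10;
  case ID_HW_ID1:
  case ID_HW_ID2:                 // split into two registers on GFX10+
    return G.IsGFX10Plus;
  case ID_XNACK_MASK:             // valid on GFX10, but not the B encoding
    return G.IsGFX10 && !G.IsGFX10_BEncoding;
  default:                        // otherwise: in range and has a symbolic name
    return FirstId <= Id && Id < LastId && HasSymbolicName;
  }
}
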
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 061c74c0ace6..89f928eb8b92 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -64,6 +64,7 @@ struct GcnBufferFormatInfo {
#define GET_MIMGEncoding_DECL
#define GET_MIMGLZMapping_DECL
#define GET_MIMGMIPMapping_DECL
+#define GET_MIMGBiasMapping_DECL
#include "AMDGPUGenSearchableTables.inc"
namespace IsaInfo {
@@ -330,6 +331,16 @@ struct MIMGMIPMappingInfo {
MIMGBaseOpcode NONMIP;
};
+struct MIMGBiasMappingInfo {
+ MIMGBaseOpcode Bias;
+ MIMGBaseOpcode NoBias;
+};
+
+struct MIMGOffsetMappingInfo {
+ MIMGBaseOpcode Offset;
+ MIMGBaseOpcode NoOffset;
+};
+
struct MIMGG16MappingInfo {
MIMGBaseOpcode G;
MIMGBaseOpcode G16;
@@ -342,6 +353,12 @@ LLVM_READONLY
const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP);
LLVM_READONLY
+const MIMGBiasMappingInfo *getMIMGBiasMappingInfo(unsigned Bias);
+
+LLVM_READONLY
+const MIMGOffsetMappingInfo *getMIMGOffsetMappingInfo(unsigned Offset);
+
+LLVM_READONLY
const MIMGG16MappingInfo *getMIMGG16MappingInfo(unsigned G);
LLVM_READONLY
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 8d232ffe4114..b9ff814a4dc5 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -637,9 +637,9 @@ class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
)
>;
-def : divergent_i64_BinOp <and, V_AND_B32_e32>;
-def : divergent_i64_BinOp <or, V_OR_B32_e32>;
-def : divergent_i64_BinOp <xor, V_XOR_B32_e32>;
+def : divergent_i64_BinOp <and, V_AND_B32_e64>;
+def : divergent_i64_BinOp <or, V_OR_B32_e64>;
+def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
let SubtargetPredicate = Has16BitInsts in {
@@ -688,6 +688,36 @@ let SubtargetPredicate = HasDLInsts in {
let isReMaterializable = 1 in
defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>;
+def : GCNPat<
+ (i32 (DivergentUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1))),
+ (i32 (V_XNOR_B32_e64 $src0, $src1))
+>;
+
+def : GCNPat<
+ (i32 (DivergentBinFrag<xor_oneuse> (not i32:$src0), i32:$src1)),
+ (i32 (V_XNOR_B32_e64 $src0, $src1))
+>;
+
+def : GCNPat<
+ (i64 (DivergentUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1))),
+ (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64
+ (i32 (EXTRACT_SUBREG $src0, sub0)),
+ (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0,
+ (i32 (V_XNOR_B32_e64
+ (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1)
+>;
+
+def : GCNPat<
+ (i64 (DivergentBinFrag<xor_oneuse> (not i64:$src0), i64:$src1)),
+ (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64
+ (i32 (EXTRACT_SUBREG $src0, sub0)),
+ (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0,
+ (i32 (V_XNOR_B32_e64
+ (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1)
+>;
+
let Constraints = "$vdst = $src2",
DisableEncoding = "$src2",
isConvertibleToThreeAddress = 1,
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 32222b3eb93c..707475ceccee 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -388,6 +388,12 @@ class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC
let HasModifiers = 0;
let Asm64 = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp";
let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
+ // Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs.
+ // We then create two versions of the instruction: with tied dst and src2
+ // and with the earlyclobber flag on the dst. This is stricter than the
+ // actual HW restriction. In particular earlyclobber also affects src0 and
+ // src1 allocation which is not required.
+ bit NoDstOverlap = !gt(DstVT.Size, 128);
}
def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, AISrc_128_f32, ADst_128>;
@@ -426,6 +432,11 @@ def VOPProfileMAI_F32_V4I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F
def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, VISrc_256_f64, VDst_256, AVSrc_64>;
def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI<VOP_F64_F64_F64_F64, VISrc_64_f64, VDst_64, AVSrc_64>;
+class MFMATable <bit is_mac, string Name> {
+ bit IsMac = is_mac;
+ string FMAOp = Name;
+}
+
let Predicates = [HasMAIInsts] in {
let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
@@ -435,13 +446,31 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
} // End isMoveImm = 1
} // End isAsCheapAsAMove = 1, isReMaterializable = 1
-multiclass MAIInst<string OpName, string P, SDPatternOperator node> {
+multiclass MAIInst<string OpName, string P, SDPatternOperator node,
+ bit NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap> {
let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
- defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>;
-
- let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in
- defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>;
+ let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in {
+ defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), !if(NoDstOverlap, null_frag, node)>,
+ MFMATable<0, NAME # "_e64">;
+
+ let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in
+ defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>,
+ MFMATable<0, NAME # "_vgprcd_e64">;
+ }
+
+ foreach _ = BoolToList<NoDstOverlap>.ret in {
+ let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""),
+ isConvertibleToThreeAddress = NoDstOverlap,
+ Mnemonic = OpName in {
+ defm "_mac" : VOP3Inst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>,
+ MFMATable<1, NAME # "_e64">;
+
+ let SubtargetPredicate = isGFX90APlus in
+ defm _mac_vgprcd : VOP3Inst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>,
+ MFMATable<1, NAME # "_vgprcd_e64">;
+ }
+ }
} // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
}
@@ -517,6 +546,7 @@ multiclass VOP3P_Real_MAI<bits<7> op> {
}
}
+let Constraints = "" in {
multiclass VOP3P_Real_MFMA_gfx90a<bits<7> op> {
let SubtargetPredicate = isGFX90AOnly,
AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" in {
@@ -536,6 +566,7 @@ multiclass VOP3P_Real_MFMA<bits<7> op> :
let DecoderNamespace = "GFX8";
}
}
+}
defm V_PK_MAD_I16 : VOP3P_Real_vi <0x00>;
defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x01>;