Diffstat (limited to 'lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r-- | lib/Target/AMDGPU/SIISelLowering.cpp | 1725
1 files changed, 1263 insertions, 462 deletions
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 544867513d9c..51241cf0a432 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18,33 +18,46 @@
 #include <cmath>
 #endif
 
-#include "SIISelLowering.h"
 #include "AMDGPU.h"
-#include "AMDGPUDiagnosticInfoUnsupported.h"
 #include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
+#include "SIISelLowering.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
-#include "llvm/ADT/SmallString.h"
 
 using namespace llvm;
 
-SITargetLowering::SITargetLowering(TargetMachine &TM,
-                                   const AMDGPUSubtarget &STI)
+// -amdgpu-fast-fdiv - Command line option to enable faster 2.5 ulp fdiv.
+static cl::opt<bool> EnableAMDGPUFastFDIV(
+  "amdgpu-fast-fdiv",
+  cl::desc("Enable faster 2.5 ulp fdiv"),
+  cl::init(false));
+
+static unsigned findFirstFreeSGPR(CCState &CCInfo) {
+  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
+    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
+      return AMDGPU::SGPR0 + Reg;
+    }
+  }
+  llvm_unreachable("Cannot allocate sgpr");
+}
+
+SITargetLowering::SITargetLowering(const TargetMachine &TM,
+                                   const SISubtarget &STI)
     : AMDGPUTargetLowering(TM, STI) {
   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
 
-  addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
-  addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
-
   addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
   addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
 
@@ -66,34 +79,25 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
 
   computeRegisterProperties(STI.getRegisterInfo());
 
-  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
-  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
-  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
-  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
-
-  setOperationAction(ISD::ADD, MVT::i32, Legal);
-  setOperationAction(ISD::ADDC, MVT::i32, Legal);
-  setOperationAction(ISD::ADDE, MVT::i32, Legal);
-  setOperationAction(ISD::SUBC, MVT::i32, Legal);
-  setOperationAction(ISD::SUBE, MVT::i32, Legal);
-
-  setOperationAction(ISD::FSIN, MVT::f32, Custom);
-  setOperationAction(ISD::FCOS, MVT::f32, Custom);
-
-  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
-  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
-
   // We need to custom lower vector stores from local memory
+  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::i1, Custom);
 
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
-  setOperationAction(ISD::STORE, MVT::i1, Custom);
-  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+
setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); + + setOperationAction(ISD::SELECT, MVT::i1, Promote); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); @@ -102,109 +106,39 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SETCC, MVT::i1, Promote); setOperationAction(ISD::SETCC, MVT::v2i1, Expand); setOperationAction(ISD::SETCC, MVT::v4i1, Expand); - setOperationAction(ISD::BSWAP, MVT::i32, Legal); - setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); - - for (MVT VT : MVT::integer_valuetypes()) { - if (VT == MVT::i64) - continue; - - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); - } - - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); - } - - for (MVT VT : MVT::fp_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); - - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); - - setTruncStoreAction(MVT::i64, MVT::i32, Expand); - setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); - - - 
setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); - - setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); - setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); - - setOperationAction(ISD::LOAD, MVT::i1, Custom); - - setOperationAction(ISD::LOAD, MVT::v2i64, Promote); - AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); - - setOperationAction(ISD::STORE, MVT::v2i64, Promote); - AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); - - setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); - - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::FrameIndex, MVT::i32, Custom); - - // These should use UDIVREM, so set them to expand - setOperationAction(ISD::UDIV, MVT::i64, Expand); - setOperationAction(ISD::UREM, MVT::i64, Expand); - - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); - setOperationAction(ISD::SELECT, MVT::i1, Promote); - - setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); - - - setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + setOperationAction(ISD::BR_CC, MVT::i1, Expand); + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_CC, MVT::i64, Expand); + setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::BR_CC, MVT::f64, Expand); // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { - switch(Op) { + switch (Op) { case ISD::LOAD: case ISD::STORE: case ISD::BUILD_VECTOR: @@ -241,13 +175,46 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); } - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + + // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, + // and output demarshalling + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); + + // We can't return success/failure, only the old value, + // let LLVM add the comparison + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); + + if (getSubtarget()->hasFlatAddressSpace()) { + setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); + setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); + } + + setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + + // On SI this is s_memtime and s_memrealtime on VI. 
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); + setOperationAction(ISD::TRAP, MVT::Other, Custom); + + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + + if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); } setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + + setOperationAction(ISD::FSIN, MVT::f32, Custom); + setOperationAction(ISD::FCOS, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); @@ -263,6 +230,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::FCANONICALIZE); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. @@ -287,10 +255,33 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setSchedulingPreference(Sched::RegPressure); } +const SISubtarget *SITargetLowering::getSubtarget() const { + return static_cast<const SISubtarget *>(Subtarget); +} + //===----------------------------------------------------------------------===// // TargetLowering queries //===----------------------------------------------------------------------===// +bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &CI, + unsigned IntrID) const { + switch (IntrID) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align = 0; + Info.vol = false; + Info.readMem = true; + Info.writeMem = true; + return true; + default: + return false; + } +} + bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no @@ -348,7 +339,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, switch (AS) { case AMDGPUAS::GLOBAL_ADDRESS: { - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { // Assume the we will use FLAT for all global memory accesses // on VI. // FIXME: This assumption is currently wrong. On VI we still use @@ -376,16 +367,16 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, if (DL.getTypeStoreSize(Ty) < 4) return isLegalMUBUFAddressingMode(AM); - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { // SMRD instructions have an 8-bit, dword offset on SI. if (!isUInt<8>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { + } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) { // On CI+, this can also be a 32-bit literal constant offset. If it fits // in 8-bits, it can use a smaller encoding. if (!isUInt<32>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) { // On VI, these use the SMEM format and the offset is 20-bit in bytes. 
if (!isUInt<20>(AM.BaseOffs)) return false; @@ -402,7 +393,6 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } case AMDGPUAS::PRIVATE_ADDRESS: - case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: return isLegalMUBUFAddressingMode(AM); case AMDGPUAS::LOCAL_ADDRESS: @@ -423,6 +413,12 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; } case AMDGPUAS::FLAT_ADDRESS: + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: + // For an unknown address space, this usually means that this is for some + // reason being used for pure arithmetic, and not based on some addressing + // computation. We don't have instructions that compute pointers with any + // addressing modes, so treat them as having no offset like flat + // instructions. return isLegalFlatAddressingMode(AM); default: @@ -442,24 +438,30 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, if (!VT.isSimple() || VT == MVT::Other) return false; - // TODO - CI+ supports unaligned memory accesses, but this requires driver - // support. - - // XXX - The only mention I see of this in the ISA manual is for LDS direct - // reads the "byte address and must be dword aligned". Is it also true for the - // normal loads and stores? - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || + AddrSpace == AMDGPUAS::REGION_ADDRESS) { // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte // aligned, 8 byte access in a single operation using ds_read2/write2_b32 // with adjacent offsets. bool AlignedBy4 = (Align % 4 == 0); if (IsFast) *IsFast = AlignedBy4; + return AlignedBy4; } + if (Subtarget->hasUnalignedBufferAccess()) { + // If we have an uniform constant load, it still requires using a slow + // buffer instruction if unaligned. + if (IsFast) { + *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ? + (Align % 4 == 0) : true; + } + + return true; + } + // Smaller than dword value must be aligned. - // FIXME: This should be allowed on CI+ if (VT.bitsLT(MVT::i32)) return false; @@ -500,21 +502,22 @@ static bool isFlatGlobalAddrSpace(unsigned AS) { bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { - return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); } - bool SITargetLowering::isMemOpUniform(const SDNode *N) const { const MemSDNode *MemNode = cast<MemSDNode>(N); const Value *Ptr = MemNode->getMemOperand()->getValue(); // UndefValue means this is a load of a kernel input. These are uniform. - // Sometimes LDS instructions have constant pointers - if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) || - isa<GlobalValue>(Ptr)) + // Sometimes LDS instructions have constant pointers. + // If Ptr is null, then that means this mem operand contains a + // PseudoSourceValue like GOT. 
+ if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || + isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) return true; - const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); + const Instruction *I = dyn_cast<Instruction>(Ptr); return I && I->getMetadata("amdgpu.uniform"); } @@ -528,29 +531,42 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const { bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); return TII->isInlineConstant(Imm); } -SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, - SDLoc SL, SDValue Chain, - unsigned Offset, bool Signed) const { +bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { + + // SimplifySetCC uses this function to determine whether or not it should + // create setcc with i1 operands. We don't have instructions for i1 setcc. + if (VT == MVT::i1 && Op == ISD::SETCC) + return false; + + return TargetLowering::isTypeDesirableForOp(Op, VT); +} + +SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG, + const SDLoc &SL, SDValue Chain, + unsigned Offset) const { const DataLayout &DL = DAG.getDataLayout(); MachineFunction &MF = DAG.getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); - MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, MRI.getLiveInVirtReg(InputPtrReg), PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(Offset, SL, PtrVT)); + return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(Offset, SL, PtrVT)); +} +SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, + const SDLoc &SL, SDValue Chain, + unsigned Offset, bool Signed) const { + const DataLayout &DL = DAG.getDataLayout(); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); SDValue PtrOffset = DAG.getUNDEF(PtrVT); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); @@ -560,34 +576,35 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, if (MemVT.isFloatingPoint()) ExtTy = ISD::EXTLOAD; - return DAG.getLoad(ISD::UNINDEXED, ExtTy, - VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment + SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset); + return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset, + PtrInfo, MemVT, Align, MachineMemOperand::MONonTemporal | + MachineMemOperand::MOInvariant); } SDValue SITargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + const SmallVectorImpl<ISD::InputArg> &Ins, const 
SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { + if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { const Function *Fn = MF.getFunction(); - DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA"); + DiagnosticInfoUnsupported NoGraphicsHSA( + *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); DAG.getContext()->diagnose(NoGraphicsHSA); - return SDValue(); + return DAG.getEntryNode(); } - // FIXME: We currently assume all calling conventions are kernels. + // Create stack objects that are used for emitting debugger prologue if + // "amdgpu-debugger-emit-prologue" attribute was specified. + if (ST.debuggerEmitPrologue()) + createDebuggerPrologueStackObjects(MF); SmallVector<ISD::InputArg, 16> Splits; BitVector Skipped(Ins.size()); @@ -596,7 +613,7 @@ SDValue SITargetLowering::LowerFormalArguments( const ISD::InputArg &Arg = Ins[i]; // First check if it's a PS input addr - if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && + if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() && !Arg.Flags.isByVal() && PSInputNum <= 15) { if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { @@ -613,25 +630,26 @@ SDValue SITargetLowering::LowerFormalArguments( ++PSInputNum; } - // Second split vertices into their elements - if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { - ISD::InputArg NewArg = Arg; - NewArg.Flags.setSplit(); - NewArg.VT = Arg.VT.getVectorElementType(); - - // We REALLY want the ORIGINAL number of vertex elements here, e.g. a - // three or five element vertex only needs three or five registers, - // NOT four or eight. - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - for (unsigned j = 0; j != NumElements; ++j) { - Splits.push_back(NewArg); - NewArg.PartOffset += NewArg.VT.getStoreSize(); + if (AMDGPU::isShader(CallConv)) { + // Second split vertices into their elements + if (Arg.VT.isVector()) { + ISD::InputArg NewArg = Arg; + NewArg.Flags.setSplit(); + NewArg.VT = Arg.VT.getVectorElementType(); + + // We REALLY want the ORIGINAL number of vertex elements here, e.g. a + // three or five element vertex only needs three or five registers, + // NOT four or eight. + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + unsigned NumElements = ParamType->getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + Splits.push_back(NewArg); + NewArg.PartOffset += NewArg.VT.getStoreSize(); + } + } else { + Splits.push_back(Arg); } - - } else if (Info->getShaderType() != ShaderType::COMPUTE) { - Splits.push_back(Arg); } } @@ -651,19 +669,27 @@ SDValue SITargetLowering::LowerFormalArguments( // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be // enabled too. 
- if (Info->getShaderType() == ShaderType::PIXEL && + if (CallConv == CallingConv::AMDGPU_PS && ((Info->getPSInputAddr() & 0x7F) == 0 || - ((Info->getPSInputAddr() & 0xF) == 0 && - Info->isPSInputAllocated(11)))) { + ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) { CCInfo.AllocateReg(AMDGPU::VGPR0); CCInfo.AllocateReg(AMDGPU::VGPR1); Info->markPSInputAllocated(0); Info->PSInputEna |= 1; } - if (Info->getShaderType() == ShaderType::COMPUTE) { + if (!AMDGPU::isShader(CallConv)) { getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, Splits); + + assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); + } else { + assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() && + !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && + !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && + !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && + !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && + !Info->hasWorkItemIDZ()); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? @@ -679,12 +705,24 @@ SDValue SITargetLowering::LowerFormalArguments( CCInfo.AllocateReg(DispatchPtrReg); } + if (Info->hasQueuePtr()) { + unsigned QueuePtrReg = Info->addQueuePtr(*TRI); + MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(QueuePtrReg); + } + if (Info->hasKernargSegmentPtr()) { unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); CCInfo.AllocateReg(InputPtrReg); } + if (Info->hasFlatScratchInit()) { + unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); + MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(FlatScratchInitReg); + } + AnalyzeFormalArguments(CCInfo, Splits); SmallVector<SDValue, 16> Chains; @@ -713,7 +751,7 @@ SDValue SITargetLowering::LowerFormalArguments( auto *ParamTy = dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { // On SI local pointers are just offsets into LDS, so they are always // less than 16-bits. On CI and newer they could potentially be @@ -765,7 +803,7 @@ SDValue SITargetLowering::LowerFormalArguments( NumElements = Arg.VT.getVectorNumElements() - NumElements; Regs.append(NumElements, DAG.getUNDEF(VT)); - InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); + InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs)); continue; } @@ -780,8 +818,7 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned Reg = Info->addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); CCInfo.AllocateReg(Reg); - } else - llvm_unreachable("work group id x is always enabled"); + } if (Info->hasWorkGroupIDY()) { unsigned Reg = Info->addWorkGroupIDY(); @@ -803,8 +840,13 @@ SDValue SITargetLowering::LowerFormalArguments( if (Info->hasPrivateSegmentWaveByteOffset()) { // Scratch wave offset passed in system SGPR. 
- unsigned PrivateSegmentWaveByteOffsetReg - = Info->addPrivateSegmentWaveByteOffset(); + unsigned PrivateSegmentWaveByteOffsetReg; + + if (AMDGPU::isShader(CallConv)) { + PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); + Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); + } else + PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset(); MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); @@ -812,8 +854,11 @@ SDValue SITargetLowering::LowerFormalArguments( // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. - bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + // Record that we know we have non-spill stack objects so we don't need to + // check all stack objects later. + if (HasStackObjects) + Info->setHasNonSpillStackObjects(true); if (ST.isAmdHsaOS()) { // TODO: Assume we will spill without optimizations. @@ -866,8 +911,7 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); CCInfo.AllocateReg(Reg); - } else - llvm_unreachable("workitem id x should always be enabled"); + } if (Info->hasWorkItemIDY()) { unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); @@ -887,16 +931,16 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } -SDValue SITargetLowering::LowerReturn(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - SDLoc DL, SelectionDAG &DAG) const { +SDValue +SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SDLoc &DL, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - if (Info->getShaderType() == ShaderType::COMPUTE) + if (!AMDGPU::isShader(CallConv)) return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, OutVals, DL, DAG); @@ -975,17 +1019,131 @@ SDValue SITargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps); + unsigned Opc = Info->returnsVoid() ? 
AMDGPUISD::ENDPGM : AMDGPUISD::RETURN; + return DAG.getNode(Opc, DL, MVT::Other, RetOps); } -MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( - MachineInstr * MI, MachineBasicBlock * BB) const { - - switch (MI->getOpcode()) { +unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { + unsigned Reg = StringSwitch<unsigned>(RegName) + .Case("m0", AMDGPU::M0) + .Case("exec", AMDGPU::EXEC) + .Case("exec_lo", AMDGPU::EXEC_LO) + .Case("exec_hi", AMDGPU::EXEC_HI) + .Case("flat_scratch", AMDGPU::FLAT_SCR) + .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) + .Default(AMDGPU::NoRegister); + + if (Reg == AMDGPU::NoRegister) { + report_fatal_error(Twine("invalid register name \"" + + StringRef(RegName) + "\".")); + + } + + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && + Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) { + report_fatal_error(Twine("invalid register \"" + + StringRef(RegName) + "\" for subtarget.")); + } + + switch (Reg) { + case AMDGPU::M0: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: + if (VT.getSizeInBits() == 32) + return Reg; + break; + case AMDGPU::EXEC: + case AMDGPU::FLAT_SCR: + if (VT.getSizeInBits() == 64) + return Reg; + break; default: - return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + llvm_unreachable("missing register type checking"); + } + + report_fatal_error(Twine("invalid type for register \"" + + StringRef(RegName) + "\".")); +} + +// If kill is not the last instruction, split the block so kill is always a +// proper terminator. +MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI, + MachineBasicBlock *BB) const { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + + MachineBasicBlock::iterator SplitPoint(&MI); + ++SplitPoint; + + if (SplitPoint == BB->end()) { + // Don't bother with a new block. + MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); + return BB; + } + + MachineFunction *MF = BB->getParent(); + MachineBasicBlock *SplitBB + = MF->CreateMachineBasicBlock(BB->getBasicBlock()); + + // Fix the block phi references to point to the new block for the defs in the + // second piece of the block. 
+ for (MachineBasicBlock *Succ : BB->successors()) { + for (MachineInstr &MI : *Succ) { + if (!MI.isPHI()) + break; + + for (unsigned I = 2, E = MI.getNumOperands(); I != E; I += 2) { + MachineOperand &FromBB = MI.getOperand(I); + if (BB == FromBB.getMBB()) { + FromBB.setMBB(SplitBB); + break; + } + } + } + } + + MF->insert(++MachineFunction::iterator(BB), SplitBB); + SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); + + SplitBB->transferSuccessors(BB); + BB->addSuccessor(SplitBB); + + MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); + return SplitBB; +} + +MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *BB) const { + switch (MI.getOpcode()) { + case AMDGPU::SI_INIT_M0: { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), + TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addOperand(MI.getOperand(0)); + MI.eraseFromParent(); + break; + } case AMDGPU::BRANCH: return BB; + case AMDGPU::GET_GROUPSTATICSIZE: { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + + MachineFunction *MF = BB->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + DebugLoc DL = MI.getDebugLoc(); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32)) + .addOperand(MI.getOperand(0)) + .addImm(MFI->LDSSize); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::SI_KILL: + return splitKillBlock(MI, BB); + default: + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } return BB; } @@ -1072,6 +1230,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerTrig(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::FDIV: return LowerFDIV(Op, DAG); + case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::GlobalAddress: { MachineFunction &MF = DAG.getMachineFunction(); @@ -1079,7 +1238,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerGlobalAddress(MFI, Op, DAG); } case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); + case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); + case ISD::TRAP: return lowerTRAP(Op, DAG); } return SDValue(); } @@ -1106,25 +1268,78 @@ SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FINode->getIndex(); - // A FrameIndex node represents a 32-bit offset into scratch memory. If - // the high bit of a frame index offset were to be set, this would mean - // that it represented an offset of ~2GB * 64 = ~128GB from the start of the - // scratch buffer, with 64 being the number of threads per wave. + // A FrameIndex node represents a 32-bit offset into scratch memory. If the + // high bit of a frame index offset were to be set, this would mean that it + // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch + // buffer, with 64 being the number of threads per wave. // - // If we know the machine uses less than 128GB of scratch, then we can - // amrk the high bit of the FrameIndex node as known zero, - // which is important, because it means in most situations we can - // prove that values derived from FrameIndex nodes are non-negative. 
- // This enables us to take advantage of more addressing modes when - // accessing scratch buffers, since for scratch reads/writes, the register - // offset must always be positive. + // The maximum private allocation for the entire GPU is 4G, and we are + // concerned with the largest the index could ever be for an individual + // workitem. This will occur with the minmum dispatch size. If a program + // requires more, the dispatch size will be reduced. + // + // With this limit, we can mark the high bit of the FrameIndex node as known + // zero, which is important, because it means in most situations we can prove + // that values derived from FrameIndex nodes are non-negative. This enables us + // to take advantage of more addressing modes when accessing scratch buffers, + // since for scratch reads/writes, the register offset must always be + // positive. - SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); - if (Subtarget->enableHugeScratchBuffer()) - return TFI; + uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024; + // XXX - It is unclear if partial dispatch works. Assume it works at half wave + // granularity. It is probably a full wave. + uint64_t MinGranularity = 32; + + unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity); + EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits); + + SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI, - DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31))); + DAG.getValueType(ExtVT)); +} + +bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { + if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN) + return false; + + switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) { + default: return false; + case AMDGPUIntrinsic::amdgcn_if: + case AMDGPUIntrinsic::amdgcn_else: + case AMDGPUIntrinsic::amdgcn_break: + case AMDGPUIntrinsic::amdgcn_if_break: + case AMDGPUIntrinsic::amdgcn_else_break: + case AMDGPUIntrinsic::amdgcn_loop: + case AMDGPUIntrinsic::amdgcn_end_cf: + return true; + } +} + +void SITargetLowering::createDebuggerPrologueStackObjects( + MachineFunction &MF) const { + // Create stack objects that are used for emitting debugger prologue. + // + // Debugger prologue writes work group IDs and work item IDs to scratch memory + // at fixed location in the following format: + // offset 0: work group ID x + // offset 4: work group ID y + // offset 8: work group ID z + // offset 16: work item ID x + // offset 20: work item ID y + // offset 24: work item ID z + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + int ObjectIdx = 0; + + // For each dimension: + for (unsigned i = 0; i < 3; ++i) { + // Create fixed stack object for work group ID. + ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4, true); + Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx); + // Create fixed stack object for work item ID. 
+ ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4 + 16, true); + Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx); + } } /// This transforms the control flow intrinsics to get the branch destination as @@ -1137,13 +1352,11 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SDNode *Intr = BRCOND.getOperand(1).getNode(); SDValue Target = BRCOND.getOperand(2); SDNode *BR = nullptr; + SDNode *SetCC = nullptr; if (Intr->getOpcode() == ISD::SETCC) { // As long as we negate the condition everything is fine - SDNode *SetCC = Intr; - assert(SetCC->getConstantOperandVal(1) == 1); - assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == - ISD::SETNE); + SetCC = Intr; Intr = SetCC->getOperand(0).getNode(); } else { @@ -1152,7 +1365,15 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, Target = BR->getOperand(1); } - assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); + if (!isCFIntrinsic(Intr)) { + // This is a uniform branch so we don't need to legalize. + return BRCOND; + } + + assert(!SetCC || + (SetCC->getConstantOperandVal(1) == 1 && + cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == + ISD::SETNE)); // Build the result and ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); @@ -1204,37 +1425,185 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, return Chain; } +SDValue SITargetLowering::getSegmentAperture(unsigned AS, + SelectionDAG &DAG) const { + SDLoc SL; + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + unsigned UserSGPR = Info->getQueuePtrUserSGPR(); + assert(UserSGPR != AMDGPU::NoRegister); + + SDValue QueuePtr = CreateLiveInRegister( + DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + + // Offset into amd_queue_t for group_segment_aperture_base_hi / + // private_segment_aperture_base_hi. + uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; + + SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr, + DAG.getConstant(StructOffset, SL, MVT::i64)); + + // TODO: Use custom target PseudoSourceValue. + // TODO: We should use the value from the IR intrinsic call, but it might not + // be available and how do we get it? + Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()), + AMDGPUAS::CONSTANT_ADDRESS)); + + MachinePointerInfo PtrInfo(V, StructOffset); + return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo, + MinAlign(64, StructOffset), + MachineMemOperand::MOInvariant); +} + +SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op); + + SDValue Src = ASC->getOperand(0); + + // FIXME: Really support non-0 null pointers. 
+ SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32); + SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); + + // flat -> local/private + if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { + if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); + SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); + + return DAG.getNode(ISD::SELECT, SL, MVT::i32, + NonNull, Ptr, SegmentNullPtr); + } + } + + // local/private -> flat + if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { + if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NonNull + = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); + + SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG); + SDValue CvtPtr + = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); + + return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, + DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr), + FlatNullPtr); + } + } + + // global <-> flat are no-ops and never emitted. + + const MachineFunction &MF = DAG.getMachineFunction(); + DiagnosticInfoUnsupported InvalidAddrSpaceCast( + *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); + DAG.getContext()->diagnose(InvalidAddrSpaceCast); + + return DAG.getUNDEF(ASC->getValueType(0)); +} + +static bool shouldEmitGOTReloc(const GlobalValue *GV, + const TargetMachine &TM) { + return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); +} + +bool +SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + // We can fold offsets for anything that doesn't require a GOT relocation. + return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine()); +} + +static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, + SDLoc DL, unsigned Offset, EVT PtrVT, + unsigned GAFlags = SIInstrInfo::MO_NONE) { + // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is + // lowered to the following code sequence: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // a fixup or relocation is emitted to replace $symbol with a literal + // constant, which is a pc-relative offset from the encoding of the $symbol + // operand to the global variable. + // + // What we want here is an offset from the value returned by s_getpc + // (which is the address of the s_add_u32 instruction) to the global + // variable, but since the encoding of $symbol starts 4 bytes after the start + // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too + // small. This requires us to add 4 to the global variable offset in order to + // compute the correct address. 
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, + GAFlags); + return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, GA); +} + SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); - if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && + GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); SDLoc DL(GSD); const GlobalValue *GV = GSD->getGlobal(); - MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); + EVT PtrVT = Op.getValueType(); + + if (!shouldEmitGOTReloc(GV, getTargetMachine())) + return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); + + SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT, + SIInstrInfo::MO_GOTPCREL); + + Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); + const DataLayout &DataLayout = DAG.getDataLayout(); + unsigned Align = DataLayout.getABITypeAlignment(PtrTy); + // FIXME: Use a PseudoSourceValue once those can be assigned an address space. + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); - SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA); + return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align, + MachineMemOperand::MOInvariant); } -SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, - SDValue V) const { +SDValue SITargetLowering::lowerTRAP(SDValue Op, + SelectionDAG &DAG) const { + const MachineFunction &MF = DAG.getMachineFunction(); + DiagnosticInfoUnsupported NoTrap(*MF.getFunction(), + "trap handler not supported", + Op.getDebugLoc(), + DS_Warning); + DAG.getContext()->diagnose(NoTrap); + + // Emit s_endpgm. + + // FIXME: This should really be selected to s_trap, but that requires + // setting up the trap handler for it o do anything. + return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other, + Op.getOperand(0)); +} + +SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, + const SDLoc &DL, SDValue V) const { + // We can't use S_MOV_B32 directly, because there is no way to specify m0 as + // the destination register. + // // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, // so we will end up with redundant moves to m0. // - // We can't use S_MOV_B32, because there is no way to specify m0 as the - // destination register. - // - // We have to use them both. Machine cse will combine all the S_MOV_B32 - // instructions and the register coalescer eliminate the extra copies. - SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); - return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), - SDValue(M0, 0), SDValue()); // Glue - // A Null SDValue creates - // a glue result. + // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result. + + // A Null SDValue creates a glue result. 
+ SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue, + V, Chain); + return SDValue(M0, 0); } SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, @@ -1249,12 +1618,27 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, DAG.getValueType(VT)); } +static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { + DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), + "non-hsa intrinsic with hsa target", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); +} + +static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { + DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), + "intrinsic not supported on subtarget", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); auto MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); EVT VT = Op.getValueType(); SDLoc DL(Op); @@ -1264,62 +1648,134 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_dispatch_ptr: + case Intrinsic::amdgcn_queue_ptr: { if (!Subtarget->isAmdHsaOS()) { - DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(), - "hsa intrinsic without hsa target"); + DiagnosticInfoUnsupported BadIntrin( + *MF.getFunction(), "unsupported hsa intrinsic without hsa target", + DL.getDebugLoc()); DAG.getContext()->diagnose(BadIntrin); return DAG.getUNDEF(VT); } + auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? 
+ SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR; return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); - + TRI->getPreloadedValue(MF, Reg), VT); + } + case Intrinsic::amdgcn_implicitarg_ptr: { + unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); + return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset); + } + case Intrinsic::amdgcn_kernarg_segment_ptr: { + unsigned Reg + = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + } + case Intrinsic::amdgcn_rcp: + return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_rsq: + case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_rsq_legacy: { + if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + return emitRemovedIntrinsicError(DAG, DL, VT); + + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + } + case Intrinsic::amdgcn_rsq_clamp: { + if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); + + Type *Type = VT.getTypeForEVT(*DAG.getContext()); + APFloat Max = APFloat::getLargest(Type->getFltSemantics()); + APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); + + SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, + DAG.getConstantFP(Max, DL, VT)); + return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, + DAG.getConstantFP(Min, DL, VT)); + } case Intrinsic::r600_read_ngroups_x: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_X, false); case Intrinsic::r600_read_ngroups_y: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_Y, false); case Intrinsic::r600_read_ngroups_z: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_Z, false); case Intrinsic::r600_read_global_size_x: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_X, false); case Intrinsic::r600_read_global_size_y: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); case Intrinsic::r600_read_global_size_z: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: + if (Subtarget->isAmdHsaOS()) + return 
emitNonHSAIntrinsicError(DAG, DL, VT); + return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Z); - case Intrinsic::AMDGPU_read_workdim: + case Intrinsic::amdgcn_read_workdim: + case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name. // Really only 2 bits. return lowerImplicitZextParam(DAG, Op, MVT::i8, getImplicitParameterOffset(MFI, GRID_DIM)); + case Intrinsic::amdgcn_workgroup_id_x: case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); + case Intrinsic::amdgcn_workgroup_id_y: case Intrinsic::r600_read_tgid_y: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); + case Intrinsic::amdgcn_workgroup_id_z: case Intrinsic::r600_read_tgid_z: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); + case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::r600_read_tidig_x: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); + case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::r600_read_tidig_y: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); + case Intrinsic::amdgcn_workitem_id_z: case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); @@ -1336,24 +1792,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, VT, MMO); } - case AMDGPUIntrinsic::SI_sample: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); - case AMDGPUIntrinsic::SI_sampleb: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); - case AMDGPUIntrinsic::SI_sampled: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); - case AMDGPUIntrinsic::SI_samplel: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case AMDGPUIntrinsic::AMDGPU_fract: - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 
- return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), - DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); case AMDGPUIntrinsic::SI_fs_constant: { SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); SDValue Glue = M0.getValue(1); @@ -1393,11 +1837,93 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), Glue); } + case Intrinsic::amdgcn_sin: + return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); + + case Intrinsic::amdgcn_cos: + return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); + + case Intrinsic::amdgcn_log_clamp: { + if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + return SDValue(); + + DiagnosticInfoUnsupported BadIntrin( + *MF.getFunction(), "intrinsic not supported on subtarget", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); + } + case Intrinsic::amdgcn_ldexp: + return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::amdgcn_fract: + return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + + case Intrinsic::amdgcn_class: + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_div_fmas: + return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(4)); + + case Intrinsic::amdgcn_div_fixup: + return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::amdgcn_trig_preop: + return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_div_scale: { + // 3rd parameter required to be a constant. + const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); + if (!Param) + return DAG.getUNDEF(VT); + + // Translate to the operands expected by the machine instruction. The + // first parameter must be the same as the first instruction. + SDValue Numerator = Op.getOperand(1); + SDValue Denominator = Op.getOperand(2); + + // Note this order is opposite of the machine instruction's operations, + // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The + // intrinsic has the numerator as the first operand to match a normal + // division operation. + + SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; + + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, + Denominator, Numerator); + } default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } } +SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + switch (IntrID) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: { + MemSDNode *M = cast<MemSDNode>(Op); + unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ? 
+ AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC; + SDValue Ops[] = { + M->getOperand(0), // Chain + M->getOperand(2), // Ptr + M->getOperand(3) // Value + }; + + return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } + default: + return SDValue(); + } +} + SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -1439,6 +1965,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, Op->getVTList(), Ops, VT, MMO); } + case AMDGPUIntrinsic::AMDGPU_kill: { + if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Op.getOperand(2))) { + if (!K->isNegative()) + return Chain; + } + + return Op; + } default: return SDValue(); } @@ -1447,48 +1981,92 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *Load = cast<LoadSDNode>(Op); + ISD::LoadExtType ExtType = Load->getExtensionType(); + EVT MemVT = Load->getMemoryVT(); - if (Op.getValueType().isVector()) { - assert(Op.getValueType().getVectorElementType() == MVT::i32 && - "Custom lowering for non-i32 vectors hasn't been implemented."); - unsigned NumElements = Op.getValueType().getVectorNumElements(); - assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { + assert(MemVT == MVT::i1 && "Only i1 non-extloads expected"); + // FIXME: Copied from PPC + // First, load into 32 bits, then truncate to 1 bit. - switch (Load->getAddressSpace()) { - default: break; - case AMDGPUAS::CONSTANT_ADDRESS: - if (isMemOpUniform(Load)) - break; - // Non-uniform loads will be selected to MUBUF instructions, so they - // have the same legalization requires ments as global and private - // loads. - // - // Fall-through - case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::PRIVATE_ADDRESS: - if (NumElements >= 8) - return SplitVectorLoad(Op, DAG); - - // v4 loads are supported for private and global memory. - if (NumElements <= 4) - break; - // fall-through - case AMDGPUAS::LOCAL_ADDRESS: - // If properly aligned, if we split we might be able to use ds_read_b64. + SDValue Chain = Load->getChain(); + SDValue BasePtr = Load->getBasePtr(); + MachineMemOperand *MMO = Load->getMemOperand(); + + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, + BasePtr, MVT::i8, MMO); + + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), + NewLD.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); + } + + if (!MemVT.isVector()) + return SDValue(); + + assert(Op.getValueType().getVectorElementType() == MVT::i32 && + "Custom lowering for non-i32 vectors hasn't been implemented."); + + unsigned AS = Load->getAddressSpace(); + if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, + AS, Load->getAlignment())) { + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); + return DAG.getMergeValues(Ops, DL); + } + + unsigned NumElements = MemVT.getVectorNumElements(); + switch (AS) { + case AMDGPUAS::CONSTANT_ADDRESS: + if (isMemOpUniform(Load)) + return SDValue(); + // Non-uniform loads will be selected to MUBUF instructions, so they + // have the same legalization requires ments as global and private + // loads. 
+ // + // Fall-through + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::FLAT_ADDRESS: + if (NumElements > 4) + return SplitVectorLoad(Op, DAG); + // v4 loads are supported for private and global memory. + return SDValue(); + case AMDGPUAS::PRIVATE_ADDRESS: { + // Depending on the setting of the private_element_size field in the + // resource descriptor, we can only make private accesses up to a certain + // size. + switch (Subtarget->getMaxPrivateElementSize()) { + case 4: + return scalarizeVectorLoad(Load, DAG); + case 8: + if (NumElements > 2) + return SplitVectorLoad(Op, DAG); + return SDValue(); + case 16: + // Same as global/flat + if (NumElements > 4) return SplitVectorLoad(Op, DAG); + return SDValue(); + default: + llvm_unreachable("unsupported private_element_size"); } } + case AMDGPUAS::LOCAL_ADDRESS: { + if (NumElements > 2) + return SplitVectorLoad(Op, DAG); - return AMDGPUTargetLowering::LowerLOAD(Op, DAG); -} + if (NumElements == 2) + return SDValue(); -SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, - const SDValue &Op, - SelectionDAG &DAG) const { - return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4)); + // If properly aligned, if we split we might be able to use ds_read_b64. + return SplitVectorLoad(Op, DAG); + } + default: + return SDValue(); + } } SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { @@ -1514,7 +2092,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); - SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); + SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); } @@ -1547,7 +2125,9 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { } } - if (Unsafe) { + const SDNodeFlags *Flags = Op->getFlags(); + + if (Unsafe || Flags->hasAllowReciprocal()) { // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) SDNodeFlags Flags; @@ -1560,45 +2140,71 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { - SDValue FastLowered = LowerFastFDIV(Op, DAG); - if (FastLowered.getNode()) + if (SDValue FastLowered = LowerFastFDIV(Op, DAG)) return FastLowered; - // This uses v_rcp_f32 which does not handle denormals. Let this hit a - // selection error for now rather than do something incorrect. - if (Subtarget->hasFP32Denormals()) - return SDValue(); - SDLoc SL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag + if (EnableAMDGPUFastFDIV) { + // This does not support denormals. + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + + // TODO: Should this propagate fast-math-flags? 
- const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); - const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + // rcp does not support denormals. + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); + } + + // Generates more precise fpdiv32. const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); - SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS); + SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS); - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + // Denominator is scaled to not be denormal, so using rcp is ok. + SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled); - // TODO: Should this propagate fast-math-flags? + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled); - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One); + SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp); - SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled); + SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul); + SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled); - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); + SDValue Scale = NumeratorScaled.getValue(1); + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale); + + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); } SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { @@ -1635,7 +2241,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { SDValue Scale; - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { // Workaround a hardware bug on SI where the condition output from div_scale // is not usable. @@ -1685,26 +2291,57 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *Store = cast<StoreSDNode>(Op); EVT VT = Store->getMemoryVT(); - // These stores are legal. 
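Aside on the -amdgpu-fast-fdiv path above: the fabs / setcc against 0x6f800000 / select of 0x2f800000 or 1.0 / rcp sequence is a guarded rescaling of the denominator so that the approximate hardware reciprocal never sees an operand so large that its reciprocal flushes to zero. A minimal scalar sketch of the same arithmetic (bitsToFloat and fastFDiv32 are illustrative names, and 1.0f / x stands in for v_rcp_f32, which is approximate and ignores denormals):

#include <cmath>
#include <cstdint>
#include <cstring>

static float bitsToFloat(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

float fastFDiv32(float Num, float Den) {
  const float K0 = bitsToFloat(0x6f800000u); // 2^96
  const float K1 = bitsToFloat(0x2f800000u); // 2^-32

  // If |Den| is very large, pre-scale it by 2^-32 so the reciprocal does not
  // underflow, then scale the quotient by 2^-32 again to compensate:
  //   Num * rcp(Den * 2^-32) * 2^-32 == Num / Den
  float Scale = (std::fabs(Den) > K0) ? K1 : 1.0f;
  float Rcp = 1.0f / (Den * Scale); // stands in for v_rcp_f32
  return Scale * (Num * Rcp);
}

The default expansion (div_scale, fma refinement, div_fmas, div_fixup) avoids the need for this trick: per its own comments, div_scale moves the operands out of the denormal range before the reciprocal, and div_fixup patches up the special cases at the end.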
- if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { - if (VT.isVector() && VT.getVectorNumElements() > 4) - return ScalarizeVectorStore(Op, DAG); - return SDValue(); + if (VT == MVT::i1) { + return DAG.getTruncStore(Store->getChain(), DL, + DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), + Store->getBasePtr(), MVT::i1, Store->getMemOperand()); } - SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Ret.getNode()) - return Ret; + assert(VT.isVector() && + Store->getValue().getValueType().getScalarType() == MVT::i32); - if (VT.isVector() && VT.getVectorNumElements() >= 8) + unsigned AS = Store->getAddressSpace(); + if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + AS, Store->getAlignment())) { + return expandUnalignedStore(Store, DAG); + } + + unsigned NumElements = VT.getVectorNumElements(); + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::FLAT_ADDRESS: + if (NumElements > 4) + return SplitVectorStore(Op, DAG); + return SDValue(); + case AMDGPUAS::PRIVATE_ADDRESS: { + switch (Subtarget->getMaxPrivateElementSize()) { + case 4: + return scalarizeVectorStore(Store, DAG); + case 8: + if (NumElements > 2) + return SplitVectorStore(Op, DAG); + return SDValue(); + case 16: + if (NumElements > 4) + return SplitVectorStore(Op, DAG); + return SDValue(); + default: + llvm_unreachable("unsupported private_element_size"); + } + } + case AMDGPUAS::LOCAL_ADDRESS: { + if (NumElements > 2) return SplitVectorStore(Op, DAG); - if (VT == MVT::i1) - return DAG.getTruncStore(Store->getChain(), DL, - DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), - Store->getBasePtr(), MVT::i1, Store->getMemOperand()); + if (NumElements == 2) + return Op; - return SDValue(); + // If properly aligned, if we split we might be able to use ds_write_b64. + return SplitVectorStore(Op, DAG); + } + default: + llvm_unreachable("unhandled address space"); + } } SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { @@ -1727,6 +2364,33 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { } } +SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const { + AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op); + assert(AtomicNode->isCompareAndSwap()); + unsigned AS = AtomicNode->getAddressSpace(); + + // No custom lowering required for local address space + if (!isFlatGlobalAddrSpace(AS)) + return Op; + + // Non-local address space requires custom lowering for atomic compare + // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2 + SDLoc DL(Op); + SDValue ChainIn = Op.getOperand(0); + SDValue Addr = Op.getOperand(1); + SDValue Old = Op.getOperand(2); + SDValue New = Op.getOperand(3); + EVT VT = Op.getValueType(); + MVT SimpleVT = VT.getSimpleVT(); + MVT VecType = MVT::getVectorVT(SimpleVT, 2); + + SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old}); + SDValue Ops[] = { ChainIn, Addr, NewOld }; + + return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(), + Ops, VT, AtomicNode->getMemOperand()); +} + //===----------------------------------------------------------------------===// // Custom DAG optimizations //===----------------------------------------------------------------------===// @@ -1756,88 +2420,13 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, } } - // We are primarily trying to catch operations on illegal vector types - // before they are expanded. 
- // For scalars, we can use the more flexible method of checking masked bits - // after legalization. - if (!DCI.isBeforeLegalize() || - !SrcVT.isVector() || - SrcVT.getVectorElementType() != MVT::i8) { - return SDValue(); - } - - assert(DCI.isBeforeLegalize() && "Unexpected legal type"); - - // Weird sized vectors are a pain to handle, but we know 3 is really the same - // size as 4. - unsigned NElts = SrcVT.getVectorNumElements(); - if (!SrcVT.isSimple() && NElts != 3) - return SDValue(); - - // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to - // prevent a mess from expanding to v4i32 and repacking. - if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { - EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); - EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); - EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); - LoadSDNode *Load = cast<LoadSDNode>(Src); - - unsigned AS = Load->getAddressSpace(); - unsigned Align = Load->getAlignment(); - Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); - - // Don't try to replace the load if we have to expand it due to alignment - // problems. Otherwise we will end up scalarizing the load, and trying to - // repack into the vector for no real reason. - if (Align < ABIAlignment && - !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { - return SDValue(); - } - - SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, - Load->getChain(), - Load->getBasePtr(), - LoadVT, - Load->getMemOperand()); - - // Make sure successors of the original load stay after it by updating - // them to use the new Chain. - DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); - - SmallVector<SDValue, 4> Elts; - if (RegVT.isVector()) - DAG.ExtractVectorElements(NewLoad, Elts); - else - Elts.push_back(NewLoad); - - SmallVector<SDValue, 4> Ops; - - unsigned EltIdx = 0; - for (SDValue Elt : Elts) { - unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); - for (unsigned I = 0; I < ComponentsInElt; ++I) { - unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; - SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); - DCI.AddToWorklist(Cvt.getNode()); - Ops.push_back(Cvt); - } - - ++EltIdx; - } - - assert(Ops.size() == NElts); - - return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); - } - return SDValue(); } /// \brief Return true if the given offset Size in bytes can be folded into /// the immediate offsets of a memory instruction for the given address space. static bool canFoldOffset(unsigned OffsetSize, unsigned AS, - const AMDGPUSubtarget &STI) { + const SISubtarget &STI) { switch (AS) { case AMDGPUAS::GLOBAL_ADDRESS: { // MUBUF instructions a 12-bit offset in bytes. @@ -1846,7 +2435,7 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS, case AMDGPUAS::CONSTANT_ADDRESS: { // SMRD instructions have an 8-bit offset in dwords on SI and // a 20-bit offset in bytes on VI. - if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) return isUInt<20>(OffsetSize); else return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); @@ -1897,7 +2486,7 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, // If the resulting offset is too large, we can't fold it into the addressing // mode offset. 
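The canFoldOffset rule quoted above, which the shl-pointer combine that follows uses to decide whether a folded constant still fits in the addressing mode, reduces to a simple range test on the byte offset. Restated as standalone arithmetic (a sketch; smrdOffsetFits is an illustrative name, not an LLVM helper, and isUInt<N>(x) just means x < 2^N):

#include <cstdint>

// Mirrors the CONSTANT_ADDRESS (SMRD) rule quoted above.
bool smrdOffsetFits(uint64_t OffsetBytes, bool IsVolcanicIslands) {
  if (IsVolcanicIslands)
    return OffsetBytes < (1ull << 20);      // VI: 20-bit byte offset
  return (OffsetBytes % 4 == 0) &&
         (OffsetBytes / 4) < (1ull << 8);   // SI: 8-bit dword offset
}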
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); - if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) + if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget())) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -1915,6 +2504,9 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, if (DCI.isBeforeLegalize()) return SDValue(); + if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI)) + return Base; + SelectionDAG &DAG = DCI.DAG; // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> @@ -1970,6 +2562,36 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); + EVT VT = N->getValueType(0); + if (VT == MVT::i64) { + // TODO: This could be a generic combine with a predicate for extracting the + // high half of an integer being free. + + // (or i64:x, (zero_extend i32:y)) -> + // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x))) + if (LHS.getOpcode() == ISD::ZERO_EXTEND && + RHS.getOpcode() != ISD::ZERO_EXTEND) + std::swap(LHS, RHS); + + if (RHS.getOpcode() == ISD::ZERO_EXTEND) { + SDValue ExtSrc = RHS.getOperand(0); + EVT SrcVT = ExtSrc.getValueType(); + if (SrcVT == MVT::i32) { + SDLoc SL(N); + SDValue LowLHS, HiBits; + std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG); + SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc); + + DCI.AddToWorklist(LowOr.getNode()); + DCI.AddToWorklist(HiBits.getNode()); + + SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, + LowOr, HiBits); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); + } + } + } + // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && RHS.getOpcode() == AMDGPUISD::FP_CLASS) { @@ -2005,9 +2627,52 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, return DAG.getConstant(0, SDLoc(N), MVT::i1); } + if (N->getOperand(0).isUndef()) + return DAG.getUNDEF(MVT::i1); + return SDValue(); } +// Constant fold canonicalize. +SDValue SITargetLowering::performFCanonicalizeCombine( + SDNode *N, + DAGCombinerInfo &DCI) const { + ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); + if (!CFP) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + const APFloat &C = CFP->getValueAPF(); + + // Flush denormals to 0 if not enabled. + if (C.isDenormal()) { + EVT VT = N->getValueType(0); + if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) + return DAG.getConstantFP(0.0, SDLoc(N), VT); + + if (VT == MVT::f64 && !Subtarget->hasFP64Denormals()) + return DAG.getConstantFP(0.0, SDLoc(N), VT); + } + + if (C.isNaN()) { + EVT VT = N->getValueType(0); + APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics()); + if (C.isSignaling()) { + // Quiet a signaling NaN. + return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); + } + + // Make sure it is the canonical NaN bitpattern. + // + // TODO: Can we use -1 as the canonical NaN value since it's an inline + // immediate? 
+ if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt()) + return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); + } + + return SDValue(CFP, 0); +} + static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { switch (Opc) { case ISD::FMAXNUM: @@ -2027,8 +2692,64 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { } } -SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, - DAGCombinerInfo &DCI) const { +static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1, bool Signed) { + ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1); + if (!K1) + return SDValue(); + + ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1)); + if (!K0) + return SDValue(); + + if (Signed) { + if (K0->getAPIntValue().sge(K1->getAPIntValue())) + return SDValue(); + } else { + if (K0->getAPIntValue().uge(K1->getAPIntValue())) + return SDValue(); + } + + EVT VT = K0->getValueType(0); + return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, + Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); +} + +static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { + if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) + return true; + + return DAG.isKnownNeverNaN(Op); +} + +static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1) { + ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1); + if (!K1) + return SDValue(); + + ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1)); + if (!K0) + return SDValue(); + + // Ordered >= (although NaN inputs should have folded away by now). + APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF()); + if (Cmp == APFloat::cmpGreaterThan) + return SDValue(); + + // This isn't safe with signaling NaNs because in IEEE mode, min/max on a + // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then + // give the other result, which is different from med3 with a NaN input. + SDValue Var = Op0.getOperand(0); + if (!isKnownNeverSNan(DAG, Var)) + return SDValue(); + + return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), + Var, SDValue(K0, 0), SDValue(K1, 0)); +} + +SDValue SITargetLowering::performMinMaxCombine(SDNode *N, + DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; unsigned Opc = N->getOpcode(); @@ -2038,26 +2759,51 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, // Only do this if the inner op has one use since this will just increases // register pressure for no benefit. - // max(max(a, b), c) - if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0.getOperand(0), - Op0.getOperand(1), - Op1); + if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) { + // max(max(a, b), c) -> max3(a, b, c) + // min(min(a, b), c) -> min3(a, b, c) + if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0.getOperand(0), + Op0.getOperand(1), + Op1); + } + + // Try commuted. 
+ // max(a, max(b, c)) -> max3(a, b, c) + // min(a, min(b, c)) -> min3(a, b, c) + if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0, + Op1.getOperand(0), + Op1.getOperand(1)); + } } - // max(a, max(b, c)) - if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0, - Op1.getOperand(0), - Op1.getOperand(1)); + // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) + if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true)) + return Med3; + } + + if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false)) + return Med3; + } + + // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) + if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || + (Opc == AMDGPUISD::FMIN_LEGACY && + Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && + N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { + if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) + return Res; } return SDValue(); @@ -2104,16 +2850,18 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); - case ISD::FMAXNUM: // TODO: What about fmax_legacy? + case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: - case ISD::UMIN: { + case ISD::UMIN: + case AMDGPUISD::FMIN_LEGACY: + case AMDGPUISD::FMAX_LEGACY: { if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && N->getValueType(0) != MVT::f64 && getTargetMachine().getOptLevel() > CodeGenOpt::None) - return performMin3Max3Combine(N, DCI); + return performMinMaxCombine(N, DCI); break; } @@ -2122,8 +2870,23 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::CVT_F32_UBYTE2: case AMDGPUISD::CVT_F32_UBYTE3: { unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; - SDValue Src = N->getOperand(0); + + // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero. + if (Src.getOpcode() == ISD::SRL) { + // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x + // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x + // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x + + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) { + unsigned SrcOffset = C->getZExtValue() + 8 * Offset; + if (SrcOffset < 32 && SrcOffset % 8 == 0) { + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL, + MVT::f32, Src.getOperand(0)); + } + } + } + APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); APInt KnownZero, KnownOne; @@ -2238,7 +3001,9 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. + case ISD::ATOMIC_LOAD_UMAX: + case AMDGPUISD::ATOMIC_INC: + case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics. 
if (DCI.isBeforeLegalize()) break; @@ -2264,6 +3029,19 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performOrCombine(N, DCI); case AMDGPUISD::FP_CLASS: return performClassCombine(N, DCI); + case ISD::FCANONICALIZE: + return performFCanonicalizeCombine(N, DCI); + case AMDGPUISD::FRACT: + case AMDGPUISD::RCP: + case AMDGPUISD::RSQ: + case AMDGPUISD::RSQ_LEGACY: + case AMDGPUISD::RSQ_CLAMP: + case AMDGPUISD::LDEXP: { + SDValue Src = N->getOperand(0); + if (Src.isUndef()) + return Src; + break; + } } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } @@ -2273,9 +3051,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, /// Returns -1 if it isn't an immediate, 0 if it's and inline immediate /// and the immediate value if it's a literal immediate int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { - - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { if (TII->isInlineConstant(Node->getAPIntValue())) @@ -2314,7 +3090,8 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, SelectionDAG &DAG) const { SDNode *Users[4] = { }; unsigned Lane = 0; - unsigned OldDmask = Node->getConstantOperandVal(0); + unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3; + unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); unsigned NewDmask = 0; // Try to figure out the used register components @@ -2354,8 +3131,9 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Adjust the writemask in the node std::vector<SDValue> Ops; + Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx); Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); - Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); + Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end()); Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); // If we only got one lane, replace it with a copy @@ -2421,14 +3199,15 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, /// \brief Fold the instructions after selecting them. 
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + unsigned Opcode = Node->getMachineOpcode(); - if (TII->isMIMG(Node->getMachineOpcode())) + if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && + !TII->isGather4(Opcode)) adjustWritemask(Node, DAG); - if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || - Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { + if (Opcode == AMDGPU::INSERT_SUBREG || + Opcode == AMDGPU::REG_SEQUENCE) { legalizeTargetIndependentNode(Node, DAG); return Node; } @@ -2437,22 +3216,22 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, /// \brief Assign the register class depending on the number of /// bits set in the writemask -void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, +void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - if (TII->isVOP3(MI->getOpcode())) { + if (TII->isVOP3(MI.getOpcode())) { // Make sure constant bus requirements are respected. TII->legalizeOperandsVOP3(MRI, MI); return; } - if (TII->isMIMG(*MI)) { - unsigned VReg = MI->getOperand(0).getReg(); - unsigned Writemask = MI->getOperand(1).getImm(); + if (TII->isMIMG(MI)) { + unsigned VReg = MI.getOperand(0).getReg(); + unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4; + unsigned Writemask = MI.getOperand(DmaskIdx).getImm(); unsigned BitsSet = 0; for (unsigned i = 0; i < 4; ++i) BitsSet += Writemask & (1 << i) ? 1 : 0; @@ -2465,34 +3244,58 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, case 3: RC = &AMDGPU::VReg_96RegClass; break; } - unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); - MI->setDesc(TII->get(NewOpcode)); + unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet); + MI.setDesc(TII->get(NewOpcode)); MRI.setRegClass(VReg, RC); return; } // Replace unused atomics with the no return version. - int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); + int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode()); if (NoRetAtomicOp != -1) { if (!Node->hasAnyUseOfValue(0)) { - MI->setDesc(TII->get(NoRetAtomicOp)); - MI->RemoveOperand(0); + MI.setDesc(TII->get(NoRetAtomicOp)); + MI.RemoveOperand(0); + return; } + // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg + // instruction, because the return type of these instructions is a vec2 of + // the memory type, so it can be tied to the input operand. + // This means these instructions always have a use, so we need to add a + // special case to check if the atomic has only one extract_subreg use, + // which itself has no uses. + if ((Node->hasNUsesOfValue(1, 0) && + Node->use_begin()->isMachineOpcode() && + Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG && + !Node->use_begin()->hasAnyUseOfValue(0))) { + unsigned Def = MI.getOperand(0).getReg(); + + // Change this into a noret atomic. 
+ MI.setDesc(TII->get(NoRetAtomicOp)); + MI.RemoveOperand(0); + + // If we only remove the def operand from the atomic instruction, the + // extract_subreg will be left with a use of a vreg without a def. + // So we need to insert an implicit_def to avoid machine verifier + // errors. + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(AMDGPU::IMPLICIT_DEF), Def); + } return; } } -static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { +static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, + uint64_t Val) { SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); } MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, - SDLoc DL, + const SDLoc &DL, SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); // Build the half of the subregister with the constants before building the // full 128-bit register. If we are building multiple resource descriptors, @@ -2524,10 +3327,8 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] /// of the resource descriptor) to create an offset, which is added to /// the resource pointer. -MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr, - uint32_t RsrcDword1, +MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, + SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const { SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); |
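A closing note on the smed3/umed3 combines introduced earlier in this change: rewriting min(max(x, K0), K1) into med3(x, K0, K1) is only an identity when K0 <= K1, which is exactly the ordering performIntMed3ImmCombine checks before forming the node. A reference sketch of that identity (med3 and clampViaMinMax are illustrative helpers, not the hardware instruction or an LLVM API):

#include <algorithm>
#include <cassert>

// Median of three, written the classic min/max way.
int med3(int a, int b, int c) {
  return std::max(std::min(a, b), std::min(std::max(a, b), c));
}

// Clamp x into [K0, K1]; equal to med3(x, K0, K1) whenever K0 <= K1.
int clampViaMinMax(int x, int K0, int K1) {
  assert(K0 <= K1);
  return std::min(std::max(x, K0), K1);
}

The floating-point variant additionally needs the isKnownNeverSNan guard because, as the comment in performFPMed3ImmCombine notes, IEEE min/max quiet a signaling NaN while med3 with a NaN input would give a different result.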