Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.h | 57
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.td | 417
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 3
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 122
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 21
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 469
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 28
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 42
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCallLowering.h | 36
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCallingConv.td | 8
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 82
-rw-r--r--  lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp | 26
-rw-r--r--  lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h | 48
-rw-r--r--  lib/Target/AMDGPU/AMDGPUFrameLowering.cpp | 32
-rw-r--r--  lib/Target/AMDGPU/AMDGPUFrameLowering.h | 10
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 793
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1484
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.h | 78
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 307
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.h | 143
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.td | 27
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstructions.td | 263
-rw-r--r--  lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp | 38
-rw-r--r--  lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h | 6
-rw-r--r--  lib/Target/AMDGPU/AMDGPUIntrinsics.td | 75
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 57
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMCInstLower.h | 4
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 17
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMachineFunction.h | 34
-rw-r--r--  lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp | 1
-rw-r--r--  lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 667
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterInfo.cpp | 19
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterInfo.h | 21
-rw-r--r--  lib/Target/AMDGPU/AMDGPURuntimeMetadata.h | 138
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 227
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.h | 389
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 370
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.h | 61
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp | 56
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetObjectFile.h | 18
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 163
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 49
-rw-r--r--  lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 201
-rw-r--r--  lib/Target/AMDGPU/AMDKernelCodeT.h | 31
-rw-r--r--  lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 2505
-rw-r--r--  lib/Target/AMDGPU/AsmParser/CMakeLists.txt | 2
-rw-r--r--  lib/Target/AMDGPU/AsmParser/Makefile | 15
-rw-r--r--  lib/Target/AMDGPU/CIInstructions.td | 148
-rw-r--r--  lib/Target/AMDGPU/CMakeLists.txt | 25
-rw-r--r--  lib/Target/AMDGPU/CaymanInstructions.td | 47
-rw-r--r--  lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 437
-rw-r--r--  lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 93
-rw-r--r--  lib/Target/AMDGPU/Disassembler/CMakeLists.txt | 7
-rw-r--r--  lib/Target/AMDGPU/Disassembler/LLVMBuild.txt | 23
-rw-r--r--  lib/Target/AMDGPU/EvergreenInstructions.td | 67
-rw-r--r--  lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 264
-rw-r--r--  lib/Target/AMDGPU/GCNHazardRecognizer.h | 62
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 385
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h | 38
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/CMakeLists.txt | 2
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/Makefile | 15
-rw-r--r--  lib/Target/AMDGPU/LLVMBuild.txt | 7
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 48
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 55
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp | 5
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h | 3
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h | 7
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 5
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h | 4
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 4
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 11
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 7
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 4
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 8
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/Makefile | 16
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 26
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 42
-rw-r--r--  lib/Target/AMDGPU/Makefile | 23
-rw-r--r--  lib/Target/AMDGPU/Processors.td | 23
-rw-r--r--  lib/Target/AMDGPU/R600ClauseMergePass.cpp | 121
-rw-r--r--  lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 158
-rw-r--r--  lib/Target/AMDGPU/R600Defines.h | 4
-rw-r--r--  lib/Target/AMDGPU/R600EmitClauseMarkers.cpp | 65
-rw-r--r--  lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 30
-rw-r--r--  lib/Target/AMDGPU/R600FrameLowering.cpp | 15
-rw-r--r--  lib/Target/AMDGPU/R600FrameLowering.h | 30
-rw-r--r--  lib/Target/AMDGPU/R600ISelLowering.cpp | 1006
-rw-r--r--  lib/Target/AMDGPU/R600ISelLowering.h | 53
-rw-r--r--  lib/Target/AMDGPU/R600InstrInfo.cpp | 509
-rw-r--r--  lib/Target/AMDGPU/R600InstrInfo.h | 199
-rw-r--r--  lib/Target/AMDGPU/R600Instructions.td | 124
-rw-r--r--  lib/Target/AMDGPU/R600Intrinsics.td | 114
-rw-r--r--  lib/Target/AMDGPU/R600MachineFunctionInfo.h | 7
-rw-r--r--  lib/Target/AMDGPU/R600MachineScheduler.cpp | 136
-rw-r--r--  lib/Target/AMDGPU/R600MachineScheduler.h | 11
-rw-r--r--  lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 40
-rw-r--r--  lib/Target/AMDGPU/R600Packetizer.cpp | 103
-rw-r--r--  lib/Target/AMDGPU/R600RegisterInfo.cpp | 11
-rw-r--r--  lib/Target/AMDGPU/R600RegisterInfo.h | 15
-rw-r--r--  lib/Target/AMDGPU/R600Schedule.td | 2
-rw-r--r--  lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp | 303
-rw-r--r--  lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 88
-rw-r--r--  lib/Target/AMDGPU/SIDebuggerInsertNops.cpp | 96
-rw-r--r--  lib/Target/AMDGPU/SIDefines.h | 148
-rw-r--r--  lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 9
-rw-r--r--  lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp | 219
-rw-r--r--  lib/Target/AMDGPU/SIFoldOperands.cpp | 28
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.cpp | 144
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.h | 6
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp | 1725
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.h | 74
-rw-r--r--  lib/Target/AMDGPU/SIInsertWaits.cpp | 198
-rw-r--r--  lib/Target/AMDGPU/SIInstrFormats.td | 126
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.cpp | 1736
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.h | 254
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.td | 1852
-rw-r--r--  lib/Target/AMDGPU/SIInstructions.td | 1298
-rw-r--r--  lib/Target/AMDGPU/SIIntrinsics.td | 32
-rw-r--r--  lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 50
-rw-r--r--  lib/Target/AMDGPU/SILowerControlFlow.cpp | 645
-rw-r--r--  lib/Target/AMDGPU/SILowerI1Copies.cpp | 17
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 104
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.h | 129
-rw-r--r--  lib/Target/AMDGPU/SIMachineScheduler.cpp | 168
-rw-r--r--  lib/Target/AMDGPU/SIMachineScheduler.h | 10
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.cpp | 604
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.h | 69
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.td | 122
-rw-r--r--  lib/Target/AMDGPU/SISchedule.td | 63
-rw-r--r--  lib/Target/AMDGPU/SIShrinkInstructions.cpp | 152
-rw-r--r--  lib/Target/AMDGPU/SITypeRewriter.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/SIWholeQuadMode.cpp | 509
-rw-r--r--  lib/Target/AMDGPU/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 69
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h | 31
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 28
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 7
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h | 165
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp | 166
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h | 39
-rw-r--r--  lib/Target/AMDGPU/Utils/CMakeLists.txt | 2
-rw-r--r--  lib/Target/AMDGPU/Utils/Makefile | 16
-rw-r--r--  lib/Target/AMDGPU/VIInstrFormats.td | 143
-rw-r--r--  lib/Target/AMDGPU/VIInstructions.td | 50
145 files changed, 16770 insertions, 9267 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 4f718e1ca310..7e59710a427a 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -8,8 +8,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H
-#define LLVM_LIB_TARGET_R600_AMDGPU_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
@@ -29,7 +29,6 @@ class TargetMachine;
// R600 Passes
FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
-FunctionPass *createR600TextureIntrinsicsReplacer();
FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
FunctionPass *createR600EmitClauseMarkers();
FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
@@ -44,12 +43,14 @@ FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
-FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
+FunctionPass *createSIWholeQuadModePass();
+FunctionPass *createSILowerControlFlowPass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIFixSGPRCopiesPass();
-FunctionPass *createSIFixSGPRLiveRangesPass();
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
-FunctionPass *createSIInsertWaits(TargetMachine &tm);
+FunctionPass *createSIDebuggerInsertNopsPass();
+FunctionPass *createSIInsertWaitsPass();
+FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr);
ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C);
@@ -60,6 +61,9 @@ extern char &AMDGPUAnnotateKernelFeaturesID;
void initializeSIFoldOperandsPass(PassRegistry &);
extern char &SIFoldOperandsID;
+void initializeSIShrinkInstructionsPass(PassRegistry&);
+extern char &SIShrinkInstructionsID;
+
void initializeSIFixSGPRCopiesPass(PassRegistry &);
extern char &SIFixSGPRCopiesID;
@@ -69,8 +73,19 @@ extern char &SILowerI1CopiesID;
void initializeSILoadStoreOptimizerPass(PassRegistry &);
extern char &SILoadStoreOptimizerID;
+void initializeSIWholeQuadModePass(PassRegistry &);
+extern char &SIWholeQuadModeID;
+
+void initializeSILowerControlFlowPass(PassRegistry &);
+extern char &SILowerControlFlowPassID;
+
+
// Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
+FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
+void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
+extern char &AMDGPUPromoteAllocaID;
+
+FunctionPass *createAMDGPUAddDivergenceMetadata(const AMDGPUSubtarget &ST);
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
ModulePass *createAMDGPUAlwaysInlinePass();
@@ -80,12 +95,21 @@ FunctionPass *createAMDGPUAnnotateUniformValues();
void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
extern char &SIFixControlFlowLiveIntervalsID;
-void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
-extern char &SIFixSGPRLiveRangesID;
-
void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
extern char &AMDGPUAnnotateUniformValuesPassID;
+void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
+extern char &AMDGPUCodeGenPrepareID;
+
+void initializeSIAnnotateControlFlowPass(PassRegistry&);
+extern char &SIAnnotateControlFlowPassID;
+
+void initializeSIDebuggerInsertNopsPass(PassRegistry&);
+extern char &SIDebuggerInsertNopsID;
+
+void initializeSIInsertWaitsPass(PassRegistry&);
+extern char &SIInsertWaitsID;
+
extern Target TheAMDGPUTarget;
extern Target TheGCNTarget;
@@ -101,15 +125,6 @@ enum TargetIndex {
} // End namespace llvm
-namespace ShaderType {
- enum Type {
- PIXEL = 0,
- VERTEX = 1,
- GEOMETRY = 2,
- COMPUTE = 3
- };
-}
-
/// OpenCL uses address spaces to differentiate between
/// various memory regions on the hardware. On the CPU
/// all of the address spaces point to the same memory,
@@ -120,7 +135,7 @@ namespace AMDGPUAS {
enum AddressSpaces : unsigned {
PRIVATE_ADDRESS = 0, ///< Address space for private memory.
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
- CONSTANT_ADDRESS = 2, ///< Address space for constant memory
+ CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
LOCAL_ADDRESS = 3, ///< Address space for local memory.
FLAT_ADDRESS = 4, ///< Address space for flat memory.
REGION_ADDRESS = 5, ///< Address space for region memory.
@@ -148,8 +163,6 @@ enum AddressSpaces : unsigned {
CONSTANT_BUFFER_13 = 21,
CONSTANT_BUFFER_14 = 22,
CONSTANT_BUFFER_15 = 23,
- ADDRESS_NONE = 24, ///< Address space for unknown memory.
- LAST_ADDRESS = ADDRESS_NONE,
// Some places use this if the address space can't be determined.
UNKNOWN_ADDRESS_SPACE = ~0u
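
[editor's note] The AMDGPU.h hunk above drops the ShaderType enum and the unused ADDRESS_NONE/LAST_ADDRESS enumerators while leaving the remaining address-space values unchanged. A minimal, self-contained C++ sketch of how target code typically branches on these enumerators (the isSharedMemory helper is hypothetical, not part of this commit):

namespace AMDGPUAS {
enum AddressSpaces : unsigned {
  PRIVATE_ADDRESS = 0,  // per-thread scratch
  GLOBAL_ADDRESS = 1,   // global memory (RAT0, VTX0)
  CONSTANT_ADDRESS = 2, // constant memory (VTX2)
  LOCAL_ADDRESS = 3,    // LDS
  FLAT_ADDRESS = 4,     // generic/flat
  REGION_ADDRESS = 5,   // GDS
  UNKNOWN_ADDRESS_SPACE = ~0u
};
} // namespace AMDGPUAS

// On-chip shared memory needs different lowering from global memory, so
// passes frequently test the address space of a pointer like this.
static bool isSharedMemory(unsigned AS) {
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}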
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 844d89c737bf..72c455354411 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -1,182 +1,121 @@
-//===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===//
+//===-- AMDGPU.td - AMDGPU Tablegen files --------*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
include "llvm/Target/Target.td"
-//===----------------------------------------------------------------------===//
-// Subtarget Features
-//===----------------------------------------------------------------------===//
-
-// Debugging Features
-
-def FeatureDumpCode : SubtargetFeature <"DumpCode",
- "DumpCode",
- "true",
- "Dump MachineInstrs in the CodeEmitter">;
-
-def FeatureDumpCodeLower : SubtargetFeature <"dumpcode",
- "DumpCode",
- "true",
- "Dump MachineInstrs in the CodeEmitter">;
-
-def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer",
- "EnableIRStructurizer",
- "false",
- "Disable IR Structurizer">;
-
-def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
- "EnablePromoteAlloca",
- "true",
- "Enable promote alloca pass">;
-
-// Target features
-
-def FeatureIfCvt : SubtargetFeature <"disable-ifcvt",
- "EnableIfCvt",
- "false",
- "Disable the if conversion pass">;
+//===------------------------------------------------------------===//
+// Subtarget Features (device properties)
+//===------------------------------------------------------------===//
def FeatureFP64 : SubtargetFeature<"fp64",
- "FP64",
- "true",
- "Enable double precision operations">;
-
-def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
- "FP64Denormals",
- "true",
- "Enable double precision denormal handling",
- [FeatureFP64]>;
+ "FP64",
+ "true",
+ "Enable double precision operations"
+>;
def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
- "FastFMAF32",
- "true",
- "Assuming f32 fma is at least as fast as mul + add",
- []>;
-
-// Some instructions do not support denormals despite this flag. Using
-// fp32 denormals also causes instructions to run at the double
-// precision rate for the device.
-def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
- "FP32Denormals",
- "true",
- "Enable single precision denormal handling">;
+ "FastFMAF32",
+ "true",
+ "Assuming f32 fma is at least as fast as mul + add"
+>;
-def Feature64BitPtr : SubtargetFeature<"64BitPtr",
- "Is64bit",
- "true",
- "Specify if 64-bit addressing should be used">;
+def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops",
+ "HalfRate64Ops",
+ "true",
+ "Most fp64 instructions are half rate instead of quarter"
+>;
def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
- "R600ALUInst",
- "false",
- "Older version of ALU instructions encoding">;
+ "R600ALUInst",
+ "false",
+ "Older version of ALU instructions encoding"
+>;
def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
- "HasVertexCache",
- "true",
- "Specify use of dedicated vertex cache">;
+ "HasVertexCache",
+ "true",
+ "Specify use of dedicated vertex cache"
+>;
def FeatureCaymanISA : SubtargetFeature<"caymanISA",
- "CaymanISA",
- "true",
- "Use Cayman ISA">;
+ "CaymanISA",
+ "true",
+ "Use Cayman ISA"
+>;
def FeatureCFALUBug : SubtargetFeature<"cfalubug",
- "CFALUBug",
- "true",
- "GPU has CF_ALU bug">;
-
-// XXX - This should probably be removed once enabled by default
-def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
- "EnableLoadStoreOpt",
- "true",
- "Enable SI load/store optimizer pass">;
-
-// Performance debugging feature. Allow using DS instruction immediate
-// offsets even if the base pointer can't be proven to be base. On SI,
-// base pointer values that won't give the same result as a 16-bit add
-// are not safe to fold, but this will override the conservative test
-// for the base pointer.
-def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-folding",
- "EnableUnsafeDSOffsetFolding",
- "true",
- "Force using DS instruction immediate offsets on SI">;
-
-def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
- "FlatForGlobal",
- "true",
- "Force to generate flat instruction for global">;
+ "CFALUBug",
+ "true",
+ "GPU has CF_ALU bug"
+>;
def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
- "FlatAddressSpace",
- "true",
- "Support flat address space">;
+ "FlatAddressSpace",
+ "true",
+ "Support flat address space"
+>;
-def FeatureXNACK : SubtargetFeature<"xnack",
- "EnableXNACK",
- "true",
- "Enable XNACK support">;
+def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
+ "UnalignedBufferAccess",
+ "true",
+ "Support unaligned global loads and stores"
+>;
-def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
- "EnableVGPRSpilling",
- "true",
- "Enable spilling of VGPRs to scratch memory">;
+def FeatureXNACK : SubtargetFeature<"xnack",
+ "EnableXNACK",
+ "true",
+ "Enable XNACK support"
+>;
def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
- "SGPRInitBug",
- "true",
- "VI SGPR initilization bug requiring a fixed SGPR allocation size">;
-
-def FeatureEnableHugeScratchBuffer : SubtargetFeature<"huge-scratch-buffer",
- "EnableHugeScratchBuffer",
- "true",
- "Enable scratch buffer sizes greater than 128 GB">;
-
-def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
- "EnableSIScheduler",
- "true",
- "Enable SI Machine Scheduler">;
+ "SGPRInitBug",
+ "true",
+ "VI SGPR initilization bug requiring a fixed SGPR allocation size"
+>;
class SubtargetFeatureFetchLimit <string Value> :
SubtargetFeature <"fetch"#Value,
- "TexVTXClauseSize",
- Value,
- "Limit the maximum number of fetches in a clause to "#Value>;
+ "TexVTXClauseSize",
+ Value,
+ "Limit the maximum number of fetches in a clause to "#Value
+>;
def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
- "wavefrontsize"#Value,
- "WavefrontSize",
- !cast<string>(Value),
- "The number of threads per wavefront">;
+ "wavefrontsize"#Value,
+ "WavefrontSize",
+ !cast<string>(Value),
+ "The number of threads per wavefront"
+>;
def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
- "ldsbankcount"#Value,
- "LDSBankCount",
- !cast<string>(Value),
- "The number of LDS banks per compute unit.">;
+ "ldsbankcount"#Value,
+ "LDSBankCount",
+ !cast<string>(Value),
+ "The number of LDS banks per compute unit."
+>;
def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>;
class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping>
: SubtargetFeature <
- "isaver"#Major#"."#Minor#"."#Stepping,
- "IsaVersion",
- "ISAVersion"#Major#"_"#Minor#"_"#Stepping,
- "Instruction set version number"
+ "isaver"#Major#"."#Minor#"."#Stepping,
+ "IsaVersion",
+ "ISAVersion"#Major#"_"#Minor#"_"#Stepping,
+ "Instruction set version number"
>;
def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0>;
@@ -186,36 +125,145 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1>;
def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3>;
class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
- "localmemorysize"#Value,
- "LocalMemorySize",
- !cast<string>(Value),
- "The size of local memory in bytes">;
+ "localmemorysize"#Value,
+ "LocalMemorySize",
+ !cast<string>(Value),
+ "The size of local memory in bytes"
+>;
def FeatureGCN : SubtargetFeature<"gcn",
- "IsGCN",
- "true",
- "GCN or newer GPU">;
+ "IsGCN",
+ "true",
+ "GCN or newer GPU"
+>;
def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding",
- "GCN1Encoding",
- "true",
- "Encoding format for SI and CI">;
+ "GCN1Encoding",
+ "true",
+ "Encoding format for SI and CI"
+>;
def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding",
- "GCN3Encoding",
- "true",
- "Encoding format for VI">;
+ "GCN3Encoding",
+ "true",
+ "Encoding format for VI"
+>;
def FeatureCIInsts : SubtargetFeature<"ci-insts",
- "CIInsts",
- "true",
- "Additional intstructions for CI+">;
+ "CIInsts",
+ "true",
+ "Additional intstructions for CI+"
+>;
+
+def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime",
+ "HasSMemRealTime",
+ "true",
+ "Has s_memrealtime instruction"
+>;
+
+def Feature16BitInsts : SubtargetFeature<"16-bit-insts",
+ "Has16BitInsts",
+ "true",
+ "Has i16/f16 instructions"
+>;
+
+//===------------------------------------------------------------===//
+// Subtarget Features (options and debugging)
+//===------------------------------------------------------------===//
+
+// Some instructions do not support denormals despite this flag. Using
+// fp32 denormals also causes instructions to run at the double
+// precision rate for the device.
+def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
+ "FP32Denormals",
+ "true",
+ "Enable single precision denormal handling"
+>;
+
+def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
+ "FP64Denormals",
+ "true",
+ "Enable double precision denormal handling",
+ [FeatureFP64]
+>;
+
+def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
+ "FPExceptions",
+ "true",
+ "Enable floating point exceptions"
+>;
+
+class FeatureMaxPrivateElementSize<int size> : SubtargetFeature<
+ "max-private-element-size-"#size,
+ "MaxPrivateElementSize",
+ !cast<string>(size),
+ "Maximum private access size may be "#size
+>;
+
+def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
+def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
+def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
+
+def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
+ "EnableVGPRSpilling",
+ "true",
+ "Enable spilling of VGPRs to scratch memory"
+>;
+
+def FeatureDumpCode : SubtargetFeature <"DumpCode",
+ "DumpCode",
+ "true",
+ "Dump MachineInstrs in the CodeEmitter"
+>;
+
+def FeatureDumpCodeLower : SubtargetFeature <"dumpcode",
+ "DumpCode",
+ "true",
+ "Dump MachineInstrs in the CodeEmitter"
+>;
+
+def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
+ "EnablePromoteAlloca",
+ "true",
+ "Enable promote alloca pass"
+>;
+
+// XXX - This should probably be removed once enabled by default
+def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
+ "EnableLoadStoreOpt",
+ "true",
+ "Enable SI load/store optimizer pass"
+>;
+
+// Performance debugging feature. Allow using DS instruction immediate
+// offsets even if the base pointer can't be proven to be base. On SI,
+// base pointer values that won't give the same result as a 16-bit add
+// are not safe to fold, but this will override the conservative test
+// for the base pointer.
+def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <
+ "unsafe-ds-offset-folding",
+ "EnableUnsafeDSOffsetFolding",
+ "true",
+ "Force using DS instruction immediate offsets on SI"
+>;
+
+def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
+ "EnableSIScheduler",
+ "true",
+ "Enable SI Machine Scheduler"
+>;
+
+def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
+ "FlatForGlobal",
+ "true",
+ "Force to generate flat instruction for global"
+>;
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
- "FeatureDisable","true",
- "Dummy feature to disable assembler"
- " instructions">;
+ "FeatureDisable","true",
+ "Dummy feature to disable assembler instructions"
+>;
class SubtargetFeatureGeneration <string Value,
list<SubtargetFeature> Implies> :
@@ -227,33 +275,66 @@ def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
def FeatureR600 : SubtargetFeatureGeneration<"R600",
- [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>;
+ [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]
+>;
def FeatureR700 : SubtargetFeatureGeneration<"R700",
- [FeatureFetchLimit16, FeatureLocalMemorySize0]>;
+ [FeatureFetchLimit16, FeatureLocalMemorySize0]
+>;
def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
- [FeatureFetchLimit16, FeatureLocalMemorySize32768]>;
+ [FeatureFetchLimit16, FeatureLocalMemorySize32768]
+>;
def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
- [FeatureFetchLimit16, FeatureWavefrontSize64,
- FeatureLocalMemorySize32768]
+ [FeatureFetchLimit16, FeatureWavefrontSize64,
+ FeatureLocalMemorySize32768]
>;
def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
- [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768,
- FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding,
- FeatureLDSBankCount32]>;
+ [FeatureFP64, FeatureLocalMemorySize32768,
+ FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding,
+ FeatureLDSBankCount32]
+>;
def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
- [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
- FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
- FeatureGCN1Encoding, FeatureCIInsts]>;
+ [FeatureFP64, FeatureLocalMemorySize65536,
+ FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
+ FeatureGCN1Encoding, FeatureCIInsts]
+>;
def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
- [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
- FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
- FeatureGCN3Encoding, FeatureCIInsts]>;
+ [FeatureFP64, FeatureLocalMemorySize65536,
+ FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
+ FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
+ FeatureSMemRealTime
+ ]
+>;
+
+//===----------------------------------------------------------------------===//
+// Debugger related subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureDebuggerInsertNops : SubtargetFeature<
+ "amdgpu-debugger-insert-nops",
+ "DebuggerInsertNops",
+ "true",
+ "Insert one nop instruction for each high level source statement"
+>;
+
+def FeatureDebuggerReserveRegs : SubtargetFeature<
+ "amdgpu-debugger-reserve-regs",
+ "DebuggerReserveRegs",
+ "true",
+ "Reserve registers for debugger usage"
+>;
+
+def FeatureDebuggerEmitPrologue : SubtargetFeature<
+ "amdgpu-debugger-emit-prologue",
+ "DebuggerEmitPrologue",
+ "true",
+ "Emit debugger prologue"
+>;
//===----------------------------------------------------------------------===//
@@ -283,6 +364,7 @@ def NullALU : InstrItinClass;
//===----------------------------------------------------------------------===//
def TruePredicate : Predicate<"true">;
+
def isSICI : Predicate<
"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
@@ -292,6 +374,13 @@ def isVI : Predicate <
"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
AssemblerPredicate<"FeatureGCN3Encoding">;
+def isCIVI : Predicate <
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
+ "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
+>, AssemblerPredicate<"FeatureCIInsts">;
+
+def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
+
class PredicateControl {
Predicate SubtargetPredicate;
Predicate SIAssemblerPredicate = isSICI;
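
[editor's note] Each SubtargetFeature in the .td changes above becomes a boolean member of the TableGen-generated subtarget, keyed by the quoted feature string, plus a C++ query that backs predicates such as HasFlatAddressSpace. A rough sketch of that generated pattern, with getter names assumed from the predicate strings rather than taken from this commit:

// Sketch only: TableGen emits the real fields from the definitions above.
class SubtargetSketch {
  bool FlatAddressSpace = false; // toggled by "+flat-address-space"
  bool Has16BitInsts = false;    // toggled by "+16-bit-insts"
  bool HasSMemRealTime = false;  // toggled by "+s-memrealtime"

public:
  // Queries of this shape back the Predicate definitions, e.g.
  // Predicate<"Subtarget->hasFlatAddressSpace()">.
  bool hasFlatAddressSpace() const { return FlatAddressSpace; }
  bool has16BitInsts() const { return Has16BitInsts; }
  bool hasSMemRealTime() const { return HasSMemRealTime; }
};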
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index ad267d350850..63f5fb3cdf00 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -45,9 +45,8 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
for (Function *F : FuncsToClone) {
ValueToValueMapTy VMap;
- Function *NewFunc = CloneFunction(F, VMap, false);
+ Function *NewFunc = CloneFunction(F, VMap);
NewFunc->setLinkage(GlobalValue::InternalLinkage);
- M.getFunctionList().push_back(NewFunc);
F->replaceAllUsesWith(NewFunc);
}
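
[editor's note] The two-line AMDGPUAlwaysInlinePass change tracks an upstream CloneFunction API update: the clone is now created directly inside the source function's parent module, so the explicit push_back disappears. A hedged sketch of the before/after idiom, assuming the LLVM C++ API of this era:

ValueToValueMapTy VMap;
// Before: the clone came back detached and the caller inserted it.
//   Function *NewFunc = CloneFunction(F, VMap, /*ModuleLevelChanges=*/false);
//   M.getFunctionList().push_back(NewFunc);
// After: CloneFunction places the new function in F's module itself.
Function *NewFunc = CloneFunction(F, VMap);
NewFunc->setLinkage(GlobalValue::InternalLinkage);
F->replaceAllUsesWith(NewFunc);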
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 378183927242..0910b2877b09 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -24,6 +25,8 @@ namespace {
class AMDGPUAnnotateKernelFeatures : public ModulePass {
private:
+ static bool hasAddrSpaceCast(const Function &F);
+
void addAttrToCallers(Function *Intrin, StringRef AttrName);
bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
@@ -40,6 +43,11 @@ public:
AU.setPreservesAll();
ModulePass::getAnalysisUsage(AU);
}
+
+ static bool visitConstantExpr(const ConstantExpr *CE);
+ static bool visitConstantExprsRecursively(
+ const Constant *EntryC,
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};
}
@@ -48,12 +56,87 @@ char AMDGPUAnnotateKernelFeatures::ID = 0;
char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
+INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+ "Add AMDGPU function attributes", false, false)
+
+
+// The queue ptr is only needed when casting to flat, not from it.
+static bool castRequiresQueuePtr(unsigned SrcAS) {
+ return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+}
+
+static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
+ return castRequiresQueuePtr(ASC->getSrcAddressSpace());
+}
+
+bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
+ if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+ unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
+ return castRequiresQueuePtr(SrcAS);
+ }
+
+ return false;
+}
+
+bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
+ const Constant *EntryC,
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
-INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
- "Add AMDGPU function attributes", false, false)
-INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
- "Add AMDGPU function attributes", false, false)
+ if (!ConstantExprVisited.insert(EntryC).second)
+ return false;
+ SmallVector<const Constant *, 16> Stack;
+ Stack.push_back(EntryC);
+
+ while (!Stack.empty()) {
+ const Constant *C = Stack.pop_back_val();
+
+ // Check this constant expression.
+ if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
+ if (visitConstantExpr(CE))
+ return true;
+ }
+
+ // Visit all sub-expressions.
+ for (const Use &U : C->operands()) {
+ const auto *OpC = dyn_cast<Constant>(U);
+ if (!OpC)
+ continue;
+
+ if (!ConstantExprVisited.insert(OpC).second)
+ continue;
+
+ Stack.push_back(OpC);
+ }
+ }
+
+ return false;
+}
+
+// Return true if an addrspacecast is used that requires the queue ptr.
+bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
+ SmallPtrSet<const Constant *, 8> ConstantExprVisited;
+
+ for (const BasicBlock &BB : F) {
+ for (const Instruction &I : BB) {
+ if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
+ if (castRequiresQueuePtr(ASC))
+ return true;
+ }
+
+ for (const Use &U : I.operands()) {
+ const auto *OpC = dyn_cast<Constant>(U);
+ if (!OpC)
+ continue;
+
+ if (visitConstantExprsRecursively(OpC, ConstantExprVisited))
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
StringRef AttrName) {
@@ -89,35 +172,46 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
static const StringRef IntrinsicToAttr[][2] = {
// .x omitted
+ { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" },
+ { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" },
+
+ { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" },
+ { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" },
+
{ "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
{ "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
// .x omitted
{ "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
{ "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
-
};
static const StringRef HSAIntrinsicToAttr[][2] = {
- { "llvm.r600.read.local.size.x", "amdgpu-dispatch-ptr" },
- { "llvm.r600.read.local.size.y", "amdgpu-dispatch-ptr" },
- { "llvm.r600.read.local.size.z", "amdgpu-dispatch-ptr" },
-
- { "llvm.r600.read.global.size.x", "amdgpu-dispatch-ptr" },
- { "llvm.r600.read.global.size.y", "amdgpu-dispatch-ptr" },
- { "llvm.r600.read.global.size.z", "amdgpu-dispatch-ptr" },
- { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }
+ { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" },
+ { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" }
};
+ // TODO: We should not add the attributes if the known compile time workgroup
+ // size is 1 for y/z.
+
// TODO: Intrinsics that require queue ptr.
// We do not need to note the x workitem or workgroup id because they are
// always initialized.
bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
- if (TT.getOS() == Triple::AMDHSA)
+ if (TT.getOS() == Triple::AMDHSA) {
Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
+ for (Function &F : M) {
+ if (F.hasFnAttribute("amdgpu-queue-ptr"))
+ continue;
+
+ if (hasAddrSpaceCast(F))
+ F.addFnAttr("amdgpu-queue-ptr");
+ }
+ }
+
return Changed;
}
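
[editor's note] The visitConstantExprsRecursively helper added above is a worklist DFS with a visited set: a shared constant subexpression is expanded once no matter how many operands reach it, keeping the walk linear instead of exponential. A self-contained C++ sketch of the same pattern over a generic DAG (Node and IsInterestingCast are illustrative stand-ins for ConstantExpr and the addrspacecast-from-local/private test):

#include <unordered_set>
#include <vector>

struct Node {
  bool IsInterestingCast = false;
  std::vector<const Node *> Operands;
};

static bool anyInterestingCast(const Node *Entry,
                               std::unordered_set<const Node *> &Visited) {
  // Bail out if another traversal already expanded this entry point.
  if (!Visited.insert(Entry).second)
    return false;

  std::vector<const Node *> Stack{Entry};
  while (!Stack.empty()) {
    const Node *N = Stack.back();
    Stack.pop_back();

    if (N->IsInterestingCast)
      return true;

    // Push each operand at most once, mirroring the SmallPtrSet guard.
    for (const Node *Op : N->Operands)
      if (Visited.insert(Op).second)
        Stack.push_back(Op);
  }
  return false;
}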
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index dfddc345f286..2010cc952265 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -43,6 +43,7 @@ public:
AU.setPreservesAll();
}
+ void visitBranchInst(BranchInst &I);
void visitLoadInst(LoadInst &I);
};
@@ -57,13 +58,28 @@ INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
char AMDGPUAnnotateUniformValues::ID = 0;
+static void setUniformMetadata(Instruction *I) {
+ I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
+}
+
+void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
+ if (I.isUnconditional())
+ return;
+
+ Value *Cond = I.getCondition();
+ if (!DA->isUniform(Cond))
+ return;
+
+ setUniformMetadata(I.getParent()->getTerminator());
+}
+
void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
Value *Ptr = I.getPointerOperand();
if (!DA->isUniform(Ptr))
return;
if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
- PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {}));
+ setUniformMetadata(PtrI);
}
@@ -72,6 +88,9 @@ bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
}
bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
DA = &getAnalysis<DivergenceAnalysis>();
visit(F);
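
[editor's note] The pass now tags uniform branch terminators, in addition to uniform load pointers, with an empty "amdgpu.uniform" node. Downstream consumers only test for the node's presence; a minimal sketch of that check, using the standard LLVM metadata API (the helper name is illustrative, not part of this commit):

#include "llvm/IR/Instruction.h"
using namespace llvm;

static bool isAnnotatedUniform(const Instruction &I) {
  return I.getMetadata("amdgpu.uniform") != nullptr;
}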
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 1239dfb235ef..cfe6346fb6b1 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -28,8 +28,10 @@
#include "R600RegisterInfo.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
+#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
@@ -37,7 +39,9 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "AMDGPURuntimeMetadata.h"
+using namespace ::AMDGPU;
using namespace llvm;
// TODO: This should get the default rounding mode from the kernel. We just set
@@ -61,7 +65,7 @@ using namespace llvm;
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(const MachineFunction &F) {
- const AMDGPUSubtarget& ST = F.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget& ST = F.getSubtarget<SISubtarget>();
// TODO: Is there any real use for the flush in only / flush out only modes?
uint32_t FP32Denormals =
@@ -104,10 +108,12 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
AMDGPUTargetStreamer *TS =
static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
- TS->EmitDirectiveHSACodeObjectVersion(1, 0);
+ TS->EmitDirectiveHSACodeObjectVersion(2, 1);
+
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits());
TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
"AMD", "AMDGPU");
+ emitStartOfRuntimeMetadata(M);
}
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
@@ -132,54 +138,13 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
AsmPrinter::EmitFunctionEntryLabel();
}
-static bool isModuleLinkage(const GlobalValue *GV) {
- switch (GV->getLinkage()) {
- case GlobalValue::InternalLinkage:
- case GlobalValue::CommonLinkage:
- return true;
- case GlobalValue::ExternalLinkage:
- return false;
- default: llvm_unreachable("unknown linkage type");
- }
-}
-
void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
- if (TM.getTargetTriple().getOS() != Triple::AMDHSA) {
- AsmPrinter::EmitGlobalVariable(GV);
- return;
- }
-
- if (GV->isDeclaration() || GV->getLinkage() == GlobalValue::PrivateLinkage) {
- AsmPrinter::EmitGlobalVariable(GV);
- return;
- }
-
// Group segment variables aren't emitted in HSA.
if (AMDGPU::isGroupSegment(GV))
return;
- AMDGPUTargetStreamer *TS =
- static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
- if (isModuleLinkage(GV)) {
- TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName());
- } else {
- TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName());
- }
-
- MCSymbolELF *GVSym = cast<MCSymbolELF>(getSymbol(GV));
- const DataLayout &DL = getDataLayout();
-
- // Emit the size
- uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
- OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext));
- OutStreamer->PushSection();
- OutStreamer->SwitchSection(
- getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
- const Constant *C = GV->getInitializer();
- OutStreamer->EmitLabel(GVSym);
- EmitGlobalConstant(DL, C);
- OutStreamer->PopSection();
+ AsmPrinter::EmitGlobalVariable(GV);
}
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -230,6 +195,20 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
false);
+ OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
+ " bytes/workgroup (compile time only)", false);
+
+ OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
+ false);
+ OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
+ false);
+
+ if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
+ OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
+ Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
+ OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
+ Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
+ }
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
@@ -268,15 +247,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
}
+ emitRuntimeMetadata(*MF.getFunction());
+
return false;
}
void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
unsigned MaxGPR = 0;
bool killPixel = false;
- const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
- const R600RegisterInfo *RI =
- static_cast<const R600RegisterInfo *>(STM.getRegisterInfo());
+ const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
+ const R600RegisterInfo *RI = STM.getRegisterInfo();
const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
for (const MachineBasicBlock &MBB : MF) {
@@ -299,23 +279,23 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
}
unsigned RsrcReg;
- if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
+ if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
// Evergreen / Northern Islands
- switch (MFI->getShaderType()) {
+ switch (MF.getFunction()->getCallingConv()) {
default: // Fall through
- case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
- case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
- case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
- case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
+ case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
+ case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
+ case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
+ case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
}
} else {
// R600 / R700
- switch (MFI->getShaderType()) {
+ switch (MF.getFunction()->getCallingConv()) {
default: // Fall through
- case ShaderType::GEOMETRY: // Fall through
- case ShaderType::COMPUTE: // Fall through
- case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
- case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
+ case CallingConv::AMDGPU_GS: // Fall through
+ case CallingConv::AMDGPU_CS: // Fall through
+ case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
+ case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
}
}
@@ -325,23 +305,23 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
- if (MFI->getShaderType() == ShaderType::COMPUTE) {
+ if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
- OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
+ OutStreamer->EmitIntValue(alignTo(MFI->LDSSize, 4) >> 2, 4);
}
}
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) const {
- const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
uint64_t CodeSize = 0;
unsigned MaxSGPR = 0;
unsigned MaxVGPR = 0;
bool VCCUsed = false;
bool FlatUsed = false;
- const SIRegisterInfo *RI =
- static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
+ const SIRegisterInfo *RI = STM.getRegisterInfo();
+ const SIInstrInfo *TII = STM.getInstrInfo();
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
@@ -351,8 +331,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (MI.isDebugValue())
continue;
- // FIXME: This is reporting 0 for many instructions.
- CodeSize += MI.getDesc().Size;
+ CodeSize += TII->getInstSizeInBytes(MI);
unsigned numOperands = MI.getNumOperands();
for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
@@ -366,6 +345,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
unsigned reg = MO.getReg();
switch (reg) {
case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
case AMDGPU::SCC:
case AMDGPU::M0:
continue;
@@ -382,17 +363,32 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
FlatUsed = true;
continue;
+ case AMDGPU::TBA:
+ case AMDGPU::TBA_LO:
+ case AMDGPU::TBA_HI:
+ case AMDGPU::TMA:
+ case AMDGPU::TMA_LO:
+ case AMDGPU::TMA_HI:
+ llvm_unreachable("Trap Handler registers should not be used");
+ continue;
+
default:
break;
}
if (AMDGPU::SReg_32RegClass.contains(reg)) {
+ if (AMDGPU::TTMP_32RegClass.contains(reg)) {
+ llvm_unreachable("Trap Handler registers should not be used");
+ }
isSGPR = true;
width = 1;
} else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
isSGPR = false;
width = 1;
} else if (AMDGPU::SReg_64RegClass.contains(reg)) {
+ if (AMDGPU::TTMP_64RegClass.contains(reg)) {
+ llvm_unreachable("Trap Handler registers should not be used");
+ }
isSGPR = true;
width = 2;
} else if (AMDGPU::VReg_64RegClass.contains(reg)) {
@@ -438,7 +434,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (VCCUsed)
ExtraSGPRs = 2;
- if (STM.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
if (FlatUsed)
ExtraSGPRs = 4;
} else {
@@ -451,23 +447,54 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
MaxSGPR += ExtraSGPRs;
+ // Record first reserved register and reserved register count fields, and
+ // update max register counts if "amdgpu-debugger-reserve-regs" attribute was
+ // specified.
+ if (STM.debuggerReserveRegs()) {
+ ProgInfo.ReservedVGPRFirst = MaxVGPR + 1;
+ ProgInfo.ReservedVGPRCount = MFI->getDebuggerReservedVGPRCount();
+ MaxVGPR += MFI->getDebuggerReservedVGPRCount();
+ }
+
+ // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
+ // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
+ // attribute was specified.
+ if (STM.debuggerEmitPrologue()) {
+ ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
+ RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
+ ProgInfo.DebuggerPrivateSegmentBufferSGPR =
+ RI->getHWRegIndex(MFI->getScratchRSrcReg());
+ }
+
// We found the maximum register index. They start at 0, so add one to get the
// number of registers.
ProgInfo.NumVGPR = MaxVGPR + 1;
ProgInfo.NumSGPR = MaxSGPR + 1;
if (STM.hasSGPRInitBug()) {
- if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) {
+ if (ProgInfo.NumSGPR > SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) {
LLVMContext &Ctx = MF.getFunction()->getContext();
- Ctx.emitError("too many SGPRs used with the SGPR init bug");
+ DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
+ "SGPRs with SGPR init bug",
+ ProgInfo.NumSGPR, DS_Error);
+ Ctx.diagnose(Diag);
}
- ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+ ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
}
if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
LLVMContext &Ctx = MF.getFunction()->getContext();
- Ctx.emitError("too many user SGPRs used");
+ DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs",
+ MFI->NumUserSGPRs, DS_Error);
+ Ctx.diagnose(Diag);
+ }
+
+ if (MFI->LDSSize > static_cast<unsigned>(STM.getLocalMemorySize())) {
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+ DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",
+ MFI->LDSSize, DS_Error);
+ Ctx.diagnose(Diag);
}
ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
@@ -476,21 +503,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// register.
ProgInfo.FloatMode = getFPMode(MF);
- // XXX: Not quite sure what this does, but sc seems to unset this.
ProgInfo.IEEEMode = 0;
- // Do not clamp NAN to 0.
- ProgInfo.DX10Clamp = 0;
+ // Make clamp modifier on NaN input returns 0.
+ ProgInfo.DX10Clamp = 1;
const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
- ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
+ ProgInfo.ScratchSize = FrameInfo->getStackSize();
ProgInfo.FlatUsed = FlatUsed;
ProgInfo.VCCUsed = VCCUsed;
ProgInfo.CodeLen = CodeSize;
unsigned LDSAlignShift;
- if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
+ if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
// LDS is allocated in 64 dword blocks.
LDSAlignShift = 8;
} else {
@@ -503,7 +529,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
ProgInfo.LDSBlocks =
- RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+ alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
// Scratch is allocated in 256 dword blocks.
unsigned ScratchAlignShift = 10;
@@ -511,8 +537,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// is used by the entire wave. ProgInfo.ScratchSize is the amount of
// scratch memory used per thread.
ProgInfo.ScratchBlocks =
- RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(),
- 1 << ScratchAlignShift) >> ScratchAlignShift;
+ alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
+ 1ULL << ScratchAlignShift) >>
+ ScratchAlignShift;
ProgInfo.ComputePGMRSrc1 =
S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
@@ -544,23 +571,23 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
S_00B84C_EXCP_EN(0);
}
-static unsigned getRsrcReg(unsigned ShaderType) {
- switch (ShaderType) {
+static unsigned getRsrcReg(CallingConv::ID CallConv) {
+ switch (CallConv) {
default: // Fall through
- case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1;
- case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
- case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
- case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
+ case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
+ case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
+ case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
+ case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
}
}
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &KernelInfo) {
- const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- unsigned RsrcReg = getRsrcReg(MFI->getShaderType());
+ unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());
- if (MFI->getShaderType() == ShaderType::COMPUTE) {
+ if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
@@ -577,13 +604,13 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->EmitIntValue(RsrcReg, 4);
OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
- if (STM.isVGPRSpillingEnabled(MFI)) {
+ if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
}
}
- if (MFI->getShaderType() == ShaderType::PIXEL) {
+ if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
@@ -591,12 +618,31 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
}
+
+ OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
+ OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
+ OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
+ OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
+}
+
+// This is supposed to be log2(Size)
+static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMD_ELEMENT_4_BYTES;
+ case 8:
+ return AMD_ELEMENT_8_BYTES;
+ case 16:
+ return AMD_ELEMENT_16_BYTES;
+ default:
+ llvm_unreachable("invalid private_element_size");
+ }
}
void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
const SIProgramInfo &KernelInfo) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
amd_kernel_code_t header;
AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits());
@@ -606,6 +652,11 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
(KernelInfo.ComputePGMRSrc2 << 32);
header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+
+ AMD_HSA_BITS_SET(header.code_properties,
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
+ getElementByteSizeValue(STM.getMaxPrivateElementSize()));
+
if (MFI->hasPrivateSegmentBuffer()) {
header.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
@@ -646,6 +697,9 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
if (MFI->hasDispatchPtr())
header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+ if (STM.debuggerSupported())
+ header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
+
if (STM.isXNACKEnabled())
header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
@@ -654,9 +708,20 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
header.workitem_vgpr_count = KernelInfo.NumVGPR;
header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
+ header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
+ header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
+
+ if (STM.debuggerEmitPrologue()) {
+ header.debug_wavefront_private_segment_offset_sgpr =
+ KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
+ header.debug_private_segment_buffer_sgpr =
+ KernelInfo.DebuggerPrivateSegmentBufferSGPR;
+ }
AMDGPUTargetStreamer *TS =
static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+
+ OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
TS->EmitAMDKernelCodeT(header);
}
@@ -680,3 +745,227 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
*TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
return false;
}
+
+// Emit a key and an integer value for runtime metadata.
+static void emitRuntimeMDIntValue(std::unique_ptr<MCStreamer> &Streamer,
+ RuntimeMD::Key K, uint64_t V,
+ unsigned Size) {
+ Streamer->EmitIntValue(K, 1);
+ Streamer->EmitIntValue(V, Size);
+}
+
+// Emit a key and a string value for runtime metadata.
+static void emitRuntimeMDStringValue(std::unique_ptr<MCStreamer> &Streamer,
+ RuntimeMD::Key K, StringRef S) {
+ Streamer->EmitIntValue(K, 1);
+ Streamer->EmitIntValue(S.size(), 4);
+ Streamer->EmitBytes(S);
+}
+
+// Emit a key and three integer values for runtime metadata.
+// The three integer values are obtained from MDNode \p Node;
+static void emitRuntimeMDThreeIntValues(std::unique_ptr<MCStreamer> &Streamer,
+ RuntimeMD::Key K, MDNode *Node,
+ unsigned Size) {
+ Streamer->EmitIntValue(K, 1);
+ Streamer->EmitIntValue(mdconst::extract<ConstantInt>(
+ Node->getOperand(0))->getZExtValue(), Size);
+ Streamer->EmitIntValue(mdconst::extract<ConstantInt>(
+ Node->getOperand(1))->getZExtValue(), Size);
+ Streamer->EmitIntValue(mdconst::extract<ConstantInt>(
+ Node->getOperand(2))->getZExtValue(), Size);
+}
+
+void AMDGPUAsmPrinter::emitStartOfRuntimeMetadata(const Module &M) {
+ OutStreamer->SwitchSection(getObjFileLowering().getContext()
+ .getELFSection(RuntimeMD::SectionName, ELF::SHT_PROGBITS, 0));
+
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyMDVersion,
+ RuntimeMD::MDVersion << 8 | RuntimeMD::MDRevision, 2);
+ if (auto MD = M.getNamedMetadata("opencl.ocl.version")) {
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage,
+ RuntimeMD::OpenCL_C, 1);
+ auto Node = MD->getOperand(0);
+ unsigned short Major = mdconst::extract<ConstantInt>(Node->getOperand(0))
+ ->getZExtValue();
+ unsigned short Minor = mdconst::extract<ConstantInt>(Node->getOperand(1))
+ ->getZExtValue();
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion,
+ Major * 100 + Minor * 10, 2);
+ }
+}
+
+static std::string getOCLTypeName(Type *Ty, bool isSigned) {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
+    Type *EleTy = VecTy->getElementType();
+ unsigned Size = VecTy->getVectorNumElements();
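+    // e.g. a <4 x i32> vector with isSigned == false becomes "uint4".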
+ return (Twine(getOCLTypeName(EleTy, isSigned)) + Twine(Size)).str();
+ }
+ switch (Ty->getTypeID()) {
+ case Type::HalfTyID: return "half";
+ case Type::FloatTyID: return "float";
+ case Type::DoubleTyID: return "double";
+ case Type::IntegerTyID: {
+ if (!isSigned)
+ return (Twine('u') + Twine(getOCLTypeName(Ty, true))).str();
+ auto IntTy = cast<IntegerType>(Ty);
+ auto BW = IntTy->getIntegerBitWidth();
+ switch (BW) {
+ case 8:
+ return "char";
+ case 16:
+ return "short";
+ case 32:
+ return "int";
+ case 64:
+ return "long";
+ default:
+ return (Twine('i') + Twine(BW)).str();
+ }
+ }
+ default:
+ llvm_unreachable("invalid type");
+ }
+}
+
+static RuntimeMD::KernelArg::ValueType getRuntimeMDValueType(
+ Type *Ty, StringRef TypeName) {
+ if (auto VT = dyn_cast<VectorType>(Ty))
+ return getRuntimeMDValueType(VT->getElementType(), TypeName);
+ else if (auto PT = dyn_cast<PointerType>(Ty))
+ return getRuntimeMDValueType(PT->getElementType(), TypeName);
+ else if (Ty->isHalfTy())
+ return RuntimeMD::KernelArg::F16;
+ else if (Ty->isFloatTy())
+ return RuntimeMD::KernelArg::F32;
+ else if (Ty->isDoubleTy())
+ return RuntimeMD::KernelArg::F64;
+  else if (IntegerType *IntTy = dyn_cast<IntegerType>(Ty)) {
+    bool Signed = !TypeName.startswith("u");
+    switch (IntTy->getIntegerBitWidth()) {
+ case 8:
+ return Signed ? RuntimeMD::KernelArg::I8 : RuntimeMD::KernelArg::U8;
+ case 16:
+ return Signed ? RuntimeMD::KernelArg::I16 : RuntimeMD::KernelArg::U16;
+ case 32:
+ return Signed ? RuntimeMD::KernelArg::I32 : RuntimeMD::KernelArg::U32;
+ case 64:
+ return Signed ? RuntimeMD::KernelArg::I64 : RuntimeMD::KernelArg::U64;
+ default:
+ // Runtime does not recognize other integer types. Report as
+ // struct type.
+ return RuntimeMD::KernelArg::Struct;
+ }
+ } else
+ return RuntimeMD::KernelArg::Struct;
+}
+
+void AMDGPUAsmPrinter::emitRuntimeMetadata(const Function &F) {
+ if (!F.getMetadata("kernel_arg_type"))
+ return;
+
+ MCContext &Context = getObjFileLowering().getContext();
+ OutStreamer->SwitchSection(
+ Context.getELFSection(RuntimeMD::SectionName, ELF::SHT_PROGBITS, 0));
+ OutStreamer->EmitIntValue(RuntimeMD::KeyKernelBegin, 1);
+ emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyKernelName, F.getName());
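+  // Each kernel record is bracketed by KeyKernelBegin/KeyKernelEnd, and each
+  // argument record by KeyArgBegin/KeyArgEnd.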
+
+  for (auto &Arg : F.args()) {
+ // Emit KeyArgBegin.
+ unsigned I = Arg.getArgNo();
+ OutStreamer->EmitIntValue(RuntimeMD::KeyArgBegin, 1);
+
+ // Emit KeyArgSize and KeyArgAlign.
+ auto T = Arg.getType();
+ auto DL = F.getParent()->getDataLayout();
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgSize,
+ DL.getTypeAllocSize(T), 4);
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAlign,
+ DL.getABITypeAlignment(T), 4);
+
+ // Emit KeyArgTypeName.
+    auto TypeName = cast<MDString>(F.getMetadata(
+        "kernel_arg_type")->getOperand(I))->getString();
+ emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyArgTypeName, TypeName);
+
+ // Emit KeyArgName.
+ if (auto ArgNameMD = F.getMetadata("kernel_arg_name")) {
+ auto ArgName = cast<MDString>(ArgNameMD->getOperand(
+ I))->getString();
+ emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyArgName, ArgName);
+ }
+
+ // Emit KeyArgIsVolatile, KeyArgIsRestrict, KeyArgIsConst and KeyArgIsPipe.
+ auto TypeQual = cast<MDString>(F.getMetadata(
+ "kernel_arg_type_qual")->getOperand(I))->getString();
+    SmallVector<StringRef, 1> SplitQ;
+    TypeQual.split(SplitQ, " ", -1, false /* drop empty entries */);
+    for (StringRef Qual : SplitQ) {
+      auto Key = StringSwitch<RuntimeMD::Key>(Qual)
+ .Case("volatile", RuntimeMD::KeyArgIsVolatile)
+ .Case("restrict", RuntimeMD::KeyArgIsRestrict)
+ .Case("const", RuntimeMD::KeyArgIsConst)
+ .Case("pipe", RuntimeMD::KeyArgIsPipe)
+ .Default(RuntimeMD::KeyNull);
+ OutStreamer->EmitIntValue(Key, 1);
+ }
+
+ // Emit KeyArgTypeKind.
+ auto BaseTypeName = cast<MDString>(
+ F.getMetadata("kernel_arg_base_type")->getOperand(I))->getString();
+ auto TypeKind = StringSwitch<RuntimeMD::KernelArg::TypeKind>(BaseTypeName)
+ .Case("sampler_t", RuntimeMD::KernelArg::Sampler)
+ .Case("queue_t", RuntimeMD::KernelArg::Queue)
+ .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t",
+               "image2d_t", "image2d_array_t", RuntimeMD::KernelArg::Image)
+ .Cases("image2d_depth_t", "image2d_array_depth_t",
+ "image2d_msaa_t", "image2d_array_msaa_t",
+ "image2d_msaa_depth_t", RuntimeMD::KernelArg::Image)
+ .Cases("image2d_array_msaa_depth_t", "image3d_t",
+ RuntimeMD::KernelArg::Image)
+ .Default(isa<PointerType>(T) ? RuntimeMD::KernelArg::Pointer :
+ RuntimeMD::KernelArg::Value);
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgTypeKind, TypeKind, 1);
+
+ // Emit KeyArgValueType.
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgValueType,
+ getRuntimeMDValueType(T, BaseTypeName), 2);
+
+ // Emit KeyArgAccQual.
+ auto AccQual = cast<MDString>(F.getMetadata(
+ "kernel_arg_access_qual")->getOperand(I))->getString();
+ auto AQ = StringSwitch<RuntimeMD::KernelArg::AccessQualifer>(AccQual)
+ .Case("read_only", RuntimeMD::KernelArg::ReadOnly)
+ .Case("write_only", RuntimeMD::KernelArg::WriteOnly)
+ .Case("read_write", RuntimeMD::KernelArg::ReadWrite)
+ .Default(RuntimeMD::KernelArg::None);
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAccQual,
+ AQ, 1);
+
+ // Emit KeyArgAddrQual.
+ if (isa<PointerType>(T))
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAddrQual,
+ T->getPointerAddressSpace(), 1);
+
+ // Emit KeyArgEnd
+ OutStreamer->EmitIntValue(RuntimeMD::KeyArgEnd, 1);
+ }
+
+ // Emit KeyReqdWorkGroupSize, KeyWorkGroupSizeHint, and KeyVecTypeHint.
+ if (auto RWGS = F.getMetadata("reqd_work_group_size"))
+ emitRuntimeMDThreeIntValues(OutStreamer, RuntimeMD::KeyReqdWorkGroupSize,
+ RWGS, 4);
+ if (auto WGSH = F.getMetadata("work_group_size_hint"))
+ emitRuntimeMDThreeIntValues(OutStreamer, RuntimeMD::KeyWorkGroupSizeHint,
+ WGSH, 4);
+ if (auto VTH = F.getMetadata("vec_type_hint")) {
+ auto TypeName = getOCLTypeName(cast<ValueAsMetadata>(
+ VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>(
+ VTH->getOperand(1))->getZExtValue());
+ emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyVecTypeHint,
+ TypeName);
+ }
+
+ // Emit KeyKernelEnd
+ OutStreamer->EmitIntValue(RuntimeMD::KeyKernelEnd, 1);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 99d4091670fe..7b04c539520d 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -12,15 +12,15 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H
-#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
#include "llvm/CodeGen/AsmPrinter.h"
#include <vector>
namespace llvm {
-class AMDGPUAsmPrinter : public AsmPrinter {
+class AMDGPUAsmPrinter final : public AsmPrinter {
private:
struct SIProgramInfo {
SIProgramInfo() :
@@ -40,6 +40,10 @@ private:
NumVGPR(0),
NumSGPR(0),
FlatUsed(false),
+ ReservedVGPRFirst(0),
+ ReservedVGPRCount(0),
+ DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1),
+ DebuggerPrivateSegmentBufferSGPR((uint16_t)-1),
VCCUsed(false),
CodeLen(0) {}
@@ -67,6 +71,20 @@ private:
uint32_t LDSSize;
bool FlatUsed;
+  // If ReservedVGPRCount is 0, this must also be 0. Otherwise, this is the
+  // number of the first fixed VGPR that is reserved.
+ uint16_t ReservedVGPRFirst;
+ // The number of consecutive VGPRs reserved.
+ uint16_t ReservedVGPRCount;
+
+ // Fixed SGPR number used to hold wave scratch offset for entire kernel
+ // execution, or uint16_t(-1) if the register is not used or not known.
+ uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR;
+ // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
+ // kernel execution, or uint16_t(-1) if the register is not used or not
+ // known.
+ uint16_t DebuggerPrivateSegmentBufferSGPR;
+
// Bonus information for debugging.
bool VCCUsed;
uint64_t CodeLen;
@@ -109,6 +127,10 @@ public:
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &O) override;
+ void emitStartOfRuntimeMetadata(const Module &M);
+
+ void emitRuntimeMetadata(const Function &F);
+
protected:
std::vector<std::string> DisasmLines, HexLines;
size_t DisasmLineMaxLen;
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
new file mode 100644
index 000000000000..1a1da8a254a7
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -0,0 +1,42 @@
+//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCallLowering.h"
+#include "AMDGPUISelLowering.h"
+
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "This shouldn't be built without GISel"
+#endif
+
+AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
+ : CallLowering(&TLI) {
+}
+
+bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val, unsigned VReg) const {
+ return true;
+}
+
+bool AMDGPUCallLowering::lowerFormalArguments(
+ MachineIRBuilder &MIRBuilder, const Function::ArgumentListType &Args,
+ const SmallVectorImpl<unsigned> &VRegs) const {
+ // TODO: Implement once there are generic loads/stores.
+ return true;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
new file mode 100644
index 000000000000..61174bacdac3
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -0,0 +1,36 @@
+//===- lib/Target/AMDGPU/AMDGPUCallLowering.h - Call lowering -*- C++ -*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
+
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+
+namespace llvm {
+
+class AMDGPUTargetLowering;
+
+class AMDGPUCallLowering : public CallLowering {
+ public:
+ AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
+
+  bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+ unsigned VReg) const override;
+ bool
+ lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function::ArgumentListType &Args,
+ const SmallVectorImpl<unsigned> &VRegs) const override;
+};
+} // End of namespace llvm
+#endif
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index b0db26124a0c..47dfa4992068 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -110,21 +110,19 @@ def CC_R600 : CallingConv<[
// Calling convention for compute kernels
def CC_AMDGPU_Kernel : CallingConv<[
- CCCustom<"allocateStack">
+ CCCustom<"allocateKernArg">
]>;
def CC_AMDGPU : CallingConv<[
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >="
"AMDGPUSubtarget::SOUTHERN_ISLANDS && "
- "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()"
- "->getShaderType() == ShaderType::COMPUTE",
+ "!AMDGPU::isShader(State.getCallingConv())",
CCDelegateTo<CC_AMDGPU_Kernel>>,
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() < "
"AMDGPUSubtarget::SOUTHERN_ISLANDS && "
- "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()"
- "->getShaderType() == ShaderType::COMPUTE",
+ "!AMDGPU::isShader(State.getCallingConv())",
CCDelegateTo<CC_AMDGPU_Kernel>>,
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
new file mode 100644
index 000000000000..3b415774df49
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -0,0 +1,82 @@
+//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass performs miscellaneous AMDGPU-specific optimizations on the IR
+/// before instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-codegenprepare"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUCodeGenPrepare : public FunctionPass,
+ public InstVisitor<AMDGPUCodeGenPrepare> {
+ DivergenceAnalysis *DA;
+ const TargetMachine *TM;
+
+public:
+ static char ID;
+ AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
+ FunctionPass(ID),
+ TM(TM) { }
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ const char *getPassName() const override {
+ return "AMDGPU IR optimizations";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DivergenceAnalysis>();
+ AU.setPreservesAll();
+ }
+};
+
+} // End anonymous namespace
+
+bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
+ return false;
+}
+
+bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
+ if (!TM || skipFunction(F))
+ return false;
+
+ DA = &getAnalysis<DivergenceAnalysis>();
+ visit(F);
+
+ return true;
+}
+
+INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+ "AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+ "AMDGPU IR optimizations", false, false)
+
+char AMDGPUCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) {
+ return new AMDGPUCodeGenPrepare(TM);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp b/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
deleted file mode 100644
index 2f6b3022dd6e..000000000000
--- a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-//===-- AMDGPUDiagnosticInfoUnsupported.cpp -------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUDiagnosticInfoUnsupported.h"
-
-using namespace llvm;
-
-DiagnosticInfoUnsupported::DiagnosticInfoUnsupported(
- const Function &Fn,
- const Twine &Desc,
- DiagnosticSeverity Severity)
- : DiagnosticInfo(getKindID(), Severity),
- Description(Desc),
- Fn(Fn) { }
-
-int DiagnosticInfoUnsupported::KindID = 0;
-
-void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const {
- DP << "unsupported " << getDescription() << " in " << Fn.getName();
-}
diff --git a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h b/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
deleted file mode 100644
index 0fd37e1ede6b..000000000000
--- a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
+++ /dev/null
@@ -1,48 +0,0 @@
-//===-- AMDGPUDiagnosticInfoUnsupported.h - Error reporting -----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H
-
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/DiagnosticPrinter.h"
-
-namespace llvm {
-
-/// Diagnostic information for unimplemented or unsupported feature reporting.
-class DiagnosticInfoUnsupported : public DiagnosticInfo {
-private:
- const Twine &Description;
- const Function &Fn;
-
- static int KindID;
-
- static int getKindID() {
- if (KindID == 0)
- KindID = llvm::getNextAvailablePluginDiagnosticKind();
- return KindID;
- }
-
-public:
- DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
- DiagnosticSeverity Severity = DS_Error);
-
- const Function &getFunction() const { return Fn; }
- const Twine &getDescription() const { return Description; }
-
- void print(DiagnosticPrinter &DP) const override;
-
- static bool classof(const DiagnosticInfo *DI) {
- return DI->getKind() == getKindID();
- }
-};
-
-}
-
-#endif
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index 4d84d281d998..bbc28b885721 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -7,12 +7,13 @@
//
//==-----------------------------------------------------------------------===//
//
-// Interface to describe a layout of a stack frame on a AMDIL target machine
+// Interface describing the layout of a stack frame on an AMDGPU target.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUFrameLowering.h"
#include "AMDGPURegisterInfo.h"
-#include "R600MachineFunctionInfo.h"
+#include "AMDGPUSubtarget.h"
+
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Instructions.h"
@@ -57,7 +58,7 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
// T2.Y = stack[1].y
// T3.X = stack[1].z
// T3.Y = stack[1].w
- //
+ //
// StackWidth = 4:
// T0.X = stack[0].x
// T0.Y = stack[0].y
@@ -75,7 +76,8 @@ int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
unsigned &FrameReg) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
- const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+ const AMDGPURegisterInfo *RI
+ = MF.getSubtarget<AMDGPUSubtarget>().getRegisterInfo();
// Fill in FrameReg output argument.
FrameReg = RI->getFrameRegister(MF);
@@ -87,32 +89,16 @@ int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
- OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i));
+ OffsetBytes = alignTo(OffsetBytes, MFI->getObjectAlignment(i));
OffsetBytes += MFI->getObjectSize(i);
// Each register holds 4 bytes, so we must always align the offset to at
// least 4 bytes, so that 2 frame objects won't share the same register.
- OffsetBytes = RoundUpToAlignment(OffsetBytes, 4);
+ OffsetBytes = alignTo(OffsetBytes, 4);
}
if (FI != -1)
- OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI));
+ OffsetBytes = alignTo(OffsetBytes, MFI->getObjectAlignment(FI));
return OffsetBytes / (getStackWidth(MF) * 4);
}
-const TargetFrameLowering::SpillSlot *
-AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
- NumEntries = 0;
- return nullptr;
-}
-void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {}
-void
-AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
-}
-
-bool
-AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const {
- return false;
-}
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 257a3da40589..513848a1d887 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -32,13 +32,13 @@ public:
/// \returns The number of 32-bit sub-registers that are used when storing
/// values to the stack.
unsigned getStackWidth(const MachineFunction &MF) const;
+
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
- const SpillSlot *
- getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
- void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- bool hasFP(const MachineFunction &MF) const override;
+
+ bool hasFP(const MachineFunction &MF) const override {
+ return false;
+ }
};
} // namespace llvm
#endif
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b33040b4d06a..23c9352ce273 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1,4 +1,4 @@
-//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===//
+//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,30 +12,44 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUInstrInfo.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
-#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
-#include "R600InstrInfo.h"
-#include "SIDefines.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/IR/Function.h"
+#include "llvm/IR/DiagnosticInfo.h"
using namespace llvm;
+namespace llvm {
+class R600InstrInfo;
+}
+
//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//
namespace {
+
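+// Match a BRCOND whose condition is a single-use SETCC on i32 operands,
+// possibly behind a CopyToReg; such branches can be selected to
+// S_CBRANCH_SCC*.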
+static bool isCBranchSCC(const SDNode *N) {
+ assert(N->getOpcode() == ISD::BRCOND);
+ if (!N->hasOneUse())
+ return false;
+
+ SDValue Cond = N->getOperand(1);
+ if (Cond.getOpcode() == ISD::CopyToReg)
+ Cond = Cond.getOperand(2);
+ return Cond.getOpcode() == ISD::SETCC &&
+ Cond.getOperand(0).getValueType() == MVT::i32 && Cond.hasOneUse();
+}
+
/// AMDGPU specific code to select AMDGPU machine instructions for
/// SelectionDAG operations.
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
@@ -47,7 +61,7 @@ public:
AMDGPUDAGToDAGISel(TargetMachine &TM);
virtual ~AMDGPUDAGToDAGISel();
bool runOnMachineFunction(MachineFunction &MF) override;
- SDNode *Select(SDNode *N) override;
+ void Select(SDNode *N) override;
const char *getPassName() const override;
void PreprocessISelDAG() override;
void PostprocessISelDAG() override;
@@ -59,28 +73,8 @@ private:
bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
- // Complex pattern selectors
- bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
- bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
- bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
-
- static bool checkType(const Value *ptr, unsigned int addrspace);
- static bool checkPrivateAddress(const MachineMemOperand *Op);
-
- static bool isGlobalStore(const StoreSDNode *N);
- static bool isFlatStore(const StoreSDNode *N);
- static bool isPrivateStore(const StoreSDNode *N);
- static bool isLocalStore(const StoreSDNode *N);
- static bool isRegionStore(const StoreSDNode *N);
-
- bool isCPLoad(const LoadSDNode *N) const;
- bool isConstantLoad(const LoadSDNode *N, int cbID) const;
- bool isGlobalLoad(const LoadSDNode *N) const;
- bool isFlatLoad(const LoadSDNode *N) const;
- bool isParamLoad(const LoadSDNode *N) const;
- bool isPrivateLoad(const LoadSDNode *N) const;
- bool isLocalLoad(const LoadSDNode *N) const;
- bool isRegionLoad(const LoadSDNode *N) const;
+ bool isConstantLoad(const MemSDNode *N, int cbID) const;
+ bool isUniformBr(const SDNode *N) const;
SDNode *glueCopyToM0(SDNode *N) const;
@@ -111,7 +105,20 @@ private:
SDValue &Offset, SDValue &GLC, SDValue &SLC,
SDValue &TFE) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
- SDValue &Offset, SDValue &GLC) const;
+ SDValue &Offset, SDValue &SLC) const;
+ bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+ SDValue &Offset) const;
+ bool SelectMUBUFConstant(SDValue Constant,
+ SDValue &SOffset,
+ SDValue &ImmOffset) const;
+ bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset,
+ SDValue &ImmOffset) const;
+ bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset,
+ SDValue &ImmOffset, SDValue &VOffset) const;
+
+ bool SelectFlat(SDValue Addr, SDValue &VAddr,
+ SDValue &SLC, SDValue &TFE) const;
+
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
@@ -122,7 +129,7 @@ private:
bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
- SDNode *SelectAddrSpaceCast(SDNode *N);
+ bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -136,13 +143,15 @@ private:
SDValue &Clamp,
SDValue &Omod) const;
- SDNode *SelectADD_SUB_I64(SDNode *N);
- SDNode *SelectDIV_SCALE(SDNode *N);
+ void SelectADD_SUB_I64(SDNode *N);
+ void SelectDIV_SCALE(SDNode *N);
- SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
+ SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
uint32_t Offset, uint32_t Width);
- SDNode *SelectS_BFEFromShifts(SDNode *N);
- SDNode *SelectS_BFE(SDNode *N);
+ void SelectS_BFEFromShifts(SDNode *N);
+ void SelectS_BFE(SDNode *N);
+ void SelectBRCOND(SDNode *N);
+ void SelectATOMIC_CMP_SWAP(SDNode *N);
// Include the pieces autogenerated from the target description.
#include "AMDGPUGenDAGISel.inc"
@@ -159,7 +168,7 @@ AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
: SelectionDAGISel(TM) {}
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
- Subtarget = &static_cast<const AMDGPUSubtarget &>(MF.getSubtarget());
+ Subtarget = &MF.getSubtarget<AMDGPUSubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -207,64 +216,9 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
}
}
-bool AMDGPUDAGToDAGISel::SelectADDRParam(
- SDValue Addr, SDValue& R1, SDValue& R2) {
-
- if (Addr.getOpcode() == ISD::FrameIndex) {
- if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
- R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
- R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
- } else {
- R1 = Addr;
- R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
- }
- } else if (Addr.getOpcode() == ISD::ADD) {
- R1 = Addr.getOperand(0);
- R2 = Addr.getOperand(1);
- } else {
- R1 = Addr;
- R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
- }
- return true;
-}
-
-bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) {
- if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
- Addr.getOpcode() == ISD::TargetGlobalAddress) {
- return false;
- }
- return SelectADDRParam(Addr, R1, R2);
-}
-
-
-bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
- if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
- Addr.getOpcode() == ISD::TargetGlobalAddress) {
- return false;
- }
-
- if (Addr.getOpcode() == ISD::FrameIndex) {
- if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
- R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
- R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
- } else {
- R1 = Addr;
- R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
- }
- } else if (Addr.getOpcode() == ISD::ADD) {
- R1 = Addr.getOperand(0);
- R2 = Addr.getOperand(1);
- } else {
- R1 = Addr;
- R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
- }
- return true;
-}
-
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
- !checkType(cast<MemSDNode>(N)->getMemOperand()->getValue(),
- AMDGPUAS::LOCAL_ADDRESS))
+ cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
return N;
const SITargetLowering& Lowering =
@@ -304,14 +258,15 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
llvm_unreachable("invalid vector size");
}
-SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
+void AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
N->setNodeId(-1);
- return nullptr; // Already selected.
+ return; // Already selected.
}
- if (isa<AtomicSDNode>(N))
+ if (isa<AtomicSDNode>(N) ||
+ (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC))
N = glueCopyToM0(N);
switch (Opc) {
@@ -325,7 +280,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
break;
- return SelectADD_SUB_I64(N);
+ SelectADD_SUB_I64(N);
+ return;
}
case ISD::SCALAR_TO_VECTOR:
case AMDGPUISD::BUILD_VERTICAL_VECTOR:
@@ -359,8 +315,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
if (NumVectorElts == 1) {
- return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT,
- N->getOperand(0), RegClass);
+ CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
+ RegClass);
+ return;
}
assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
@@ -400,8 +357,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
if (!IsRegSeq)
break;
- return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
- RegSeqArgs);
+ CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
+ return;
}
case ISD::BUILD_PAIR: {
SDValue RC, SubReg0, SubReg1;
@@ -422,8 +379,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
N->getOperand(1), SubReg1 };
- return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
- DL, N->getValueType(0), Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
+ N->getValueType(0), Ops));
+ return;
}
case ISD::Constant:
@@ -452,8 +410,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
};
- return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
- N->getValueType(0), Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
+ N->getValueType(0), Ops));
+ return;
}
case ISD::LOAD:
case ISD::STORE: {
@@ -487,11 +446,13 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
uint32_t OffsetVal = Offset->getZExtValue();
uint32_t WidthVal = Width->getZExtValue();
- return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N),
- N->getOperand(0), OffsetVal, WidthVal);
+ ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
+ SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
+ return;
}
case AMDGPUISD::DIV_SCALE: {
- return SelectDIV_SCALE(N);
+ SelectDIV_SCALE(N);
+ return;
}
case ISD::CopyToReg: {
const SITargetLowering& Lowering =
@@ -499,139 +460,48 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
Lowering.legalizeTargetIndependentNode(N, *CurDAG);
break;
}
- case ISD::ADDRSPACECAST:
- return SelectAddrSpaceCast(N);
case ISD::AND:
case ISD::SRL:
case ISD::SRA:
+ case ISD::SIGN_EXTEND_INREG:
if (N->getValueType(0) != MVT::i32 ||
Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
break;
- return SelectS_BFE(N);
+ SelectS_BFE(N);
+ return;
+ case ISD::BRCOND:
+ SelectBRCOND(N);
+ return;
+
+ case AMDGPUISD::ATOMIC_CMP_SWAP:
+ SelectATOMIC_CMP_SWAP(N);
+ return;
}
- return SelectCode(N);
+ SelectCode(N);
}
-bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) {
- assert(AS != 0 && "Use checkPrivateAddress instead.");
- if (!Ptr)
+bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
+ if (!N->readMem())
return false;
-
- return Ptr->getType()->getPointerAddressSpace() == AS;
-}
-
-bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) {
- if (Op->getPseudoValue())
- return true;
-
- if (PointerType *PT = dyn_cast<PointerType>(Op->getValue()->getType()))
- return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
-
- return false;
-}
-
-bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
- const Value *MemVal = N->getMemOperand()->getValue();
- return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::REGION_ADDRESS));
-}
-
-bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const {
- const Value *MemVal = N->getMemOperand()->getValue();
if (CbId == -1)
- return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS);
+ return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
- return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId);
+ return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
}
-bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const {
- if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
- if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
- N->getMemoryVT().bitsLT(MVT::i32))
- return true;
-
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
- MachineMemOperand *MMO = N->getMemOperand();
- if (checkPrivateAddress(N->getMemOperand())) {
- if (MMO) {
- const PseudoSourceValue *PSV = MMO->getPseudoValue();
- if (PSV && PSV->isConstantPool()) {
- return true;
- }
- }
- }
- return false;
-}
-
-bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const {
- if (checkPrivateAddress(N->getMemOperand())) {
- // Check to make sure we are not a constant pool load or a constant load
- // that is marked as a private load
- if (isCPLoad(N) || isConstantLoad(N, -1)) {
- return false;
- }
- }
-
- const Value *MemVal = N->getMemOperand()->getValue();
- if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) {
- return true;
- }
- return false;
+bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
+ const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
+ const Instruction *Term = BB->getTerminator();
+ return Term->getMetadata("amdgpu.uniform") ||
+ Term->getMetadata("structurizecfg.uniform");
}
const char *AMDGPUDAGToDAGISel::getPassName() const {
return "AMDGPU DAG->DAG Pattern Instruction Selection";
}
-#ifdef DEBUGTMP
-#undef INT64_C
-#endif
-#undef DEBUGTMP
-
//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//
@@ -705,7 +575,7 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
return true;
}
-SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDLoc DL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
@@ -728,7 +598,6 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
-
unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
@@ -745,12 +614,12 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDValue(AddHi,0),
Sub1,
};
- return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
+ CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
}
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
-SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
SDLoc SL(N);
EVT VT = N->getValueType(0);
@@ -766,7 +635,7 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
- return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
+ CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
}
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
@@ -786,6 +655,7 @@ bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
SDValue &Offset) const {
+ SDLoc DL(Addr);
if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
@@ -793,7 +663,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
// (add n0, c0)
Base = N0;
- Offset = N1;
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
return true;
}
} else if (Addr.getOpcode() == ISD::SUB) {
@@ -801,7 +671,6 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
int64_t ByteOffset = C->getSExtValue();
if (isUInt<16>(ByteOffset)) {
- SDLoc DL(Addr);
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
// XXX - This is kind of hacky. Create a dummy sub node so we can check
@@ -816,7 +685,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
Zero, Addr.getOperand(1));
Base = SDValue(MachineSub, 0);
- Offset = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
return true;
}
}
@@ -834,7 +703,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
DL, MVT::i32, Zero);
Base = SDValue(MovZero, 0);
- Offset = Addr;
+ Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
return true;
}
}
@@ -932,8 +801,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDLoc DL(Addr);
- GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
- SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ if (!GLC.getNode())
+ GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ if (!SLC.getNode())
+ SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -961,9 +832,11 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
}
if (isLegalMUBUFImmOffset(C1)) {
- Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
- return true;
- } else if (isUInt<32>(C1->getZExtValue())) {
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+ return true;
+ }
+
+ if (isUInt<32>(C1->getZExtValue())) {
// Illegal offset, store it in soffset.
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
@@ -1045,14 +918,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
+
// Offsets in vaddr must be positive.
- if (CurDAG->SignBitIsZero(N0)) {
- ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
- if (isLegalMUBUFImmOffset(C1)) {
- VAddr = N0;
- ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
- return true;
- }
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+ if (isLegalMUBUFImmOffset(C1)) {
+ VAddr = N0;
+ ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+ return true;
}
}
@@ -1091,13 +963,118 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
}
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &Soffset,
+                                           SDValue &Offset) const {
+ SDValue GLC, SLC, TFE;
+
+ return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &Soffset, SDValue &Offset,
- SDValue &GLC) const {
- SDValue SLC, TFE;
+ SDValue &SLC) const {
+ SDValue GLC, TFE;
return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
}
+bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant,
+ SDValue &SOffset,
+ SDValue &ImmOffset) const {
+ SDLoc DL(Constant);
+ uint32_t Imm = cast<ConstantSDNode>(Constant)->getZExtValue();
+ uint32_t Overflow = 0;
+
+ if (Imm >= 4096) {
+ if (Imm <= 4095 + 64) {
+ // Use an SOffset inline constant for 1..64
+ Overflow = Imm - 4095;
+ Imm = 4095;
+ } else {
+ // Try to keep the same value in SOffset for adjacent loads, so that
+ // the corresponding register contents can be re-used.
+ //
+ // Load values with all low-bits set into SOffset, so that a larger
+ // range of values can be covered using s_movk_i32
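+      // e.g. for Imm = 5000: High = 4096 and Low = 905, so ImmOffset = 905
+      // and SOffset holds 4095; the effective offset is 4095 + 905 = 5000.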
+ uint32_t High = (Imm + 1) & ~4095;
+ uint32_t Low = (Imm + 1) & 4095;
+ Imm = Low;
+ Overflow = High - 1;
+ }
+ }
+
+ // There is a hardware bug in SI and CI which prevents address clamping in
+ // MUBUF instructions from working correctly with SOffsets. The immediate
+ // offset is unaffected.
+ if (Overflow > 0 &&
+ Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+ return false;
+
+ ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16);
+
+ if (Overflow <= 64)
+ SOffset = CurDAG->getTargetConstant(Overflow, DL, MVT::i32);
+ else
+ SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getTargetConstant(Overflow, DL, MVT::i32)),
+ 0);
+
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset,
+ SDValue &SOffset,
+ SDValue &ImmOffset) const {
+ SDLoc DL(Offset);
+
+ if (!isa<ConstantSDNode>(Offset))
+ return false;
+
+ return SelectMUBUFConstant(Offset, SOffset, ImmOffset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
+ SDValue &SOffset,
+ SDValue &ImmOffset,
+ SDValue &VOffset) const {
+ SDLoc DL(Offset);
+
+ // Don't generate an unnecessary voffset for constant offsets.
+ if (isa<ConstantSDNode>(Offset)) {
+ SDValue Tmp1, Tmp2;
+
+    // On SI/CI, fall back to a voffset when the constant cannot be folded
+    // into SOffset/ImmOffset, to work around the hardware bug described in
+    // SelectMUBUFConstant.
+ if (Subtarget->getGeneration() > AMDGPUSubtarget::SEA_ISLANDS ||
+ SelectMUBUFConstant(Offset, Tmp1, Tmp2))
+ return false;
+ }
+
+ if (CurDAG->isBaseWithConstantOffset(Offset)) {
+ SDValue N0 = Offset.getOperand(0);
+ SDValue N1 = Offset.getOperand(1);
+ if (cast<ConstantSDNode>(N1)->getSExtValue() >= 0 &&
+ SelectMUBUFConstant(N1, SOffset, ImmOffset)) {
+ VOffset = N0;
+ return true;
+ }
+ }
+
+ SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ VOffset = Offset;
+
+ return true;
+}
+
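+// Flat instructions take the address directly in VAddr; there is no separate
+// offset operand here, so SLC and TFE simply default to 0.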
+bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr,
+ SDValue &VAddr,
+ SDValue &SLC,
+ SDValue &TFE) const {
+ VAddr = Addr;
+ TFE = SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
+ return true;
+}
+
///
/// \param EncodedOffset This is the immediate value that will be encoded
/// directly into the instruction. On SI/CI the \p EncodedOffset
@@ -1213,71 +1190,33 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
!isa<ConstantSDNode>(Offset);
}
-// FIXME: This is incorrect and only enough to be able to compile.
-SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
- AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
- SDLoc DL(N);
-
- const MachineFunction &MF = CurDAG->getMachineFunction();
- DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(),
- "addrspacecast not implemented");
- CurDAG->getContext()->diagnose(NotImplemented);
-
- assert(Subtarget->hasFlatAddressSpace() &&
- "addrspacecast only supported with flat address space!");
-
- assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
- ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
- "Can only cast to / from flat address space!");
-
- // The flat instructions read the address as the index of the VGPR holding the
- // address, so casting should just be reinterpreting the base VGPR, so just
- // insert trunc / bitcast / zext.
-
- SDValue Src = ASC->getOperand(0);
- EVT DestVT = ASC->getValueType(0);
- EVT SrcVT = Src.getValueType();
-
- unsigned SrcSize = SrcVT.getSizeInBits();
- unsigned DestSize = DestVT.getSizeInBits();
-
- if (SrcSize > DestSize) {
- assert(SrcSize == 64 && DestSize == 32);
- return CurDAG->getMachineNode(
- TargetOpcode::EXTRACT_SUBREG,
- DL,
- DestVT,
- Src,
- CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32));
- }
-
- if (DestSize > SrcSize) {
- assert(SrcSize == 32 && DestSize == 64);
-
- // FIXME: This is probably wrong, we should never be defining
- // a register class with both VGPRs and SGPRs
- SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL,
- MVT::i32);
+bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
+ SDValue &Base,
+ SDValue &Offset) const {
+ SDLoc DL(Index);
- const SDValue Ops[] = {
- RC,
- Src,
- CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
- SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getConstant(0, DL, MVT::i32)), 0),
- CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
- };
+ if (CurDAG->isBaseWithConstantOffset(Index)) {
+ SDValue N0 = Index.getOperand(0);
+ SDValue N1 = Index.getOperand(1);
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
- return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
- DL, N->getValueType(0), Ops);
+ // (add n0, c0)
+ Base = N0;
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
+ return true;
}
- assert(SrcSize == 64 && DestSize == 64);
- return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode();
+ if (isa<ConstantSDNode>(Index))
+ return false;
+
+ Base = Index;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ return true;
}
-SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
- uint32_t Offset, uint32_t Width) {
+SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
+ SDValue Val, uint32_t Offset,
+ uint32_t Width) {
// Transformation function, pack the offset and width of a BFE into
// the format expected by the S_BFE_I32 / S_BFE_U32. In the second
// source, bits [5:0] contain the offset and bits [22:16] the width.
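+  // e.g. Offset = 16 and Width = 8 pack to (8 << 16) | 16 = 0x00080010.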
@@ -1287,7 +1226,7 @@ SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
}
-SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
// "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
// "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
// Predicate: 0 < b <= c < 32
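+  // e.g. (a << 8) srl 24 becomes S_BFE_U32 a, offset 16, width 8, extracting
+  // bits [23:16] of a.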
@@ -1304,14 +1243,15 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
bool Signed = N->getOpcode() == ISD::SRA;
unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
- return getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0),
- CVal - BVal, 32 - CVal);
+ ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
+ 32 - CVal));
+ return;
}
}
- return SelectCode(N);
+ SelectCode(N);
}
-SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
switch (N->getOpcode()) {
case ISD::AND:
if (N->getOperand(0).getOpcode() == ISD::SRL) {
@@ -1328,8 +1268,9 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
if (isMask_32(MaskVal)) {
uint32_t WidthVal = countPopulation(MaskVal);
- return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), Srl.getOperand(0),
- ShiftVal, WidthVal);
+ ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
+ Srl.getOperand(0), ShiftVal, WidthVal));
+ return;
}
}
}
@@ -1349,20 +1290,139 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
if (isMask_32(MaskVal)) {
uint32_t WidthVal = countPopulation(MaskVal);
- return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), And.getOperand(0),
- ShiftVal, WidthVal);
+ ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
+ And.getOperand(0), ShiftVal, WidthVal));
+ return;
}
}
- } else if (N->getOperand(0).getOpcode() == ISD::SHL)
- return SelectS_BFEFromShifts(N);
+ } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
+ SelectS_BFEFromShifts(N);
+ return;
+ }
break;
case ISD::SRA:
- if (N->getOperand(0).getOpcode() == ISD::SHL)
- return SelectS_BFEFromShifts(N);
+ if (N->getOperand(0).getOpcode() == ISD::SHL) {
+ SelectS_BFEFromShifts(N);
+ return;
+ }
break;
+
+ case ISD::SIGN_EXTEND_INREG: {
+ // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
+ SDValue Src = N->getOperand(0);
+ if (Src.getOpcode() != ISD::SRL)
+ break;
+
+ const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
+ if (!Amt)
+ break;
+
+ unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
+ ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
+ Amt->getZExtValue(), Width));
+ return;
+ }
}
- return SelectCode(N);
+ SelectCode(N);
+}
+
+void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
+ SDValue Cond = N->getOperand(1);
+
+ if (isCBranchSCC(N)) {
+ // This brcond will use S_CBRANCH_SCC*, so let tablegen handle it.
+ SelectCode(N);
+ return;
+ }
+
+ // The result of VOPC instructions is or'd against ~EXEC before it is
+ // written to vcc or another SGPR. This means that the value '1' is always
+ // written to the corresponding bit for results that are masked. In order
+ // to correctly check against vccz, we need to and VCC with the EXEC
+ // register in order to clear the value from the masked bits.
+
+ SDLoc SL(N);
+
+ SDNode *MaskedCond =
+ CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1,
+ CurDAG->getRegister(AMDGPU::EXEC, MVT::i1),
+ Cond);
+ SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC,
+ SDValue(MaskedCond, 0),
+ SDValue()); // Passing SDValue() adds a
+ // glue output.
+ CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other,
+ N->getOperand(2), // Basic Block
+ VCC.getValue(0), // Chain
+ VCC.getValue(1)); // Glue
+ return;
+}
+
+// This is here because there isn't a way to use the generated sub0_sub1 as the
+// subreg index to EXTRACT_SUBREG in tablegen.
+void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
+ MemSDNode *Mem = cast<MemSDNode>(N);
+ unsigned AS = Mem->getAddressSpace();
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ SelectCode(N);
+ return;
+ }
+
+ MVT VT = N->getSimpleValueType(0);
+ bool Is32 = (VT == MVT::i32);
+ SDLoc SL(N);
+
+ MachineSDNode *CmpSwap = nullptr;
+ if (Subtarget->hasAddr64()) {
+ SDValue SRsrc, VAddr, SOffset, Offset, GLC, SLC;
+
+    if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset,
+                          SLC)) {
+ unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_ADDR64 :
+ AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_ADDR64;
+ SDValue CmpVal = Mem->getOperand(2);
+
+ // XXX - Do we care about glue operands?
+
+ SDValue Ops[] = {
+ CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain()
+ };
+
+ CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
+ }
+ }
+
+ if (!CmpSwap) {
+ SDValue SRsrc, SOffset, Offset, SLC;
+ if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
+ unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET :
+ AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_OFFSET;
+
+ SDValue CmpVal = Mem->getOperand(2);
+ SDValue Ops[] = {
+ CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain()
+ };
+
+ CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
+ }
+ }
+
+ if (!CmpSwap) {
+ SelectCode(N);
+ return;
+ }
+
+ MachineSDNode::mmo_iterator MMOs = MF->allocateMemRefsArray(1);
+ *MMOs = Mem->getMemOperand();
+ CmpSwap->setMemRefs(MMOs, MMOs + 1);
+
+ unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
+ SDValue Extract
+ = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));
+
+ ReplaceUses(SDValue(N, 0), Extract);
+ ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
+ CurDAG->RemoveDeadNode(N);
}
bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
@@ -1432,62 +1492,59 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
}
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
- bool Modified = false;
-
- // XXX - Other targets seem to be able to do this without a worklist.
- SmallVector<LoadSDNode *, 8> LoadsToReplace;
- SmallVector<StoreSDNode *, 8> StoresToReplace;
-
- for (SDNode &Node : CurDAG->allnodes()) {
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) {
- EVT VT = LD->getValueType(0);
- if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
- continue;
-
- // To simplify the TableGen patters, we replace all i64 loads with v2i32
- // loads. Alternatively, we could promote i64 loads to v2i32 during DAG
- // legalization, however, so places (ExpandUnalignedLoad) in the DAG
- // legalizer assume that if i64 is legal, so doing this promotion early
- // can cause problems.
- LoadsToReplace.push_back(LD);
- } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) {
- // Handle i64 stores here for the same reason mentioned above for loads.
- SDValue Value = ST->getValue();
- if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
- continue;
- StoresToReplace.push_back(ST);
+ MachineFrameInfo *MFI = CurDAG->getMachineFunction().getFrameInfo();
+
+ // Handle the perverse case where a frame index is being stored. We don't
+ // want to see multiple frame index operands on the same instruction since
+ // it complicates things and violates some assumptions about frame index
+ // lowering.
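+  // e.g. a store whose *value* operand is a frame index gets that operand
+  // rewritten to a V_MOV_B32 of the index, leaving the FI only as a pointer.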
+ for (int I = MFI->getObjectIndexBegin(), E = MFI->getObjectIndexEnd();
+ I != E; ++I) {
+ SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32);
+
+ // It's possible that we have a frame index defined in the function that
+ // isn't used in this block.
+ if (FI.use_empty())
+ continue;
+
+ // Skip over the AssertZext inserted during lowering.
+ SDValue EffectiveFI = FI;
+ auto It = FI->use_begin();
+ if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) {
+ EffectiveFI = SDValue(*It, 0);
+ It = EffectiveFI->use_begin();
}
- }
-
- for (LoadSDNode *LD : LoadsToReplace) {
- SDLoc SL(LD);
-
- SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(),
- LD->getBasePtr(), LD->getMemOperand());
- SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
- MVT::i64, NewLoad);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast);
- Modified = true;
- }
- for (StoreSDNode *ST : StoresToReplace) {
- SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST),
- MVT::v2i32, ST->getValue());
- const SDValue StoreOps[] = {
- ST->getChain(),
- NewValue,
- ST->getBasePtr(),
- ST->getOffset()
- };
+ for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) {
+ SDUse &Use = It.getUse();
+ SDNode *User = Use.getUser();
+ unsigned OpIdx = It.getOperandNo();
+ ++It;
+
+ if (MemSDNode *M = dyn_cast<MemSDNode>(User)) {
+ unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1;
+ if (OpIdx == PtrIdx)
+ continue;
+
+ unsigned OpN = M->getNumOperands();
+ SDValue NewOps[8];
+
+ assert(OpN < array_lengthof(NewOps));
+ for (unsigned Op = 0; Op != OpN; ++Op) {
+ if (Op != OpIdx) {
+ NewOps[Op] = M->getOperand(Op);
+ continue;
+ }
+
+ MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+ SDLoc(M), MVT::i32, FI);
+ NewOps[Op] = SDValue(Mov, 0);
+ }
- CurDAG->UpdateNodeOperands(ST, StoreOps);
- Modified = true;
+ CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN));
+ }
+ }
}
-
- // XXX - Is this necessary?
- if (Modified)
- CurDAG->RemoveDeadNodes();
}
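As an aside, the rewrite loop above routes every non-pointer use of a frame index through a V_MOV_B32 so that at most one frame-index operand survives per memory instruction. A minimal sketch of that operand-rewrite pattern, using a simplified node type rather than the real SelectionDAG API (Node, makeCopy, and rewriteUses are illustrative names only):

#include <cstddef>
#include <vector>

struct Node {
  std::vector<Node *> Ops;
};

// Hypothetical helper standing in for building a copy (V_MOV_B32) of the
// frame index in the real pass.
Node *makeCopy(Node *Val) { return new Node{{Val}}; }

void rewriteUses(Node *FI, std::vector<Node *> &Users, std::size_t PtrIdx) {
  for (Node *User : Users) {
    for (std::size_t I = 0; I != User->Ops.size(); ++I) {
      // Leave the pointer operand alone; reroute every other use of the
      // frame index through a copy so only one FI operand remains.
      if (User->Ops[I] == FI && I != PtrIdx)
        User->Ops[I] = makeCopy(FI);
    }
  }
}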
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1a59a460ee7d..352423ed3ad6 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -15,7 +15,6 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
-#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
@@ -28,16 +27,19 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
-
+#include "llvm/IR/DiagnosticInfo.h"
+#include "SIInstrInfo.h"
using namespace llvm;
-static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
- ArgFlags.getOrigAlign());
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ MachineFunction &MF = State.getMachineFunction();
+ AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+ uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(),
+ ArgFlags.getOrigAlign());
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
return true;
}
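allocateKernArg replaces the old stack-slot allocation: instead of carving space from the call frame, each kernel argument gets a byte offset into the kernarg segment, recorded as a custom-mem location. A minimal model of the bookkeeping this presumably performs; the struct and method names are illustrative, not the real AMDGPUMachineFunction interface, and Align is assumed to be a power of two:

#include <cstdint>

struct KernArgAllocator {
  uint64_t Size = 0; // bytes of the kernarg segment used so far

  uint64_t allocate(uint64_t ArgSize, uint64_t Align) {
    uint64_t Offset = (Size + Align - 1) & ~(Align - 1); // align up
    Size = Offset + ArgSize;
    return Offset; // recorded as the argument's custom-mem offset
  }
};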
@@ -53,60 +55,104 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
-// Type for a vector that will be loaded to.
-EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
+EVT AMDGPUTargetLowering::getEquivalentBitType(LLVMContext &Ctx, EVT VT) {
unsigned StoreSize = VT.getStoreSizeInBits();
if (StoreSize <= 32)
- return EVT::getIntegerVT(Ctx, 32);
+ return EVT::getIntegerVT(Ctx, StoreSize);
return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
-AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
+AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
const AMDGPUSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
- setOperationAction(ISD::Constant, MVT::i32, Legal);
- setOperationAction(ISD::Constant, MVT::i64, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+ // Lower floating point store/load to integer store/load to reduce the number
+ // of patterns in tablegen.
+ setOperationAction(ISD::LOAD, MVT::f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
- setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- setOperationAction(ISD::BRIND, MVT::Other, Expand);
+ setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
- // This is totally unsupported, just custom lower to produce an error.
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
- // We need to custom lower some of the intrinsics
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
- // Library functions. These default to Expand, but we have instructions
- // for them.
- setOperationAction(ISD::FCEIL, MVT::f32, Legal);
- setOperationAction(ISD::FEXP2, MVT::f32, Legal);
- setOperationAction(ISD::FPOW, MVT::f32, Legal);
- setOperationAction(ISD::FLOG2, MVT::f32, Legal);
- setOperationAction(ISD::FABS, MVT::f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
- setOperationAction(ISD::FRINT, MVT::f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
- setOperationAction(ISD::FROUND, MVT::f32, Custom);
- setOperationAction(ISD::FROUND, MVT::f64, Custom);
+ setOperationAction(ISD::LOAD, MVT::i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
- setOperationAction(ISD::FREM, MVT::f32, Custom);
- setOperationAction(ISD::FREM, MVT::f64, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
- // v_mad_f32 does not support denormals according to some sources.
- if (!Subtarget->hasFP32Denormals())
- setOperationAction(ISD::FMAD, MVT::f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
- // Expand to fneg + fadd.
- setOperationAction(ISD::FSUB, MVT::f64, Expand);
+ setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
+
+ // There are no 64-bit extloads. These should be done as a 32-bit extload and
+ // an extension to 64-bit.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
+ }
+
+ for (MVT VT : MVT::integer_valuetypes()) {
+ if (VT == MVT::i64)
+ continue;
+
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
+
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
+ }
+
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
+ }
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
- // Lower floating point store/load to integer store/load to reduce the number
- // of patterns in tablegen.
setOperationAction(ISD::STORE, MVT::f32, Promote);
AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
@@ -122,51 +168,99 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v16f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
+ setOperationAction(ISD::STORE, MVT::i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
+
+ setOperationAction(ISD::STORE, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
+
setOperationAction(ISD::STORE, MVT::f64, Promote);
- AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
+ AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
- AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64);
+ AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
- // Custom lowering of vector stores is required for local address space
- // stores.
- setOperationAction(ISD::STORE, MVT::v4i32, Custom);
-
- setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
- setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
- // XXX: This can be changed to Custom, once ExpandVectorStores can
- // handle 64-bit stores.
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
- setTruncStoreAction(MVT::i64, MVT::i16, Expand);
- setTruncStoreAction(MVT::i64, MVT::i8, Expand);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i8, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+
setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
- setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
+ setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
- setOperationAction(ISD::LOAD, MVT::f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
- setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+ setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
+ setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
- setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
+ setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
+ setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
- setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
+ setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
+ setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
- setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
- setOperationAction(ISD::LOAD, MVT::f64, Promote);
- AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
+ setOperationAction(ISD::Constant, MVT::i32, Legal);
+ setOperationAction(ISD::Constant, MVT::i64, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
- setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+ // This is totally unsupported, just custom lower to produce an error.
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+
+ // We need to custom lower some of the intrinsics
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
+ // Library functions. These default to Expand, but we have instructions
+ // for them.
+ setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+ setOperationAction(ISD::FEXP2, MVT::f32, Legal);
+ setOperationAction(ISD::FPOW, MVT::f32, Legal);
+ setOperationAction(ISD::FLOG2, MVT::f32, Legal);
+ setOperationAction(ISD::FABS, MVT::f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+
+ setOperationAction(ISD::FROUND, MVT::f32, Custom);
+ setOperationAction(ISD::FROUND, MVT::f64, Custom);
+
+ setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
+ setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
+
+ setOperationAction(ISD::FREM, MVT::f32, Custom);
+ setOperationAction(ISD::FREM, MVT::f64, Custom);
+
+ // v_mad_f32 does not support denormals according to some sources.
+ if (!Subtarget->hasFP32Denormals())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+ // Expand to fneg + fadd.
+ setOperationAction(ISD::FSUB, MVT::f64, Expand);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
@@ -179,31 +273,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
- // There are no 64-bit extloads. These should be done as a 32-bit extload and
- // an extension to 64-bit.
- for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
- }
-
- for (MVT VT : MVT::integer_vector_valuetypes()) {
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
- }
-
- setOperationAction(ISD::BR_CC, MVT::i1, Expand);
-
if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FCEIL, MVT::f64, Custom);
setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
@@ -219,28 +288,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
-
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
-
- setTruncStoreAction(MVT::f32, MVT::f16, Expand);
- setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
- setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
- setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
-
- setTruncStoreAction(MVT::f64, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
for (MVT VT : ScalarIntVTs) {
- setOperationAction(ISD::SREM, VT, Expand);
+ // These should use [SU]DIVREM, so set them to expand
setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
    // The GPU does not have a divrem instruction for signed or unsigned operands.
setOperationAction(ISD::SDIVREM, VT, Custom);
@@ -284,17 +338,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
if (Subtarget->hasFFBH())
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
- else
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
-
- if (!Subtarget->hasFFBL())
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+ if (Subtarget->hasFFBL())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
setOperationAction(ISD::CTLZ, MVT::i64, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+ // We only really have 32-bit BFE instructions (and 16-bit on VI).
+ //
+ // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
+ // effort to match them now. We want this to be false for i64 cases when the
+ // extraction isn't restricted to the upper or lower half. Ideally we would
+ // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
+ // span the midpoint are probably relatively rare, so don't worry about them
+ // for now.
+ if (Subtarget->hasBFE())
+ setHasExtractBitsInsn(true);
+
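For reference, a 32-bit BFE computes an (offset, width) field extract in a single instruction, which is what makes enabling setHasExtractBitsInsn profitable here. A host-side sketch of the two variants, assuming 0 < Width and Offset + Width <= 32 (the function names are illustrative):

#include <cstdint>

uint32_t bfe_u32(uint32_t Src, uint32_t Offset, uint32_t Width) {
  // Shift the field to the bottom and mask off everything above it.
  return (Src >> Offset) & (~0u >> (32 - Width));
}

int32_t bfe_i32(int32_t Src, uint32_t Offset, uint32_t Width) {
  // Shift the field's top bit up to bit 31, then arithmetic-shift back down
  // so the result is sign-extended from the extracted field.
  uint32_t Hi = 32 - (Offset + Width);
  return ((int32_t)((uint32_t)Src << Hi)) >> (Hi + Offset);
}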
static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v4i32
};
@@ -334,9 +395,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
}
@@ -366,24 +425,20 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSUB, VT, Expand);
setOperationAction(ISD::FNEG, VT, Expand);
- setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
}
- setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
- setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
-
- setTargetDAGCombine(ISD::SHL);
- setTargetDAGCombine(ISD::MUL);
- setTargetDAGCombine(ISD::SELECT);
- setTargetDAGCombine(ISD::SELECT_CC);
- setTargetDAGCombine(ISD::STORE);
+ // This causes an unrolled select operation to be used rather than expansion
+ // with bit operations. This is generally better, but the alternative
+ // expansion using BFI instructions may be better if the select sources are
+ // SGPRs.
+ setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
- setTargetDAGCombine(ISD::FADD);
- setTargetDAGCombine(ISD::FSUB);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
setBooleanContents(ZeroOrNegativeOneBooleanContent);
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
@@ -394,7 +449,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
// SI at least has hardware support for floating point exceptions, but no way
// of using or handling them is implemented. They are also optional in OpenCL
// (Section 7.3)
- setHasFloatingPointExceptions(false);
+ setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
setSelectIsExpensive(false);
PredictableSelectIsExpensive = false;
@@ -415,6 +470,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
MaxStoresPerMemcpy = 4096;
MaxStoresPerMemmove = 4096;
MaxStoresPerMemset = 4096;
+
+ setTargetDAGCombine(ISD::BITCAST);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::SELECT_CC);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
}
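The Promote actions at the top of this constructor rely on loads and stores being pure bit movers: an f32 store emitted as an i32 store, and the matching load, is a bit-for-bit round trip, so no floating-point semantics are involved and the TableGen patterns only need the integer types. A host-side illustration of why that is sound:

#include <cstdint>
#include <cstring>

// f32 -> i32 "promoted" store: the same four bytes, reinterpreted.
uint32_t storeAsI32(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof Bits);
  return Bits;
}

// i32 -> f32 "promoted" load: round trips exactly, NaN payloads intact.
float loadAsF32(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof F);
  return F;
}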
//===----------------------------------------------------------------------===//
@@ -467,15 +534,17 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
EVT CastTy) const {
- if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
- return true;
- unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
- unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
+ assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
- return ((LScalarSize <= CastScalarSize) ||
- (CastScalarSize >= 32) ||
- (LScalarSize < 32));
+ if (LoadTy.getScalarType() == MVT::i32)
+ return false;
+
+ unsigned LScalarSize = LoadTy.getScalarSizeInBits();
+ unsigned CastScalarSize = CastTy.getScalarSizeInBits();
+
+ return (LScalarSize < CastScalarSize) ||
+ (CastScalarSize >= 32);
}
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
@@ -578,14 +647,13 @@ void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
State.AnalyzeReturn(Outs, RetCC_SI);
}
-SDValue AMDGPUTargetLowering::LowerReturn(
- SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
- return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
+SDValue
+AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}
//===---------------------------------------------------------------------===//
@@ -606,32 +674,38 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
FuncName = G->getGlobal()->getName();
- DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
+ DiagnosticInfoUnsupported NoCalls(
+ Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc());
DAG.getContext()->diagnose(NoCalls);
- return SDValue();
+
+ for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
+ InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
+
+ return DAG.getEntryNode();
}
SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
const Function &Fn = *DAG.getMachineFunction().getFunction();
- DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "dynamic alloca");
+ DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
+ SDLoc(Op).getDebugLoc());
DAG.getContext()->diagnose(NoDynamicAlloca);
- return SDValue();
+ auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
+ return DAG.getMergeValues(Ops, SDLoc());
}
SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default:
- Op.getNode()->dump();
+ Op->dump(&DAG);
llvm_unreachable("Custom lowering code for this"
"instruction is not implemented yet!");
break;
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
- case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
@@ -666,24 +740,6 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
// ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
// nothing here and let the illegal result integer be handled normally.
return;
- case ISD::LOAD: {
- SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
- if (!Node)
- return;
-
- Results.push_back(SDValue(Node, 0));
- Results.push_back(SDValue(Node, 1));
- // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
- // function
- DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
- return;
- }
- case ISD::STORE: {
- SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG);
- if (Lowered.getNode())
- Results.push_back(Lowered);
- return;
- }
default:
return;
}
@@ -712,16 +768,16 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
EVT VT = EVT::getEVT(InitTy);
PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false,
- false, TD.getPrefTypeAlignment(InitTy));
+ MachinePointerInfo(UndefValue::get(PtrTy)),
+ TD.getPrefTypeAlignment(InitTy));
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
EVT VT = EVT::getEVT(CFP->getType());
PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false,
- false, TD.getPrefTypeAlignment(CFP->getType()));
+ MachinePointerInfo(UndefValue::get(PtrTy)),
+ TD.getPrefTypeAlignment(CFP->getType()));
}
if (StructType *ST = dyn_cast<StructType>(InitTy)) {
@@ -769,8 +825,8 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
EVT VT = EVT::getEVT(InitTy);
PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false,
- false, TD.getPrefTypeAlignment(InitTy));
+ MachinePointerInfo(UndefValue::get(PtrTy)),
+ TD.getPrefTypeAlignment(InitTy));
}
Init->dump();
@@ -782,10 +838,7 @@ static bool hasDefinedInitializer(const GlobalValue *GV) {
if (!GVar || !GVar->hasInitializer())
return false;
- if (isa<UndefValue>(GVar->getInitializer()))
- return false;
-
- return true;
+ return !isa<UndefValue>(GVar->getInitializer());
}
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
@@ -797,6 +850,11 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
const GlobalValue *GV = G->getGlobal();
switch (G->getAddressSpace()) {
+ case AMDGPUAS::CONSTANT_ADDRESS: {
+ MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+ SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(G), ConstPtrVT);
+ return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(G), ConstPtrVT, GA);
+ }
case AMDGPUAS::LOCAL_ADDRESS: {
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
@@ -808,11 +866,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
unsigned Offset;
if (MFI->LocalMemoryObjects.count(GV) == 0) {
- uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
- Offset = MFI->LDSSize;
+ unsigned Align = GV->getAlignment();
+ if (Align == 0)
+ Align = DL.getABITypeAlignment(GV->getValueType());
+
+ // TODO: We should sort these to minimize wasted space due to alignment
+ // padding. Currently the padding is decided by the first encountered use
+ // during lowering.
+ Offset = MFI->LDSSize = alignTo(MFI->LDSSize, Align);
MFI->LocalMemoryObjects[GV] = Offset;
- // XXX: Account for alignment?
- MFI->LDSSize += Size;
+ MFI->LDSSize += DL.getTypeAllocSize(GV->getValueType());
} else {
Offset = MFI->LocalMemoryObjects[GV];
}
@@ -820,50 +883,11 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
return DAG.getConstant(Offset, SDLoc(Op),
getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS));
}
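The TODO above is about allocation order: because each LDS global is placed when it is first encountered, a poorly ordered sequence pays alignment padding that sorting by alignment would avoid. A small worked example, where alignTo mirrors the LLVM helper used above:

#include <cstdint>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // Order A: a 4-byte object (align 4) is seen first, then an 8-byte
  // object (align 8): offsets 0 and 8, total 16 bytes (4 wasted).
  uint64_t LDS = alignTo(0, 4) + 4;
  LDS = alignTo(LDS, 8) + 8;       // LDS == 16

  // Order B: the more-aligned object first: offsets 0 and 8, total 12.
  uint64_t LDS2 = alignTo(0, 8) + 8;
  LDS2 = alignTo(LDS2, 4) + 4;     // LDS2 == 12
  return LDS == 16 && LDS2 == 12 ? 0 : 1;
}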
- case AMDGPUAS::CONSTANT_ADDRESS: {
- MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
- Type *EltType = GV->getType()->getElementType();
- unsigned Size = DL.getTypeAllocSize(EltType);
- unsigned Alignment = DL.getPrefTypeAlignment(EltType);
-
- MVT PrivPtrVT = getPointerTy(DL, AMDGPUAS::PRIVATE_ADDRESS);
- MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
-
- int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
- SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);
-
- const GlobalVariable *Var = cast<GlobalVariable>(GV);
- if (!Var->hasInitializer()) {
- // This has no use, but bugpoint will hit it.
- return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
- }
-
- const Constant *Init = Var->getInitializer();
- SmallVector<SDNode*, 8> WorkList;
-
- for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
- E = DAG.getEntryNode()->use_end(); I != E; ++I) {
- if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD)
- continue;
- WorkList.push_back(*I);
- }
- SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG);
- for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
- E = WorkList.end(); I != E; ++I) {
- SmallVector<SDValue, 8> Ops;
- Ops.push_back(Chain);
- for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
- Ops.push_back((*I)->getOperand(i));
- }
- DAG.UpdateNodeOperands(*I, Ops);
- }
- return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
- }
}
const Function &Fn = *DAG.getMachineFunction().getFunction();
- DiagnosticInfoUnsupported BadInit(Fn,
- "initializer for address space");
+ DiagnosticInfoUnsupported BadInit(
+ Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
DAG.getContext()->diagnose(BadInit);
return SDValue();
}
@@ -875,7 +899,7 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
for (const SDUse &U : Op->ops())
DAG.ExtractVectorElements(U.get(), Args);
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
+ return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}
SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
@@ -887,23 +911,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
VT.getVectorNumElements());
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
-}
-
-SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
- SelectionDAG &DAG) const {
-
- MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering();
-
- FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
-
- unsigned FrameIndex = FIN->getIndex();
- unsigned IgnoredFrameReg;
- unsigned Offset =
- TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
- return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
- Op.getValueType());
+ return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}
SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
@@ -914,121 +922,10 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
default: return Op;
- case AMDGPUIntrinsic::AMDGPU_abs:
- case AMDGPUIntrinsic::AMDIL_abs: // Legacy name.
- return LowerIntrinsicIABS(Op, DAG);
- case AMDGPUIntrinsic::AMDGPU_lrp:
- return LowerIntrinsicLRP(Op, DAG);
-
- case AMDGPUIntrinsic::AMDGPU_clamp:
- case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name.
+ case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name.
return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
- case Intrinsic::AMDGPU_div_scale: {
- // 3rd parameter required to be a constant.
- const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
- if (!Param)
- return DAG.getUNDEF(VT);
-
- // Translate to the operands expected by the machine instruction. The
- // first parameter must be the same as the first instruction.
- SDValue Numerator = Op.getOperand(1);
- SDValue Denominator = Op.getOperand(2);
-
- // Note this order is opposite of the machine instruction's operations,
- // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
- // intrinsic has the numerator as the first operand to match a normal
- // division operation.
-
- SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
-
- return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
- Denominator, Numerator);
- }
-
- case Intrinsic::AMDGPU_div_fmas:
- return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
- Op.getOperand(4));
-
- case Intrinsic::AMDGPU_div_fixup:
- return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
- case Intrinsic::AMDGPU_trig_preop:
- return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::AMDGPU_rcp:
- return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
-
- case Intrinsic::AMDGPU_rsq:
- return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDGPU_legacy_rsq:
- return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
-
- case Intrinsic::AMDGPU_rsq_clamped:
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- Type *Type = VT.getTypeForEVT(*DAG.getContext());
- APFloat Max = APFloat::getLargest(Type->getFltSemantics());
- APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
-
- SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
- SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
- DAG.getConstantFP(Max, DL, VT));
- return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
- DAG.getConstantFP(Min, DL, VT));
- } else {
- return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
- }
-
- case Intrinsic::AMDGPU_ldexp:
- return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDGPU_imax:
- return DAG.getNode(ISD::SMAX, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
- case AMDGPUIntrinsic::AMDGPU_umax:
- return DAG.getNode(ISD::UMAX, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
- case AMDGPUIntrinsic::AMDGPU_imin:
- return DAG.getNode(ISD::SMIN, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
- case AMDGPUIntrinsic::AMDGPU_umin:
- return DAG.getNode(ISD::UMIN, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDGPU_umul24:
- return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDGPU_imul24:
- return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDGPU_umad24:
- return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_imad24:
- return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0:
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1:
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2:
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3:
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1));
-
case AMDGPUIntrinsic::AMDGPU_bfe_i32:
return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
Op.getOperand(1),
@@ -1040,69 +937,13 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_bfi:
- return DAG.getNode(AMDGPUISD::BFI, DL, VT,
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_bfm:
- return DAG.getNode(AMDGPUISD::BFM, DL, VT,
- Op.getOperand(1),
- Op.getOperand(2));
-
- case Intrinsic::AMDGPU_class:
- return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
- return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
- return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
- case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
- return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
- case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name
- return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1));
}
}
-///IABS(a) = SMAX(sub(0, a), a)
-SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- Op.getOperand(1));
-
- return DAG.getNode(ISD::SMAX, DL, VT, Neg, Op.getOperand(1));
-}
-
-/// Linear Interpolation
-/// LRP(a, b, c) = muladd(a, b, (1 - a) * c)
-SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- // TODO: Should this propagate fast-math-flags?
- SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
- DAG.getConstantFP(1.0f, DL, MVT::f32),
- Op.getOperand(1));
- SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
- Op.getOperand(3));
- return DAG.getNode(ISD::FADD, DL, VT,
- DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)),
- OneSubAC);
-}
-
/// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
- EVT VT,
- SDValue LHS,
- SDValue RHS,
- SDValue True,
- SDValue False,
+SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT,
+ SDValue LHS, SDValue RHS,
+ SDValue True, SDValue False,
SDValue CC,
DAGCombinerInfo &DCI) const {
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
@@ -1176,56 +1017,48 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
return SDValue();
}
-SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op,
- SelectionDAG &DAG) const {
- LoadSDNode *Load = cast<LoadSDNode>(Op);
- EVT MemVT = Load->getMemoryVT();
- EVT MemEltVT = MemVT.getVectorElementType();
+std::pair<SDValue, SDValue>
+AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
- EVT LoadVT = Op.getValueType();
- EVT EltVT = LoadVT.getVectorElementType();
- EVT PtrVT = Load->getBasePtr().getValueType();
+ SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
- unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
- SmallVector<SDValue, 8> Loads;
- SmallVector<SDValue, 8> Chains;
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ const SDValue One = DAG.getConstant(1, SL, MVT::i32);
- SDLoc SL(Op);
- unsigned MemEltSize = MemEltVT.getStoreSize();
- MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
- DAG.getConstant(i * MemEltSize, SL, PtrVT));
+ return std::make_pair(Lo, Hi);
+}
- SDValue NewLoad
- = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
- Load->getChain(), Ptr,
- SrcValue.getWithOffset(i * MemEltSize),
- MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->isInvariant(), Load->getAlignment());
- Loads.push_back(NewLoad.getValue(0));
- Chains.push_back(NewLoad.getValue(1));
- }
+SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
- SDValue Ops[] = {
- DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
- DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
- };
+ SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
+}
- return DAG.getMergeValues(Ops, SL);
+SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+
+ SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
+ const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}
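A host-side equivalent of split64BitValue, getLoHalf64, and getHiHalf64, assuming the lane order the v2i32 bitcast implies on this little-endian target: element 0 holds the low 32 bits and element 1 the high 32 bits.

#include <cstdint>
#include <utility>

std::pair<uint32_t, uint32_t> split64(uint64_t V) {
  uint32_t Lo = (uint32_t)V;         // EXTRACT_VECTOR_ELT ..., 0
  uint32_t Hi = (uint32_t)(V >> 32); // EXTRACT_VECTOR_ELT ..., 1
  return {Lo, Hi};
}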
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
SelectionDAG &DAG) const {
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
EVT VT = Op.getValueType();
+
// If this is a 2 element vector, we really want to scalarize and not create
// weird 1 element vectors.
if (VT.getVectorNumElements() == 2)
- return ScalarizeVectorLoad(Op, DAG);
+ return scalarizeVectorLoad(Load, DAG);
- LoadSDNode *Load = cast<LoadSDNode>(Op);
SDValue BasePtr = Load->getBasePtr();
EVT PtrVT = BasePtr.getValueType();
EVT MemVT = Load->getMemoryVT();
@@ -1245,22 +1078,15 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
unsigned BaseAlign = Load->getAlignment();
unsigned HiAlign = MinAlign(BaseAlign, Size);
- SDValue LoLoad
- = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
- Load->getChain(), BasePtr,
- SrcValue,
- LoMemVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->isInvariant(), BaseAlign);
-
+ SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
+ Load->getChain(), BasePtr, SrcValue, LoMemVT,
+ BaseAlign, Load->getMemOperand()->getFlags());
SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
DAG.getConstant(Size, SL, PtrVT));
-
- SDValue HiLoad
- = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT,
- Load->getChain(), HiPtr,
- SrcValue.getWithOffset(LoMemVT.getStoreSize()),
- HiMemVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->isInvariant(), HiAlign);
+ SDValue HiLoad =
+ DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
+ HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
+ HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
SDValue Ops[] = {
DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
@@ -1271,6 +1097,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
return DAG.getMergeValues(Ops, SL);
}
+// FIXME: This isn't doing anything for SI. This should be used in a target
+// combine during type legalization.
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
@@ -1317,48 +1145,15 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
if (PackedSize < 32) {
EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
- Store->getMemOperand()->getPointerInfo(),
- PackedVT,
- Store->isNonTemporal(), Store->isVolatile(),
- Store->getAlignment());
+ Store->getMemOperand()->getPointerInfo(), PackedVT,
+ Store->getAlignment(),
+ Store->getMemOperand()->getFlags());
}
return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
Store->getMemOperand()->getPointerInfo(),
- Store->isVolatile(), Store->isNonTemporal(),
- Store->getAlignment());
-}
-
-SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op,
- SelectionDAG &DAG) const {
- StoreSDNode *Store = cast<StoreSDNode>(Op);
- EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
- EVT EltVT = Store->getValue().getValueType().getVectorElementType();
- EVT PtrVT = Store->getBasePtr().getValueType();
- unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
- SDLoc SL(Op);
-
- SmallVector<SDValue, 8> Chains;
-
- unsigned EltSize = MemEltVT.getStoreSize();
- MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
-
- for (unsigned i = 0, e = NumElts; i != e; ++i) {
- SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
- Store->getValue(),
- DAG.getConstant(i, SL, MVT::i32));
-
- SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), SL, PtrVT);
- SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset);
- SDValue NewStore =
- DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
- SrcValue.getWithOffset(i * EltSize),
- MemEltVT, Store->isNonTemporal(), Store->isVolatile(),
- Store->getAlignment());
- Chains.push_back(NewStore);
- }
-
- return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
+ Store->getAlignment(),
+ Store->getMemOperand()->getFlags());
}
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
@@ -1370,7 +1165,7 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
// If this is a 2 element vector, we really want to scalarize and not create
// weird 1 element vectors.
if (VT.getVectorNumElements() == 2)
- return ScalarizeVectorStore(Op, DAG);
+ return scalarizeVectorStore(Store, DAG);
EVT MemVT = Store->getMemoryVT();
SDValue Chain = Store->getChain();
@@ -1395,171 +1190,21 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
unsigned Size = LoMemVT.getStoreSize();
unsigned HiAlign = MinAlign(BaseAlign, Size);
- SDValue LoStore
- = DAG.getTruncStore(Chain, SL, Lo,
- BasePtr,
- SrcValue,
- LoMemVT,
- Store->isNonTemporal(),
- Store->isVolatile(),
- BaseAlign);
- SDValue HiStore
- = DAG.getTruncStore(Chain, SL, Hi,
- HiPtr,
- SrcValue.getWithOffset(Size),
- HiMemVT,
- Store->isNonTemporal(),
- Store->isVolatile(),
- HiAlign);
+ SDValue LoStore =
+ DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
+ Store->getMemOperand()->getFlags());
+ SDValue HiStore =
+ DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
+ HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}
-
-SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- LoadSDNode *Load = cast<LoadSDNode>(Op);
- ISD::LoadExtType ExtType = Load->getExtensionType();
- EVT VT = Op.getValueType();
- EVT MemVT = Load->getMemoryVT();
-
- if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
- assert(VT == MVT::i1 && "Only i1 non-extloads expected");
- // FIXME: Copied from PPC
- // First, load into 32 bits, then truncate to 1 bit.
-
- SDValue Chain = Load->getChain();
- SDValue BasePtr = Load->getBasePtr();
- MachineMemOperand *MMO = Load->getMemOperand();
-
- SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
- BasePtr, MVT::i8, MMO);
-
- SDValue Ops[] = {
- DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
- NewLD.getValue(1)
- };
-
- return DAG.getMergeValues(Ops, DL);
- }
-
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS ||
- Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
- ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
- return SDValue();
-
- // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
- // register (2-)byte extract.
-
- // Get Register holding the target.
- SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
- DAG.getConstant(2, DL, MVT::i32));
- // Load the Register.
- SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
- Load->getChain(), Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32),
- Op.getOperand(2));
-
- // Get offset within the register.
- SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
- Load->getBasePtr(),
- DAG.getConstant(0x3, DL, MVT::i32));
-
- // Bit offset of target byte (byteIdx * 8).
- SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
- DAG.getConstant(3, DL, MVT::i32));
-
- // Shift to the right.
- Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
-
- // Eliminate the upper bits by setting them to ...
- EVT MemEltVT = MemVT.getScalarType();
-
- // ... ones.
- if (ExtType == ISD::SEXTLOAD) {
- SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
-
- SDValue Ops[] = {
- DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
- Load->getChain()
- };
-
- return DAG.getMergeValues(Ops, DL);
- }
-
- // ... or zeros.
- SDValue Ops[] = {
- DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
- Load->getChain()
- };
-
- return DAG.getMergeValues(Ops, DL);
-}
-
-SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
- if (Result.getNode()) {
- return Result;
- }
-
- StoreSDNode *Store = cast<StoreSDNode>(Op);
- SDValue Chain = Store->getChain();
- if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
- Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
- Store->getValue().getValueType().isVector()) {
- return SplitVectorStore(Op, DAG);
- }
-
- EVT MemVT = Store->getMemoryVT();
- if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
- MemVT.bitsLT(MVT::i32)) {
- unsigned Mask = 0;
- if (Store->getMemoryVT() == MVT::i8) {
- Mask = 0xff;
- } else if (Store->getMemoryVT() == MVT::i16) {
- Mask = 0xffff;
- }
- SDValue BasePtr = Store->getBasePtr();
- SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
- DAG.getConstant(2, DL, MVT::i32));
- SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
- Chain, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32));
-
- SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
- DAG.getConstant(0x3, DL, MVT::i32));
-
- SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
- DAG.getConstant(3, DL, MVT::i32));
-
- SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
- Store->getValue());
-
- SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
-
- SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
- MaskedValue, ShiftAmt);
-
- SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
- DAG.getConstant(Mask, DL, MVT::i32),
- ShiftAmt);
- DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
- DAG.getConstant(0xffffffff, DL, MVT::i32));
- Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
-
- SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
- return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
- Chain, Value, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32));
- }
- return SDValue();
-}
-
// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
-// float is enough to accurately represent up to a 24-bit integer.
-SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const {
+// float is enough to accurately represent up to a 24-bit signed integer.
+SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
+ bool Sign) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
@@ -1567,20 +1212,26 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
MVT IntVT = MVT::i32;
MVT FltVT = MVT::f32;
- ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
- ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+ unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
+ if (LHSSignBits < 9)
+ return SDValue();
- if (VT.isVector()) {
- unsigned NElts = VT.getVectorNumElements();
- IntVT = MVT::getVectorVT(MVT::i32, NElts);
- FltVT = MVT::getVectorVT(MVT::f32, NElts);
- }
+ unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
+ if (RHSSignBits < 9)
+ return SDValue();
+
+ unsigned BitSize = VT.getSizeInBits();
+ unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
+ unsigned DivBits = BitSize - SignBits;
+ if (Sign)
+ ++DivBits;
- unsigned BitSize = VT.getScalarType().getSizeInBits();
+ ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
+ ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
SDValue jq = DAG.getConstant(1, DL, IntVT);
- if (sign) {
+ if (Sign) {
// char|short jq = ia ^ ib;
jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
@@ -1590,18 +1241,13 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
// jq = jq | 0x1
jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
-
- // jq = (int)jq
- jq = DAG.getSExtOrTrunc(jq, DL, IntVT);
}
// int ia = (int)LHS;
- SDValue ia = sign ?
- DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT);
+ SDValue ia = LHS;
// int ib, (int)RHS;
- SDValue ib = sign ?
- DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT);
+ SDValue ib = RHS;
// float fa = (float)ia;
SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
@@ -1609,8 +1255,6 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
// float fb = (float)ib;
SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
- // TODO: Should this propagate fast-math-flags?
- // float fq = native_divide(fa, fb);
SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
@@ -1621,8 +1265,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
// float fr = mad(fqneg, fb, fa);
- SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT,
- DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa);
+ SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa);
// int iq = (int)fq;
SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
@@ -1641,9 +1284,6 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
// jq = (cv ? jq : 0);
jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
- // dst = trunc/extend to legal type
- iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT);
-
// dst = iq + jq;
SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
@@ -1651,11 +1291,19 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
- SDValue Res[2] = {
- Div,
- Rem
- };
- return DAG.getMergeValues(Res, DL);
+ // Truncate to the number of bits this divide actually produces.
+ if (Sign) {
+ SDValue InRegSize
+ = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
+ Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
+ Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
+ } else {
+ SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
+ Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
+ Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
+ }
+
+ return DAG.getMergeValues({ Div, Rem }, DL);
}
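The rewritten LowerDIVREM24 now proves applicability itself with ComputeNumSignBits: at least 9 sign bits on an i32 operand means the value fits in 24 bits, which an f32 significand represents exactly, so the float-based quotient needs at most a one-step correction. A host-side sketch of the signed case as the comments describe it; divrem24 is an illustrative name, an arithmetic right shift is assumed for the jq computation (the DAG code uses SRA explicitly), and the real lowering additionally truncates the results back to DivBits:

#include <cmath>
#include <cstdint>
#include <utility>

// Assumes |a| and |b| fit in 24 bits and b != 0.
std::pair<int32_t, int32_t> divrem24(int32_t a, int32_t b) {
  int32_t jq = ((a ^ b) >> 30) | 1;   // +1 or -1, the sign of the quotient
  float fa = (float)a;
  float fb = (float)b;
  float fq = fa * (1.0f / fb);        // approximate quotient (RCP + multiply)
  float fr = std::fma(-fq, fb, fa);   // remainder of the estimate
  int32_t iq = (int32_t)fq;           // truncate toward zero
  if (std::fabs(fr) >= std::fabs(fb)) // estimate one step short: correct it
    iq += jq;
  return {iq, a - iq * b};            // quotient, remainder
}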
void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
@@ -1686,10 +1334,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
LHS_Lo, RHS_Lo);
- SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero);
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero);
- Results.push_back(DIV);
- Results.push_back(REM);
+ SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero});
+ SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero});
+
+ Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
+ Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
return;
}
@@ -1698,7 +1347,8 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero);
+ SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero});
+ REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
SDValue DIV_Lo = zero;
@@ -1718,7 +1368,7 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
// Add LHS high bit
REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
- SDValue BIT = DAG.getConstant(1 << bitPos, DL, HalfVT);
+ SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
@@ -1728,7 +1378,8 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
}
- SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
+ SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
+ DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
Results.push_back(DIV);
Results.push_back(REM);
}
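The BUILD_PAIR nodes here become v2i32 build_vectors followed by a bitcast to i64; the rewrite is value-preserving because element 0 of the vector occupies the low dword of the i64, matching BUILD_PAIR's (Lo, Hi) operand order. A standalone sketch of the value both forms denote (not part of the patch):

#include <cassert>
#include <cstdint>

// What BUILD_PAIR(Lo, Hi) -- and equally bitcast(v2i32 {Lo, Hi}) with
// little-endian element order -- evaluates to as a 64-bit integer.
static uint64_t buildPair64(uint32_t Lo, uint32_t Hi) {
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  assert(buildPair64(0xDDCCBBAA, 0x11223344) == 0x11223344DDCCBBAAull);
  return 0;
}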
@@ -1744,19 +1395,14 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
return DAG.getMergeValues(Results, DL);
}
- SDValue Num = Op.getOperand(0);
- SDValue Den = Op.getOperand(1);
-
if (VT == MVT::i32) {
- if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) &&
- DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) {
- // TODO: We technically could do this for i64, but shouldn't that just be
- // handled by something generally reducing 64-bit division on 32-bit
- // values to 32-bit?
- return LowerDIVREM24(Op, DAG, false);
- }
+ if (SDValue Res = LowerDIVREM24(Op, DAG, false))
+ return Res;
}
+ SDValue Num = Op.getOperand(0);
+ SDValue Den = Op.getOperand(1);
+
// RCP = URECIP(Den) = 2^32 / Den + e
// e is rounding error.
SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
@@ -1864,11 +1510,11 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue NegOne = DAG.getConstant(-1, DL, VT);
- if (VT == MVT::i32 &&
- DAG.ComputeNumSignBits(LHS) > 8 &&
- DAG.ComputeNumSignBits(RHS) > 8) {
- return LowerDIVREM24(Op, DAG, true);
+ if (VT == MVT::i32) {
+ if (SDValue Res = LowerDIVREM24(Op, DAG, true))
+ return Res;
}
+
if (VT == MVT::i64 &&
DAG.ComputeNumSignBits(LHS) > 32 &&
DAG.ComputeNumSignBits(RHS) > 32) {
@@ -1954,7 +1600,8 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
-static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) {
+static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
+ SelectionDAG &DAG) {
const unsigned FractBits = 52;
const unsigned ExpBits = 11;
@@ -1992,8 +1639,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
// Extend back to 64 bits.
- SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
- Zero, SignBit);
+ SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
@@ -2391,7 +2037,7 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
MVT::i32, FloorMul);
SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
- SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi);
+ SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
}
@@ -2437,7 +2083,7 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
for (unsigned I = 0; I < NElts; ++I)
Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
- return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args);
+ return DAG.getBuildVector(VT, DL, Args);
}
//===----------------------------------------------------------------------===//
@@ -2476,8 +2122,8 @@ static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
}
template <typename IntTy>
-static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
- uint32_t Offset, uint32_t Width, SDLoc DL) {
+static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
+ uint32_t Width, const SDLoc &DL) {
if (Width + Offset < 32) {
uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
@@ -2487,55 +2133,175 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
}
-static bool usesAllNormalStores(SDNode *LoadVal) {
- for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) {
- if (!ISD::isNormalStore(*I))
- return false;
+static bool hasVolatileUser(SDNode *Val) {
+ for (SDNode *U : Val->uses()) {
+ if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
+ if (M->isVolatile())
+ return true;
+ }
}
+ return false;
+}
+
+bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
+ // i32 vectors are the canonical memory type.
+ if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
+ return false;
+
+ if (!VT.isByteSized())
+ return false;
+
+ unsigned Size = VT.getStoreSize();
+
+ if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
+ return false;
+
+ if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
+ return false;
+
return true;
}
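The predicate admits only sizes that map onto whole 16- or 32-bit units. A standalone mirror of just the size checks (not part of the patch; the scalar-i32, legality, and byte-sized tests are omitted for brevity):

#include <cassert>

// True when a memory type of StoreSize bytes is a candidate for rewriting
// to an equivalently sized integer type, following the checks above.
static bool sizeQualifies(unsigned StoreSize, bool IsVector) {
  if ((StoreSize == 1 || StoreSize == 2 || StoreSize == 4) && !IsVector)
    return false; // small scalars are already handled directly
  if (StoreSize == 3 || (StoreSize > 4 && StoreSize % 4 != 0))
    return false; // does not decompose into whole dwords
  return true;
}

int main() {
  assert(!sizeQualifies(1, false)); // i8 stays as-is
  assert(sizeQualifies(2, true));   // e.g. v2i8 -> i16
  assert(!sizeQualifies(3, true));  // v3i8 has no equivalent
  assert(sizeQualifies(8, true));   // e.g. v8i8 -> v2i32
  return 0;
}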
-// If we have a copy of an illegal type, replace it with a load / store of an
-// equivalently sized legal type. This avoids intermediate bit pack / unpack
-// instructions emitted when handling extloads and truncstores. Ideally we could
-// recognize the pack / unpack pattern to eliminate it.
+// Replace load of an illegal type with a load of a bitcast to a friendlier
+// type.
+SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ LoadSDNode *LN = cast<LoadSDNode>(N);
+ if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
+ return SDValue();
+
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = LN->getMemoryVT();
+
+ unsigned Size = VT.getStoreSize();
+ unsigned Align = LN->getAlignment();
+ if (Align < Size && isTypeLegal(VT)) {
+ bool IsFast;
+ unsigned AS = LN->getAddressSpace();
+
+ // Expand unaligned loads before legalization. Due to visitation order
+ // problems during legalization, the instructions emitted to pack and then
+ // unpack the bytes are not eliminated for an unaligned copy.
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
+ return DAG.getMergeValues(Ops, SDLoc(N));
+ }
+
+ if (!IsFast)
+ return SDValue();
+ }
+
+ if (!shouldCombineMemoryType(VT))
+ return SDValue();
+
+ EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+
+ SDValue NewLoad
+ = DAG.getLoad(NewVT, SL, LN->getChain(),
+ LN->getBasePtr(), LN->getMemOperand());
+
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
+ DCI.CombineTo(N, BC, NewLoad.getValue(1));
+ return SDValue(N, 0);
+}
+
+// Replace store of an illegal type with a store of a bitcast to a friendlier
+// type.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (!DCI.isBeforeLegalize())
return SDValue();
StoreSDNode *SN = cast<StoreSDNode>(N);
- SDValue Value = SN->getValue();
- EVT VT = Value.getValueType();
+ if (SN->isVolatile() || !ISD::isNormalStore(SN))
+ return SDValue();
- if (isTypeLegal(VT) || SN->isVolatile() ||
- !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8)
+ EVT VT = SN->getMemoryVT();
+ unsigned Size = VT.getStoreSize();
+
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ unsigned Align = SN->getAlignment();
+ if (Align < Size && isTypeLegal(VT)) {
+ bool IsFast;
+ unsigned AS = SN->getAddressSpace();
+
+ // Expand unaligned stores before legalization. Due to visitation order
+ // problems during legalization, the instructions emitted to pack and then
+ // unpack the bytes are not eliminated for an unaligned copy.
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast))
+ return expandUnalignedStore(SN, DAG);
+
+ if (!IsFast)
+ return SDValue();
+ }
+
+ if (!shouldCombineMemoryType(VT))
+ return SDValue();
+
+ EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+ SDValue Val = SN->getValue();
+
+ bool OtherUses = !Val.hasOneUse();
+ SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
+ if (OtherUses) {
+ SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
+ DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
+ }
+
+ return DAG.getStore(SN->getChain(), SL, CastVal,
+ SN->getBasePtr(), SN->getMemOperand());
+}
+
+// TODO: Should repeat for other bit ops.
+SDValue AMDGPUTargetLowering::performAndCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (N->getValueType(0) != MVT::i64)
return SDValue();
- LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
- if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal))
+ // Break up 64-bit and of a constant into two 32-bit ands. This will typically
+ // happen anyway for a VALU 64-bit and. This exposes other 32-bit integer
+ // combine opportunities since most 64-bit operations are decomposed this way.
+ // TODO: We won't want this for SALU especially if it is an inline immediate.
+ const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS)
return SDValue();
- EVT MemVT = LoadVal->getMemoryVT();
+ uint64_t Val = RHS->getZExtValue();
+ if (Lo_32(Val) != 0 && Hi_32(Val) != 0 && !RHS->hasOneUse()) {
+ // If either half of the constant is 0, this is really a 32-bit and, so
+ // split it. If we can re-use the full materialized constant, keep it.
+ return SDValue();
+ }
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
- EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT);
- SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
- LoadVT, SL,
- LoadVal->getChain(),
- LoadVal->getBasePtr(),
- LoadVal->getOffset(),
- LoadVT,
- LoadVal->getMemOperand());
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = split64BitValue(N->getOperand(0), DAG);
- SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0));
- DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false);
+ SDValue LoRHS = DAG.getConstant(Lo_32(Val), SL, MVT::i32);
+ SDValue HiRHS = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
- return DAG.getStore(SN->getChain(), SL, NewLoad,
- SN->getBasePtr(), SN->getMemOperand());
+ SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, LoRHS);
+ SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, HiRHS);
+
+ // Re-visit the ands. It's possible we eliminated one of them and it could
+ // simplify the vector.
+ DCI.AddToWorklist(Lo.getNode());
+ DCI.AddToWorklist(Hi.getNode());
+
+ SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}
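A standalone check that the split performed above is bitwise exact: ANDing the 32-bit halves independently and recombining gives the same value as the original 64-bit AND (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x123456789ABCDEF0ull;
  uint64_t K = 0x00000000FFFF0000ull; // hi half zero: really a 32-bit and
  uint32_t LoAnd = uint32_t(X) & uint32_t(K);             // lo_32(x) & lo_32(k)
  uint32_t HiAnd = uint32_t(X >> 32) & uint32_t(K >> 32); // hi_32(x) & hi_32(k)
  assert(((uint64_t(HiAnd) << 32) | LoAnd) == (X & K));
  return 0;
}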
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
@@ -2543,14 +2309,17 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
if (N->getValueType(0) != MVT::i64)
return SDValue();
- // i64 (shl x, 32) -> (build_pair 0, x)
+ // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
- // Doing this with moves theoretically helps MI optimizations that understand
- // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as
- // v_lshl_b64. In the SALU case, I think this is slightly worse since it
- // doubles the code size and I'm unsure about cycle count.
+ // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+ // common case, splitting this into a move and a 32-bit shift is faster and
+ // the same code size.
const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!RHS || RHS->getZExtValue() != 32)
+ if (!RHS)
+ return SDValue();
+
+ unsigned RHSVal = RHS->getZExtValue();
+ if (RHSVal < 32)
return SDValue();
SDValue LHS = N->getOperand(0);
@@ -2558,11 +2327,85 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
- // Extract low 32-bits.
+ SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
+
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+ SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
- return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo);
+
+ SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+}
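A standalone check of the shift identity the combine relies on for C >= 32: the low input dword shifted by C - 32 becomes the high result dword, and the low result dword is zero (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x0000000180000001ull;
  unsigned C = 33;
  uint32_t NewShift = uint32_t(X) << (C - 32);  // lo_32(x) << (C - 32)
  uint64_t Expected = uint64_t(NewShift) << 32; // build_pair(0, NewShift)
  assert((X << C) == Expected);
  return 0;
}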
+
+SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+ unsigned RHSVal = RHS->getZExtValue();
+
+ // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
+ if (RHSVal == 32) {
+ SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
+ SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
+ DAG.getConstant(31, SL, MVT::i32));
+
+ SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
+ }
+
+ // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
+ if (RHSVal == 63) {
+ SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
+ SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
+ DAG.getConstant(31, SL, MVT::i32));
+ SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
+ }
+
+ return SDValue();
+}
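A standalone check of both sra patterns, assuming the usual arithmetic right shift of signed values (not part of the patch):

#include <cassert>
#include <cstdint>

static int64_t buildPair64(uint32_t Lo, uint32_t Hi) {
  return int64_t((uint64_t(Hi) << 32) | Lo);
}

int main() {
  int64_t X = int64_t(0xFFFFFFFE00000005ull); // negative; hi_32(x) == -2
  uint32_t Hi = uint32_t(uint64_t(X) >> 32);
  uint32_t Sign = uint32_t(int32_t(Hi) >> 31); // all ones for negative x
  assert((X >> 32) == buildPair64(Hi, Sign));   // (sra x, 32)
  assert((X >> 63) == buildPair64(Sign, Sign)); // (sra x, 63)
  return 0;
}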
+
+SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS)
+ return SDValue();
+
+ unsigned ShiftAmt = RHS->getZExtValue();
+ if (ShiftAmt < 32)
+ return SDValue();
+
+ // srl i64:x, C for C >= 32
+ // =>
+ // build_pair (srl hi_32(x), C - 32), 0
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+
+ SDValue One = DAG.getConstant(1, SL, MVT::i32);
+ SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+
+ SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+ VecOp, One);
+
+ SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
+ SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
+
+ SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
+
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
}
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
@@ -2610,8 +2453,8 @@ static bool isCtlzOpc(unsigned Opc) {
// type VT.
// Need to match pre-legalized type because the generic legalization inserts the
// add/sub between the select and compare.
-static SDValue getFFBH_U32(const TargetLowering &TLI,
- SelectionDAG &DAG, SDLoc SL, SDValue Op) {
+static SDValue getFFBH_U32(const TargetLowering &TLI, SelectionDAG &DAG,
+ const SDLoc &SL, SDValue Op) {
EVT VT = Op.getValueType();
EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
if (LegalVT != MVT::i32)
@@ -2634,10 +2477,8 @@ static SDValue getFFBH_U32(const TargetLowering &TLI,
// against the bitwidth.
//
// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
-SDValue AMDGPUTargetLowering::performCtlzCombine(SDLoc SL,
- SDValue Cond,
- SDValue LHS,
- SDValue RHS,
+SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
+ SDValue LHS, SDValue RHS,
DAGCombinerInfo &DCI) const {
ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
if (!CmpRhs || !CmpRhs->isNullValue())
@@ -2680,8 +2521,13 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
SDValue True = N->getOperand(1);
SDValue False = N->getOperand(2);
- if (VT == MVT::f32 && Cond.hasOneUse())
- return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+ if (VT == MVT::f32 && Cond.hasOneUse()) {
+ SDValue MinMax
+ = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+ // TODO: Revisit this node so we can catch min3/max3/med3 patterns.
+ //DCI.AddToWorklist(MinMax.getNode());
+ return MinMax;
+ }
// There's no reason to not do this if the condition has other uses.
return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
@@ -2695,12 +2541,62 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
switch(N->getOpcode()) {
default:
break;
+ case ISD::BITCAST: {
+ EVT DestVT = N->getValueType(0);
+ if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
+ break;
+
+ // Fold bitcasts of constants.
+ //
+ // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
+ // TODO: Generalize and move to DAGCombiner
+ SDValue Src = N->getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
+ assert(Src.getValueType() == MVT::i64);
+ SDLoc SL(N);
+ uint64_t CVal = C->getZExtValue();
+ return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
+ DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+ DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+ }
+
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
+ const APInt &Val = C->getValueAPF().bitcastToAPInt();
+ SDLoc SL(N);
+ uint64_t CVal = Val.getZExtValue();
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+ DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+ DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+
+ return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
+ }
+
+ break;
+ }
case ISD::SHL: {
if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
break;
return performShlCombine(N, DCI);
}
+ case ISD::SRL: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performSrlCombine(N, DCI);
+ }
+ case ISD::SRA: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performSraCombine(N, DCI);
+ }
+ case ISD::AND: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performAndCombine(N, DCI);
+ }
case ISD::MUL:
return performMulCombine(N, DCI);
case AMDGPUISD::MUL_I24:
@@ -2797,7 +2693,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
-
+ case ISD::LOAD:
+ return performLoadCombine(N, DCI);
case ISD::STORE:
return performStoreCombine(N, DCI);
}
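For the new ISD::BITCAST case above, a standalone check that folding a bitcast of an f64 constant amounts to splitting its IEEE-754 bit pattern into the two i32 elements (not part of the patch):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  double D = 1.0; // bit pattern 0x3FF0000000000000
  uint64_t K;
  std::memcpy(&K, &D, sizeof(K)); // the bitcastToAPInt() step
  uint32_t Lo = uint32_t(K);       // Lo_32(CVal)
  uint32_t Hi = uint32_t(K >> 32); // Hi_32(CVal)
  assert(Lo == 0u && Hi == 0x3FF00000u);
  return 0;
}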
@@ -2840,20 +2737,6 @@ void AMDGPUTargetLowering::getOriginalFunctionArgs(
}
}
-bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
- if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
- return CFP->isExactlyValue(1.0);
- }
- return isAllOnesConstant(Op);
-}
-
-bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
- if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
- return CFP->getValueAPF().isZero();
- }
- return isNullConstant(Op);
-}
-
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const {
@@ -2889,10 +2772,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
// AMDIL DAG nodes
NODE_NAME_CASE(CALL);
NODE_NAME_CASE(UMUL);
- NODE_NAME_CASE(RET_FLAG);
NODE_NAME_CASE(BRANCH_COND);
// AMDGPU DAG nodes
+ NODE_NAME_CASE(ENDPGM)
+ NODE_NAME_CASE(RETURN)
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(CLAMP)
@@ -2906,6 +2790,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMIN3)
NODE_NAME_CASE(SMIN3)
NODE_NAME_CASE(UMIN3)
+ NODE_NAME_CASE(FMED3)
+ NODE_NAME_CASE(SMED3)
+ NODE_NAME_CASE(UMED3)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DIV_SCALE)
NODE_NAME_CASE(DIV_FMAS)
@@ -2914,7 +2801,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RCP)
NODE_NAME_CASE(RSQ)
NODE_NAME_CASE(RSQ_LEGACY)
- NODE_NAME_CASE(RSQ_CLAMPED)
+ NODE_NAME_CASE(RSQ_CLAMP)
NODE_NAME_CASE(LDEXP)
NODE_NAME_CASE(FP_CLASS)
NODE_NAME_CASE(DOT4)
@@ -2934,7 +2821,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
NODE_NAME_CASE(REGISTER_STORE)
- NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(LOAD_INPUT)
NODE_NAME_CASE(SAMPLE)
NODE_NAME_CASE(SAMPLEB)
@@ -2946,13 +2832,18 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_F32_UBYTE3)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
+ NODE_NAME_CASE(PC_ADD_REL_OFFSET)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(SENDMSG)
NODE_NAME_CASE(INTERP_MOV)
NODE_NAME_CASE(INTERP_P1)
NODE_NAME_CASE(INTERP_P2)
NODE_NAME_CASE(STORE_MSKOR)
+ NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+ NODE_NAME_CASE(ATOMIC_CMP_SWAP)
+ NODE_NAME_CASE(ATOMIC_INC)
+ NODE_NAME_CASE(ATOMIC_DEC)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
return nullptr;
@@ -2998,21 +2889,6 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
return SDValue();
}
-static void computeKnownBitsForMinMax(const SDValue Op0,
- const SDValue Op1,
- APInt &KnownZero,
- APInt &KnownOne,
- const SelectionDAG &DAG,
- unsigned Depth) {
- APInt Op0Zero, Op0One;
- APInt Op1Zero, Op1One;
- DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth);
- DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth);
-
- KnownZero = Op0Zero & Op1Zero;
- KnownOne = Op0One & Op1One;
-}
-
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
const SDValue Op,
APInt &KnownZero,
@@ -3029,22 +2905,6 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
switch (Opc) {
default:
break;
- case ISD::INTRINSIC_WO_CHAIN: {
- // FIXME: The intrinsic should just use the node.
- switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
- case AMDGPUIntrinsic::AMDGPU_imax:
- case AMDGPUIntrinsic::AMDGPU_umax:
- case AMDGPUIntrinsic::AMDGPU_imin:
- case AMDGPUIntrinsic::AMDGPU_umin:
- computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
- KnownZero, KnownOne, DAG, Depth);
- break;
- default:
- break;
- }
-
- break;
- }
case AMDGPUISD::CARRY:
case AMDGPUISD::BORROW: {
KnownZero = APInt::getHighBitsSet(32, 31);
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 37925416a9c4..c2c758592d1c 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -13,8 +13,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H
-#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
#include "llvm/Target/TargetLowering.h"
@@ -28,12 +28,10 @@ class AMDGPUTargetLowering : public TargetLowering {
protected:
const AMDGPUSubtarget *Subtarget;
-private:
SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV,
const SDValue &InitPtr,
SDValue Chain,
SelectionDAG &DAG) const;
- SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
@@ -67,42 +65,43 @@ private:
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+protected:
+ bool shouldCombineMemoryType(EVT VT) const;
+ SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performCtlzCombine(SDLoc SL, SDValue Cond, SDValue LHS, SDValue RHS,
- DAGCombinerInfo &DCI) const;
+ SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
+ SDValue RHS, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
-protected:
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
- static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT);
+ static EVT getEquivalentBitType(LLVMContext &Context, EVT VT);
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const;
- /// \brief Split a vector load into a scalar load of each component.
- SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+ /// Return 64-bit value Op as two 32-bit integers.
+ std::pair<SDValue, SDValue> split64BitValue(SDValue Op,
+ SelectionDAG &DAG) const;
+ SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
/// \brief Split a vector load into 2 loads of half the vector.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Split a vector store into a scalar store of each component.
- SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const;
-
/// \brief Split a vector store into 2 stores of half the vector.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) const;
- bool isHWTrueValue(SDValue Op) const;
- bool isHWFalseValue(SDValue Op) const;
-
/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
@@ -119,7 +118,7 @@ protected:
const SmallVectorImpl<ISD::OutputArg> &Outs) const;
public:
- AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI);
+ AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
bool isFAbsFree(EVT VT) const override;
bool isFNegFree(EVT VT) const override;
@@ -141,7 +140,7 @@ public:
ISD::LoadExtType ExtType,
EVT ExtVT) const override;
- bool isLoadBitCastBeneficial(EVT, EVT) const override;
+ bool isLoadBitCastBeneficial(EVT, EVT) const final;
bool storeOfVectorConstantIsCheap(EVT MemVT,
unsigned NumElem,
@@ -150,11 +149,10 @@ public:
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
- SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg,
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const override;
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
@@ -167,16 +165,9 @@ public:
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
- SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
- SDValue CombineFMinMaxLegacy(SDLoc DL,
- EVT VT,
- SDValue LHS,
- SDValue RHS,
- SDValue True,
- SDValue False,
- SDValue CC,
- DAGCombinerInfo &DCI) const;
+ SDValue CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
+ SDValue RHS, SDValue True, SDValue False,
+ SDValue CC, DAGCombinerInfo &DCI) const;
const char* getTargetNodeName(unsigned Opcode) const override;
@@ -189,9 +180,7 @@ public:
unsigned &RefinementSteps) const override;
virtual SDNode *PostISelFolding(MachineSDNode *N,
- SelectionDAG &DAG) const {
- return N;
- }
+ SelectionDAG &DAG) const = 0;
/// \brief Determine which of the bits specified in \p Mask are known to be
/// either zero or one and return them in the \p KnownZero and \p KnownOne
@@ -214,8 +203,9 @@ public:
unsigned Reg, EVT VT) const;
enum ImplicitParameter {
- GRID_DIM,
- GRID_OFFSET
+ FIRST_IMPLICIT,
+ GRID_DIM = FIRST_IMPLICIT,
+ GRID_OFFSET,
};
/// \brief Helper function that returns the byte offset of the given
@@ -231,9 +221,10 @@ enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
CALL, // Function call based on a single integer
UMUL, // 32bit unsigned multiplication
- RET_FLAG,
BRANCH_COND,
// End AMDIL ISD Opcodes
+ ENDPGM,
+ RETURN,
DWORDADDR,
FRACT,
CLAMP,
@@ -250,6 +241,9 @@ enum NodeType : unsigned {
FMIN3,
SMIN3,
UMIN3,
+ FMED3,
+ SMED3,
+ UMED3,
URECIP,
DIV_SCALE,
DIV_FMAS,
@@ -261,7 +255,7 @@ enum NodeType : unsigned {
RCP,
RSQ,
RSQ_LEGACY,
- RSQ_CLAMPED,
+ RSQ_CLAMP,
LDEXP,
FP_CLASS,
DOT4,
@@ -307,10 +301,14 @@ enum NodeType : unsigned {
INTERP_MOV,
INTERP_P1,
INTERP_P2,
+ PC_ADD_REL_OFFSET,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
STORE_MSKOR,
LOAD_CONSTANT,
TBUFFER_STORE_FORMAT,
+ ATOMIC_CMP_SWAP,
+ ATOMIC_INC,
+ ATOMIC_DEC,
LAST_AMDGPU_ISD_NUMBER
};
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index a266e711af5b..9a00ecb24ebe 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -30,163 +30,8 @@ using namespace llvm;
// Pin the vtable to this file.
void AMDGPUInstrInfo::anchor() {}
-AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st)
- : AMDGPUGenInstrInfo(-1, -1), ST(st) {}
-
-const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const {
- return RI;
-}
-
-bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned &SubIdx) const {
-// TODO: Implement this function
- return false;
-}
-
-unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
-// TODO: Implement this function
- return 0;
-}
-
-unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const {
-// TODO: Implement this function
- return 0;
-}
-
-bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
- const MachineMemOperand *&MMO,
- int &FrameIndex) const {
-// TODO: Implement this function
- return false;
-}
-unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
-// TODO: Implement this function
- return 0;
-}
-unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const {
-// TODO: Implement this function
- return 0;
-}
-bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI,
- const MachineMemOperand *&MMO,
- int &FrameIndex) const {
-// TODO: Implement this function
- return false;
-}
-
-MachineInstr *
-AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const {
-// TODO: Implement this function
- return nullptr;
-}
-
-void
-AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill,
- int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
- llvm_unreachable("Not Implemented");
-}
-
-void
-AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
- llvm_unreachable("Not Implemented");
-}
-
-bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const {
- MachineBasicBlock *MBB = MI->getParent();
- int OffsetOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::addr);
- // addr is a custom operand with multiple MI operands, and only the
- // first MI operand is given a name.
- int RegOpIdx = OffsetOpIdx + 1;
- int ChanOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::chan);
- if (isRegisterLoad(*MI)) {
- int DstOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::dst);
- unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
- unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
- unsigned Address = calculateIndirectAddress(RegIndex, Channel);
- unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
- if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
- buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
- getIndirectAddrRegClass()->getRegister(Address));
- } else {
- buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
- Address, OffsetReg);
- }
- } else if (isRegisterStore(*MI)) {
- int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::val);
- unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
- unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
- unsigned Address = calculateIndirectAddress(RegIndex, Channel);
- unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
- if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
- buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
- MI->getOperand(ValOpIdx).getReg());
- } else {
- buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(),
- calculateIndirectAddress(RegIndex, Channel),
- OffsetReg);
- }
- } else {
- return false;
- }
-
- MBB->erase(MI);
- return true;
-}
-
-MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl(
- MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
-// TODO: Implement this function
- return nullptr;
-}
-MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl(
- MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const {
- // TODO: Implement this function
- return nullptr;
-}
-bool
-AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
- unsigned Reg, bool UnfoldLoad,
- bool UnfoldStore,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- // TODO: Implement this function
- return false;
-}
-
-bool
-AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
- SmallVectorImpl<SDNode*> &NewNodes) const {
- // TODO: Implement this function
- return false;
-}
-
-unsigned
-AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
- bool UnfoldLoad, bool UnfoldStore,
- unsigned *LoadRegIndex) const {
- // TODO: Implement this function
- return 0;
-}
+AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
+ : AMDGPUGenInstrInfo(-1, -1), ST(ST) {}
bool AMDGPUInstrInfo::enableClusterLoads() const {
return true;
@@ -214,106 +59,6 @@ bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}
-bool
-AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
- const {
- // TODO: Implement this function
- return true;
-}
-void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const {
- // TODO: Implement this function
-}
-
-bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const {
- // TODO: Implement this function
- return false;
-}
-
-bool AMDGPUInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
- ArrayRef<MachineOperand> Pred2) const {
- // TODO: Implement this function
- return false;
-}
-
-bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const {
- // TODO: Implement this function
- return false;
-}
-
-bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const {
- // TODO: Implement this function
- return MI->getDesc().isPredicable();
-}
-
-bool
-AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
- // TODO: Implement this function
- return true;
-}
-
-bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const {
- return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE;
-}
-
-bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const {
- return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD;
-}
-
-int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- int Offset = -1;
-
- if (MFI->getNumObjects() == 0) {
- return -1;
- }
-
- if (MRI.livein_empty()) {
- return 0;
- }
-
- const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
- for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
- LE = MRI.livein_end();
- LI != LE; ++LI) {
- unsigned Reg = LI->first;
- if (TargetRegisterInfo::isVirtualRegister(Reg) ||
- !IndirectRC->contains(Reg))
- continue;
-
- unsigned RegIndex;
- unsigned RegEnd;
- for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd;
- ++RegIndex) {
- if (IndirectRC->getRegister(RegIndex) == Reg)
- break;
- }
- Offset = std::max(Offset, (int)RegIndex);
- }
-
- return Offset + 1;
-}
-
-int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
- int Offset = 0;
- const MachineFrameInfo *MFI = MF.getFrameInfo();
-
- // Variable sized objects are not supported
- assert(!MFI->hasVarSizedObjects());
-
- if (MFI->getNumObjects() == 0) {
- return -1;
- }
-
- unsigned IgnoredFrameReg;
- Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexReference(
- MF, -1, IgnoredFrameReg);
-
- return getIndirectIndexBegin(MF) + Offset;
-}
-
int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
switch (Channels) {
default: return Opcode;
@@ -323,35 +68,44 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
}
}
+// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
+enum SIEncodingFamily {
+ SI = 0,
+ VI = 1
+};
+
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
// header files, so we need to wrap it in a function that takes unsigned
// instead.
namespace llvm {
namespace AMDGPU {
static int getMCOpcode(uint16_t Opcode, unsigned Gen) {
- return getMCOpcodeGen(Opcode, (enum Subtarget)Gen);
+ return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
}
}
}
-// This must be kept in sync with the SISubtarget class in SIInstrInfo.td
-enum SISubtarget {
- SI = 0,
- VI = 1
-};
-
-static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) {
- switch (Gen) {
- default:
- return SI;
+static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
+ switch (ST.getGeneration()) {
+ case AMDGPUSubtarget::SOUTHERN_ISLANDS:
+ case AMDGPUSubtarget::SEA_ISLANDS:
+ return SIEncodingFamily::SI;
case AMDGPUSubtarget::VOLCANIC_ISLANDS:
- return VI;
+ return SIEncodingFamily::VI;
+
+ // FIXME: This should never be called for r600 GPUs.
+ case AMDGPUSubtarget::R600:
+ case AMDGPUSubtarget::R700:
+ case AMDGPUSubtarget::EVERGREEN:
+ case AMDGPUSubtarget::NORTHERN_ISLANDS:
+ return SIEncodingFamily::SI;
}
+
+ llvm_unreachable("Unknown subtarget generation!");
}
int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
- int MCOp = AMDGPU::getMCOpcode(
- Opcode, AMDGPUSubtargetToSISubtarget(ST.getGeneration()));
+ int MCOp = AMDGPU::getMCOpcode(Opcode, subtargetEncodingFamily(ST));
// -1 means that Opcode is already a native instruction.
if (MCOp == -1)
@@ -364,14 +118,3 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
return MCOp;
}
-
-ArrayRef<std::pair<int, const char *>>
-AMDGPUInstrInfo::getSerializableTargetIndices() const {
- static const std::pair<int, const char *> TargetIndices[] = {
- {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
- {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
- {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
- {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
- {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
- return makeArrayRef(TargetIndices);
-}
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 53e8b23b3d62..a59eafadeb93 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -13,12 +13,10 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H
-#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
-#include "AMDGPURegisterInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include <map>
#define GET_INSTRINFO_HEADER
#define GET_INSTRINFO_ENUM
@@ -39,78 +37,12 @@ class MachineInstrBuilder;
class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
private:
- const AMDGPURegisterInfo RI;
- virtual void anchor();
-protected:
const AMDGPUSubtarget &ST;
-public:
- explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
- virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
-
- bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &DstReg, unsigned &SubIdx) const override;
-
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const override;
- unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const override;
- bool hasLoadFromStackSlot(const MachineInstr *MI,
- const MachineMemOperand *&MMO,
- int &FrameIndex) const override;
- unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
- unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const;
- bool hasStoreFromStackSlot(const MachineInstr *MI,
- const MachineMemOperand *&MMO,
- int &FrameIndex) const;
-
- MachineInstr *
- convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const override;
-
-
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
-
- void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const override;
- void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const override;
-
-protected:
- MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt,
- int FrameIndex) const override;
- MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt,
- MachineInstr *LoadMI) const override;
+ virtual void anchor();
public:
- /// \returns the smallest register index that will be accessed by an indirect
- /// read or write or -1 if indirect addressing is not used by this program.
- int getIndirectIndexBegin(const MachineFunction &MF) const;
-
- /// \returns the largest register index that will be accessed by an indirect
- /// read or write or -1 if indirect addressing is not used by this program.
- int getIndirectIndexEnd(const MachineFunction &MF) const;
-
- bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
- unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
- SmallVectorImpl<MachineInstr *> &NewMIs) const override;
- bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
- SmallVectorImpl<SDNode *> &NewNodes) const override;
- unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
- bool UnfoldLoad, bool UnfoldStore,
- unsigned *LoadRegIndex = nullptr) const override;
+ explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
bool enableClusterLoads() const override;
@@ -118,81 +50,14 @@ public:
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;
- bool
- ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
- void insertNoop(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const override;
- bool isPredicated(const MachineInstr *MI) const override;
- bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
- ArrayRef<MachineOperand> Pred2) const override;
- bool DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const override;
- bool isPredicable(MachineInstr *MI) const override;
- bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
-
- // Helper functions that check the opcode for status information
- bool isRegisterStore(const MachineInstr &MI) const;
- bool isRegisterLoad(const MachineInstr &MI) const;
-
/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
int pseudoToMCOpcode(int Opcode) const;
- /// \brief Return the descriptor of the target-specific machine instruction
- /// that corresponds to the specified pseudo or native opcode.
- const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const {
- return get(pseudoToMCOpcode(Opcode));
- }
-
- ArrayRef<std::pair<int, const char *>>
- getSerializableTargetIndices() const override;
-
-//===---------------------------------------------------------------------===//
-// Pure virtual funtions to be implemented by sub-classes.
-//===---------------------------------------------------------------------===//
-
- virtual bool isMov(unsigned opcode) const = 0;
-
- /// \brief Calculate the "Indirect Address" for the given \p RegIndex and
- /// \p Channel
- ///
- /// We model indirect addressing using a virtual address space that can be
- /// accesed with loads and stores. The "Indirect Address" is the memory
- /// address in this virtual address space that maps to the given \p RegIndex
- /// and \p Channel.
- virtual unsigned calculateIndirectAddress(unsigned RegIndex,
- unsigned Channel) const = 0;
-
- /// \returns The register class to be used for loading and storing values
- /// from an "Indirect Address" .
- virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0;
-
- /// \brief Build instruction(s) for an indirect register write.
- ///
- /// \returns The instruction that performs the indirect register write
- virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg, unsigned Address,
- unsigned OffsetReg) const = 0;
-
- /// \brief Build instruction(s) for an indirect register read.
- ///
- /// \returns The instruction that performs the indirect register read
- virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg, unsigned Address,
- unsigned OffsetReg) const = 0;
-
- /// \brief Build a MOV instruction.
- virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned DstReg, unsigned SrcReg) const = 0;
-
/// \brief Given a MIMG \p Opcode that writes all 4 channels, return the
/// equivalent opcode that writes \p Channels Channels.
int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
-
};
namespace AMDGPU {
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 575dfe413658..2b13bb9079ea 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -44,6 +44,11 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4,
// AMDGPU DAG Nodes
//
+def AMDGPUconstdata_ptr : SDNode<
+ "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>,
+ SDTCisVT<0, iPTR>]>
+>;
+
// This argument to this node is a dword address.
def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
@@ -63,7 +68,7 @@ def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a) result clamped to +/- max_float.
-def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>;
+def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
@@ -183,6 +188,11 @@ def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
SDTypeProfile<0, 2, []>,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
+ SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisVec<2>]>,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+
def AMDGPUround : SDNode<"ISD::FROUND",
SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
@@ -209,6 +219,16 @@ def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp,
[]
>;
+def AMDGPUsmed3 : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp,
+ []
+>;
+
+def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
+ []
+>;
+
+def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+
def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
[SDNPHasChain, SDNPInGlue]>;
@@ -241,5 +261,8 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai
//===----------------------------------------------------------------------===//
// Call/Return DAG Nodes
//===----------------------------------------------------------------------===//
-def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
+def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 2a7ce6a47176..6761b4b5df95 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -12,7 +12,8 @@
//
//===----------------------------------------------------------------------===//
-class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction {
+class AMDGPUInst <dag outs, dag ins, string asm = "",
+ list<dag> pattern = []> : Instruction {
field bit isRegisterLoad = 0;
field bit isRegisterStore = 0;
@@ -23,15 +24,22 @@ class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instructio
let Pattern = pattern;
let Itinerary = NullALU;
+ // SoftFail is a field the disassembler can use to provide a way for
+ // instructions to not match without killing the whole decode process. It is
+ // mainly used for ARM, but Tablegen expects this field to exist or it fails
+ // to build the decode table.
+ field bits<64> SoftFail = 0;
+
+ let DecoderNamespace = Namespace;
+
let TSFlags{63} = isRegisterLoad;
let TSFlags{62} = isRegisterStore;
}
-class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
- : AMDGPUInst<outs, ins, asm, pattern> {
+class AMDGPUShaderInst <dag outs, dag ins, string asm = "",
+ list<dag> pattern = []> : AMDGPUInst<outs, ins, asm, pattern> {
field bits<32> Inst = 0xffffffff;
-
}
def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">;
@@ -41,6 +49,13 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
+// 32-bit VALU immediate operand that uses the constant bus.
+def u32kimm : Operand<i32> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_KIMM32";
+ let PrintMethod = "printU32ImmOperand";
+}
+
let OperandType = "OPERAND_IMMEDIATE" in {
def u32imm : Operand<i32> {
@@ -146,6 +161,17 @@ def COND_NULL : PatLeaf <
[{(void)N; return false;}]
>;
+
+//===----------------------------------------------------------------------===//
+// Misc. PatFrags
+//===----------------------------------------------------------------------===//
+
+class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
+ (ops node:$src0, node:$src1),
+ (op $src0, $src1),
+ [{ return N->hasOneUse(); }]
+>;
+
//===----------------------------------------------------------------------===//
// Load/Store Pattern Fragments
//===----------------------------------------------------------------------===//
@@ -168,21 +194,58 @@ def truncstorei8_private : PrivateStore <truncstorei8>;
def truncstorei16_private : PrivateStore <truncstorei16>;
def store_private : PrivateStore <store>;
-def global_store : PatFrag<(ops node:$val, node:$ptr),
- (store node:$val, node:$ptr), [{
- return isGlobalStore(dyn_cast<StoreSDNode>(N));
+class GlobalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
// Global address space loads
-def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+class GlobalLoad <SDPatternOperator op> : GlobalMemOp <
+ (ops node:$ptr), (op node:$ptr)
+>;
+
+def global_load : GlobalLoad <load>;
+
+// Global address space stores
+class GlobalStore <SDPatternOperator op> : GlobalMemOp <
+ (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
+>;
+
+def global_store : GlobalStore <store>;
+def global_store_atomic : GlobalStore<atomic_store>;
+
+
+class ConstantMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
}]>;
// Constant address space loads
-def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+class ConstantLoad <SDPatternOperator op> : ConstantMemOp <
+ (ops node:$ptr), (op node:$ptr)
+>;
+
+def constant_load : ConstantLoad<load>;
+
+class LocalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+// Local address space loads
+class LocalLoad <SDPatternOperator op> : LocalMemOp <
+ (ops node:$ptr), (op node:$ptr)
+>;
+
+class LocalStore <SDPatternOperator op> : LocalMemOp <
+ (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
+>;
+
+class FlatMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
}]>;
+class FlatLoad <SDPatternOperator op> : FlatMemOp <
+ (ops node:$ptr), (op node:$ptr)
+>;
+
class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr),
(ld_node node:$ptr), [{
LoadSDNode *L = cast<LoadSDNode>(N);
@@ -196,29 +259,14 @@ def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
}]>;
-def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
+def az_extloadi8_global : GlobalLoad <az_extloadi8>;
+def sextloadi8_global : GlobalLoad <sextloadi8>;
-def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
- return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
-}]>;
+def az_extloadi8_constant : ConstantLoad <az_extloadi8>;
+def sextloadi8_constant : ConstantLoad <sextloadi8>;
-def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
- return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
-}]>;
-
-def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
- return isLocalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
- return isLocalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
+def az_extloadi8_local : LocalLoad <az_extloadi8>;
+def sextloadi8_local : LocalLoad <sextloadi8>;
def extloadi8_private : PrivateLoad <az_extloadi8>;
def sextloadi8_private : PrivateLoad <sextloadi8>;
@@ -227,29 +275,14 @@ def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
}]>;
-def az_extloadi16_global : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
- return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
-}]>;
-
-def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
- return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
-}]>;
+def az_extloadi16_global : GlobalLoad <az_extloadi16>;
+def sextloadi16_global : GlobalLoad <sextloadi16>;
-def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
- return isLocalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
+def az_extloadi16_constant : ConstantLoad <az_extloadi16>;
+def sextloadi16_constant : ConstantLoad <sextloadi16>;
-def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
- return isLocalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
+def az_extloadi16_local : LocalLoad <az_extloadi16>;
+def sextloadi16_local : LocalLoad <sextloadi16>;
def extloadi16_private : PrivateLoad <az_extloadi16>;
def sextloadi16_private : PrivateLoad <sextloadi16>;
@@ -258,49 +291,20 @@ def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
}]>;
-def az_extloadi32_global : PatFrag<(ops node:$ptr),
- (az_extloadi32 node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
+def az_extloadi32_global : GlobalLoad <az_extloadi32>;
-def az_extloadi32_flat : PatFrag<(ops node:$ptr),
- (az_extloadi32 node:$ptr), [{
- return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
+def az_extloadi32_flat : FlatLoad <az_extloadi32>;
-def az_extloadi32_constant : PatFrag<(ops node:$ptr),
- (az_extloadi32 node:$ptr), [{
- return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
-}]>;
+def az_extloadi32_constant : ConstantLoad <az_extloadi32>;
-def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr),
- (truncstorei8 node:$val, node:$ptr), [{
- return isGlobalStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
-def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr),
- (truncstorei16 node:$val, node:$ptr), [{
- return isGlobalStore(dyn_cast<StoreSDNode>(N));
-}]>;
+def truncstorei8_global : GlobalStore <truncstorei8>;
+def truncstorei16_global : GlobalStore <truncstorei16>;
-def local_store : PatFrag<(ops node:$val, node:$ptr),
- (store node:$val, node:$ptr), [{
- return isLocalStore(dyn_cast<StoreSDNode>(N));
-}]>;
+def local_store : LocalStore <store>;
+def truncstorei8_local : LocalStore <truncstorei8>;
+def truncstorei16_local : LocalStore <truncstorei16>;
-def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr),
- (truncstorei8 node:$val, node:$ptr), [{
- return isLocalStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
-def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr),
- (truncstorei16 node:$val, node:$ptr), [{
- return isLocalStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
-def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return isLocalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
+def local_load : LocalLoad <load>;
class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
@@ -370,6 +374,12 @@ class global_binary_atomic_op<SDNode atomic_op> : PatFrag<
[{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]
>;
+class flat_binary_atomic_op<SDNode atomic_op> : PatFrag<
+ (ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}]
+>;
+
def atomic_swap_global : global_binary_atomic_op<atomic_swap>;
def atomic_add_global : global_binary_atomic_op<atomic_load_add>;
def atomic_and_global : global_binary_atomic_op<atomic_load_and>;
@@ -381,6 +391,26 @@ def atomic_umax_global : global_binary_atomic_op<atomic_load_umax>;
def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
+def atomic_cmp_swap_global : global_binary_atomic_op<AMDGPUatomic_cmp_swap>;
+def atomic_cmp_swap_global_nortn : PatFrag<
+ (ops node:$ptr, node:$value),
+ (atomic_cmp_swap_global node:$ptr, node:$value),
+ [{ return SDValue(N, 0).use_empty(); }]
+>;
+
+def atomic_swap_flat : flat_binary_atomic_op<atomic_swap>;
+def atomic_add_flat : flat_binary_atomic_op<atomic_load_add>;
+def atomic_and_flat : flat_binary_atomic_op<atomic_load_and>;
+def atomic_max_flat : flat_binary_atomic_op<atomic_load_max>;
+def atomic_min_flat : flat_binary_atomic_op<atomic_load_min>;
+def atomic_or_flat : flat_binary_atomic_op<atomic_load_or>;
+def atomic_sub_flat : flat_binary_atomic_op<atomic_load_sub>;
+def atomic_umax_flat : flat_binary_atomic_op<atomic_load_umax>;
+def atomic_umin_flat : flat_binary_atomic_op<atomic_load_umin>;
+def atomic_xor_flat : flat_binary_atomic_op<atomic_load_xor>;
+
+def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>;
+
//===----------------------------------------------------------------------===//
// Misc Pattern Fragments
//===----------------------------------------------------------------------===//
@@ -392,6 +422,7 @@ int TWO_PI_INV = 0x3e22f983;
int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
int FP32_NEG_ONE = 0xbf800000;
int FP32_ONE = 0x3f800000;
+int FP64_ONE = 0x3ff0000000000000;
}
def CONST : Constants;
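
The *_ONE constants above are raw IEEE-754 bit patterns rather than
floating-point literals. A minimal standalone C++ check (illustration only,
not part of the patch) confirms the two encodings:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // 0x3f800000 is the single-precision encoding of 1.0f
  // (sign 0, biased exponent 127, mantissa 0).
  uint32_t Bits32 = 0x3f800000;
  float F;
  std::memcpy(&F, &Bits32, sizeof(F));
  assert(F == 1.0f);

  // 0x3ff0000000000000 is the double-precision encoding of 1.0
  // (sign 0, biased exponent 1023, mantissa 0).
  uint64_t Bits64 = 0x3ff0000000000000ULL;
  double D;
  std::memcpy(&D, &Bits64, sizeof(D));
  assert(D == 1.0);
  return 0;
}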
@@ -570,6 +601,25 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat <
(BIT_ALIGN $src0, $src0, $src1)
>;
+// This matches 16 permutations of
+// max(min(x, y), min(max(x, y), z))
+class IntMed3Pat<Instruction med3Inst,
+ SDPatternOperator max,
+ SDPatternOperator max_oneuse,
+ SDPatternOperator min_oneuse> : Pat<
+ (max (min_oneuse i32:$src0, i32:$src1),
+ (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)),
+ (med3Inst $src0, $src1, $src2)
+>;
+
+let Properties = [SDNPCommutative, SDNPAssociative] in {
+def smax_oneuse : HasOneUseBinOp<smax>;
+def smin_oneuse : HasOneUseBinOp<smin>;
+def umax_oneuse : HasOneUseBinOp<umax>;
+def umin_oneuse : HasOneUseBinOp<umin>;
+} // Properties = [SDNPCommutative, SDNPAssociative]
+
+
// 24-bit arithmetic patterns
def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>;
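
The identity behind IntMed3Pat — max(min(x, y), min(max(x, y), z)) is the
median of x, y and z — can be verified exhaustively with a small standalone
C++ program (illustration only, not part of the patch):

#include <algorithm>
#include <cassert>
#include <cstdint>

// The expression the med3 pattern matches: the median of three values.
static int32_t med3(int32_t X, int32_t Y, int32_t Z) {
  return std::max(std::min(X, Y), std::min(std::max(X, Y), Z));
}

int main() {
  int32_t Vals[] = {-5, 0, 7, 7, 42};
  for (int32_t X : Vals)
    for (int32_t Y : Vals)
      for (int32_t Z : Vals) {
        int32_t Sorted[] = {X, Y, Z};
        std::sort(Sorted, Sorted + 3);
        assert(med3(X, Y, Z) == Sorted[1]); // always the middle element
      }
  return 0;
}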
@@ -587,13 +637,6 @@ def cvt_flr_i32_f32 : PatFrag <
[{ (void)N; return TM.Options.NoNaNsFPMath; }]
>;
-/*
-class UMUL24Pattern <Instruction UMUL24> : Pat <
- (mul U24:$x, U24:$y),
- (UMUL24 $x, $y)
->;
-*/
-
class IMad24Pat<Instruction Inst> : Pat <
(add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2),
(Inst $src0, $src1, $src2)
@@ -604,30 +647,6 @@ class UMad24Pat<Instruction Inst> : Pat <
(Inst $src0, $src1, $src2)
>;
-multiclass Expand24IBitOps<Instruction MulInst, Instruction AddInst> {
- def _expand_imad24 : Pat <
- (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2),
- (AddInst (MulInst $src0, $src1), $src2)
- >;
-
- def _expand_imul24 : Pat <
- (AMDGPUmul_i24 i32:$src0, i32:$src1),
- (MulInst $src0, $src1)
- >;
-}
-
-multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> {
- def _expand_umad24 : Pat <
- (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2),
- (AddInst (MulInst $src0, $src1), $src2)
- >;
-
- def _expand_umul24 : Pat <
- (AMDGPUmul_u24 i32:$src0, i32:$src1),
- (MulInst $src0, $src1)
- >;
-}
-
class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
(fdiv FP_ONE, vt:$src),
(RcpInst $src)
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
index e94bb6013d83..791872a9db40 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -20,46 +20,44 @@
using namespace llvm;
-#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
-#include "AMDGPUGenIntrinsics.inc"
-#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
-
AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo()
: TargetIntrinsicInfo() {}
-std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
- unsigned numTys) const {
- static const char *const names[] = {
+static const char *const IntrinsicNameTable[] = {
#define GET_INTRINSIC_NAME_TABLE
#include "AMDGPUGenIntrinsics.inc"
#undef GET_INTRINSIC_NAME_TABLE
- };
+};
+std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
+ unsigned numTys) const {
if (IntrID < Intrinsic::num_intrinsics) {
return nullptr;
}
assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
"Invalid intrinsic ID");
- std::string Result(names[IntrID - Intrinsic::num_intrinsics]);
+ std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]);
return Result;
}
-unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name,
+unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
unsigned Len) const {
- if (!StringRef(Name, Len).startswith("llvm."))
+ StringRef Name(NameData, Len);
+ if (!Name.startswith("llvm."))
return 0; // All intrinsics start with 'llvm.'
-#define GET_FUNCTION_RECOGNIZER
-#include "AMDGPUGenIntrinsics.inc"
-#undef GET_FUNCTION_RECOGNIZER
- AMDGPUIntrinsic::ID IntrinsicID =
- (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic;
- IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name);
-
- if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) {
- return IntrinsicID;
+ // Look for a name match in our table. If the intrinsic is not overloaded,
+ // require an exact match. If it is overloaded, require a prefix match. The
+ // AMDGPU intrinsic enum starts at Intrinsic::num_intrinsics.
+ int Idx = Intrinsic::lookupLLVMIntrinsicByName(IntrinsicNameTable, Name);
+ if (Idx >= 0) {
+ bool IsPrefixMatch = Name.size() > strlen(IntrinsicNameTable[Idx]);
+ return IsPrefixMatch == isOverloaded(Idx + 1)
+ ? Intrinsic::num_intrinsics + Idx
+ : 0;
}
+
return 0;
}
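
The exact-versus-prefix rule in lookupName can be seen in a standalone sketch
of the same scheme. The real name table comes from AMDGPUGenIntrinsics.inc;
the two entries, the Overloaded flags, and the findEntry helper below are
stand-ins for illustration:

#include <cassert>
#include <cstring>
#include <string>

static const char *const Table[] = {
  "llvm.AMDGPU.kill", // not overloaded: must match exactly
  "llvm.AMDGPU.rsq",  // overloaded: mangled as llvm.AMDGPU.rsq.f32 etc.
};
static const bool Overloaded[] = {false, true};

// Return the index of the entry that equals Name or is a dot-separated
// prefix of it, or -1 if there is none.
static int findEntry(const std::string &Name) {
  for (int I = 0; I != 2; ++I) {
    size_t Len = std::strlen(Table[I]);
    if (Name.compare(0, Len, Table[I]) == 0 &&
        (Name.size() == Len || Name[Len] == '.'))
      return I;
  }
  return -1;
}

// A prefix match is accepted only for overloaded intrinsics, an exact
// match only for non-overloaded ones, mirroring lookupName above.
static bool isRecognized(const std::string &Name) {
  int Idx = findEntry(Name);
  if (Idx < 0)
    return false;
  bool IsPrefixMatch = Name.size() > std::strlen(Table[Idx]);
  return IsPrefixMatch == Overloaded[Idx];
}

int main() {
  assert(isRecognized("llvm.AMDGPU.kill"));      // exact, not overloaded
  assert(!isRecognized("llvm.AMDGPU.kill.f32")); // suffix on non-overload
  assert(isRecognized("llvm.AMDGPU.rsq.f32"));   // mangled overload
  assert(!isRecognized("llvm.AMDGPU.rsq"));      // overload needs a suffix
  return 0;
}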
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
index 4c95b5ec0974..f4173929259c 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
@@ -11,8 +11,8 @@
/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
//
//===-----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H
-#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
#include "llvm/IR/Intrinsics.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
@@ -31,7 +31,7 @@ enum ID {
} // end namespace AMDGPUIntrinsic
-class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
+class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo {
public:
AMDGPUIntrinsicInfo();
std::string getName(unsigned IntrId, Type **Tys = nullptr,
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
index 1de3546485b1..2127391f18e7 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
@@ -12,79 +12,26 @@
//===----------------------------------------------------------------------===//
let TargetPrefix = "AMDGPU", isTarget = 1 in {
-
- def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
- def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- // This is named backwards (instead of rsq_legacy) so we don't have
- // to define it with the public builtins intrinsics. This is a
- // workaround for how intrinsic names are parsed. If the name is
- // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant
- // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name.
- def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
-
- def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
def int_AMDGPU_kilp : Intrinsic<[], [], []>;
- def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+ // Deprecated in favor of separate int_amdgcn_cube* intrinsics.
def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
- def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+
+ // Deprecated in favor of expanded bit operations
def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>;
- def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>;
-}
-
-// Legacy names for compatibility.
-let TargetPrefix = "AMDIL", isTarget = 1 in {
- def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
- def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
- def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
- def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
-}
-let TargetPrefix = "TGSI", isTarget = 1 in {
+ // Deprecated in favor of llvm.amdgcn.rsq
+ def int_AMDGPU_rsq : Intrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
+ >;
- def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>;
+ // Deprecated in favor of llvm.amdgcn.read.workdim
+ def int_AMDGPU_read_workdim : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
}
include "SIIntrinsics.td"
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index dfc652f31da5..ad8d3e4d3545 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -15,9 +15,9 @@
#include "AMDGPUMCInstLower.h"
#include "AMDGPUAsmPrinter.h"
+#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
-#include "R600InstrInfo.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -37,8 +37,14 @@
using namespace llvm;
AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st):
- Ctx(ctx), ST(st)
-{ }
+ Ctx(ctx), ST(st) { }
+
+static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) {
+ switch (MOFlags) {
+ default: return MCSymbolRefExpr::VK_None;
+ case SIInstrInfo::MO_GOTPCREL: return MCSymbolRefExpr::VK_GOTPCREL;
+ }
+}
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
@@ -70,11 +76,16 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
case MachineOperand::MO_GlobalAddress: {
const GlobalValue *GV = MO.getGlobal();
MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName()));
- MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx));
+ const MCExpr *SymExpr =
+ MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()), Ctx);
+ const MCExpr *Expr = MCBinaryExpr::createAdd(SymExpr,
+ MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+ MCOp = MCOperand::createExpr(Expr);
break;
}
case MachineOperand::MO_ExternalSymbol: {
MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName()));
+ Sym->setExternal(true);
const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
MCOp = MCOperand::createExpr(Expr);
break;
@@ -88,13 +99,13 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
AMDGPUMCInstLower MCInstLowering(OutContext, STI);
-#ifdef _DEBUG
StringRef Err;
- if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) {
- errs() << "Warning: Illegal instruction detected: " << Err << "\n";
+ if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
+ LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
+ C.emitError("Illegal instruction detected: " + Err);
MI->dump();
}
-#endif
+
if (MI->isBundle()) {
const MachineBasicBlock *MBB = MI->getParent();
MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
@@ -103,6 +114,29 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
++I;
}
} else {
+ // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder
+ // terminator instructions and should only be printed as comments.
+ if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+ if (isVerbose()) {
+ SmallVector<char, 16> BBStr;
+ raw_svector_ostream Str(BBStr);
+
+ const MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
+ const MCSymbolRefExpr *Expr
+ = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+ Expr->print(Str, MAI);
+ OutStreamer->emitRawComment(" mask branch " + BBStr);
+ }
+
+ return;
+ }
+
+ if (MI->getOpcode() == AMDGPU::SI_RETURN) {
+ if (isVerbose())
+ OutStreamer->emitRawComment(" return");
+ return;
+ }
+
MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
@@ -114,10 +148,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
raw_string_ostream DisasmStream(DisasmLine);
AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(),
- *MF->getSubtarget().getInstrInfo(),
- *MF->getSubtarget().getRegisterInfo());
- InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(),
- MF->getSubtarget());
+ *STI.getInstrInfo(),
+ *STI.getRegisterInfo());
+ InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), STI);
// Disassemble instruction/operands to hex representation.
SmallVector<MCFixup, 4> Fixups;
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/lib/Target/AMDGPU/AMDGPUMCInstLower.h
index d322fe072b2b..957dcd0de8ef 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -8,8 +8,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H
-#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
namespace llvm {
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 54137177e4c0..44516dab04f1 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -1,8 +1,5 @@
#include "AMDGPUMachineFunction.h"
-#include "AMDGPU.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Function.h"
+
using namespace llvm;
// Pin the vtable to this file.
@@ -10,11 +7,17 @@ void AMDGPUMachineFunction::anchor() {}
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
- ShaderType(ShaderType::COMPUTE),
+ KernArgSize(0),
+ MaxKernArgAlign(0),
LDSSize(0),
ABIArgOffset(0),
ScratchSize(0),
- IsKernel(true) {
+ IsKernel(MF.getFunction()->getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL ||
+ MF.getFunction()->getCallingConv() == llvm::CallingConv::SPIR_KERNEL)
+{
+}
- ShaderType = AMDGPU::getShaderType(*MF.getFunction());
+bool AMDGPUMachineFunction::isKernel() const
+{
+ return IsKernel;
}
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 46fcee874887..6b31f63e1a9d 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -1,4 +1,4 @@
-//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
+//===-- AMDGPUMachineFunction.h -----------------------------------*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -6,12 +6,9 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-/// \file
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H
-#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
#include "llvm/CodeGen/MachineFunction.h"
#include <map>
@@ -19,11 +16,25 @@
namespace llvm {
class AMDGPUMachineFunction : public MachineFunctionInfo {
+ uint64_t KernArgSize;
+ unsigned MaxKernArgAlign;
+
virtual void anchor();
- unsigned ShaderType;
public:
AMDGPUMachineFunction(const MachineFunction &MF);
+
+ uint64_t allocateKernArg(uint64_t Size, unsigned Align) {
+ assert(isPowerOf2_32(Align));
+ KernArgSize = alignTo(KernArgSize, Align);
+
+ uint64_t Result = KernArgSize;
+ KernArgSize += Size;
+
+ MaxKernArgAlign = std::max(Align, MaxKernArgAlign);
+ return Result;
+ }
+
/// A map to keep track of local memory objects and their offsets within
/// the local memory space.
std::map<const GlobalValue *, unsigned> LocalMemoryObjects;
@@ -33,14 +44,7 @@ public:
/// Start of implicit kernel args
unsigned ABIArgOffset;
- unsigned getShaderType() const {
- return ShaderType;
- }
-
- bool isKernel() const {
- // FIXME: Assume everything is a kernel until function calls are supported.
- return true;
- }
+ bool isKernel() const;
unsigned ScratchSize;
bool IsKernel;
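
allocateKernArg packs each argument at its requested power-of-two alignment
and records the largest alignment seen. A standalone model of the same
bookkeeping, with hypothetical argument sizes (illustration only):

#include <algorithm>
#include <cassert>
#include <cstdint>

struct KernArgAllocator {
  uint64_t KernArgSize = 0;
  unsigned MaxKernArgAlign = 0;

  uint64_t allocate(uint64_t Size, unsigned Align) {
    // Round the running size up to this argument's (power-of-two) alignment.
    KernArgSize = (KernArgSize + Align - 1) & ~uint64_t(Align - 1);
    uint64_t Offset = KernArgSize;
    KernArgSize += Size;
    MaxKernArgAlign = std::max(Align, MaxKernArgAlign);
    return Offset;
  }
};

int main() {
  KernArgAllocator A;
  assert(A.allocate(4, 4) == 0);    // i32 at offset 0
  assert(A.allocate(2, 2) == 4);    // i16 at offset 4
  assert(A.allocate(16, 16) == 16); // <4 x i32> padded up to offset 16
  assert(A.KernArgSize == 32 && A.MaxKernArgAlign == 16);
  return 0;
}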
diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
index 554bf1da81f5..8bc7b53435be 100644
--- a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
@@ -25,7 +25,6 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/Passes.h"
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 87d50d587059..775463809634 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -16,7 +16,8 @@
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -26,79 +27,317 @@ using namespace llvm;
namespace {
-class AMDGPUPromoteAlloca : public FunctionPass,
- public InstVisitor<AMDGPUPromoteAlloca> {
-
- static char ID;
+// FIXME: This can create globals so should be a module pass.
+class AMDGPUPromoteAlloca : public FunctionPass {
+private:
+ const TargetMachine *TM;
Module *Mod;
- const AMDGPUSubtarget &ST;
- int LocalMemAvailable;
+ const DataLayout *DL;
+ MDNode *MaxWorkGroupSizeRange;
+
+ // FIXME: This should be per-kernel.
+ uint32_t LocalMemLimit;
+ uint32_t CurrentLocalMemUsage;
+
+ bool IsAMDGCN;
+ bool IsAMDHSA;
+
+ std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
+ Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
+
+ /// BaseAlloca is the alloca root the search started from.
+ /// Val may be that alloca or a recursive user of it.
+ bool collectUsesWithPtrTypes(Value *BaseAlloca,
+ Value *Val,
+ std::vector<Value*> &WorkList) const;
+
+ /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
+ /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
+ /// Returns true if both operands are derived from the same alloca. Val should
+ /// be the same value as one of the input operands of UseInst.
+ bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
+ Instruction *UseInst,
+ int OpIdx0, int OpIdx1) const;
public:
- AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
- LocalMemAvailable(0) { }
+ static char ID;
+
+ AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
+ FunctionPass(ID),
+ TM(TM_),
+ Mod(nullptr),
+ DL(nullptr),
+ MaxWorkGroupSizeRange(nullptr),
+ LocalMemLimit(0),
+ CurrentLocalMemUsage(0),
+ IsAMDGCN(false),
+ IsAMDHSA(false) { }
+
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
- const char *getPassName() const override { return "AMDGPU Promote Alloca"; }
- void visitAlloca(AllocaInst &I);
+
+ const char *getPassName() const override {
+ return "AMDGPU Promote Alloca";
+ }
+
+ void handleAlloca(AllocaInst &I);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ }
};
} // End anonymous namespace
char AMDGPUPromoteAlloca::ID = 0;
+INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
+ "AMDGPU promote alloca to vector or LDS", false, false)
+
+char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
+
+
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
+ if (!TM)
+ return false;
+
Mod = &M;
+ DL = &Mod->getDataLayout();
+
+ // The maximum workitem id.
+ //
+ // FIXME: Should get as subtarget property. Usually runtime enforced max is
+ // 256.
+ MDBuilder MDB(Mod->getContext());
+ MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
+
+ const Triple &TT = TM->getTargetTriple();
+
+ IsAMDGCN = TT.getArch() == Triple::amdgcn;
+ IsAMDHSA = TT.getOS() == Triple::AMDHSA;
+
return false;
}
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
+ if (!TM || skipFunction(F))
+ return false;
- FunctionType *FTy = F.getFunctionType();
-
- LocalMemAvailable = ST.getLocalMemorySize();
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ if (!ST.isPromoteAllocaEnabled())
+ return false;
+ FunctionType *FTy = F.getFunctionType();
// If the function has any arguments in the local address space, then it's
// possible these arguments require the entire local memory space, so
// we cannot use local memory in the pass.
- for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
- Type *ParamTy = FTy->getParamType(i);
- if (ParamTy->isPointerTy() &&
- ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- LocalMemAvailable = 0;
- DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+ for (Type *ParamTy : FTy->params()) {
+ PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
+ if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ LocalMemLimit = 0;
+ DEBUG(dbgs() << "Function has local memory argument. Promoting to "
"local memory disabled.\n");
- break;
+ return false;
}
}
- if (LocalMemAvailable > 0) {
- // Check how much local memory is being used by global objects
- for (Module::global_iterator I = Mod->global_begin(),
- E = Mod->global_end(); I != E; ++I) {
- GlobalVariable *GV = &*I;
- PointerType *GVTy = GV->getType();
- if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ LocalMemLimit = ST.getLocalMemorySize();
+ if (LocalMemLimit == 0)
+ return false;
+
+ const DataLayout &DL = Mod->getDataLayout();
+
+ // Check how much local memory is being used by global objects
+ CurrentLocalMemUsage = 0;
+ for (GlobalVariable &GV : Mod->globals()) {
+ if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ continue;
+
+ for (const User *U : GV.users()) {
+ const Instruction *Use = dyn_cast<Instruction>(U);
+ if (!Use)
continue;
- for (Value::use_iterator U = GV->use_begin(),
- UE = GV->use_end(); U != UE; ++U) {
- Instruction *Use = dyn_cast<Instruction>(*U);
- if (!Use)
- continue;
- if (Use->getParent()->getParent() == &F)
- LocalMemAvailable -=
- Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType());
+
+ if (Use->getParent()->getParent() == &F) {
+ unsigned Align = GV.getAlignment();
+ if (Align == 0)
+ Align = DL.getABITypeAlignment(GV.getValueType());
+
+ // FIXME: Try to account for padding here. The padding is currently
+ // determined from the inverse order of uses in the function. I'm not
+ // sure if the use list order is in any way connected to this, so the
+ // total reported size is likely incorrect.
+ uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+ CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+ CurrentLocalMemUsage += AllocSize;
+ break;
}
}
}
- LocalMemAvailable = std::max(0, LocalMemAvailable);
- DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n");
+ unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
- visit(F);
+ // Restrict local memory usage so that we don't drastically reduce occupancy,
+ // unless it is already significantly reduced.
- return false;
+ // TODO: Have some sort of hint or other heuristics to guess occupancy based
+ // on other factors.
+ unsigned OccupancyHint
+ = AMDGPU::getIntegerAttribute(F, "amdgpu-max-waves-per-eu", 0);
+ if (OccupancyHint == 0)
+ OccupancyHint = 7;
+
+ // Clamp to max value.
+ OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerCU());
+
+ // Check the hint but ignore it if it's obviously wrong from the existing LDS
+ // usage.
+ MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+
+
+ // Round up to the next tier of usage.
+ unsigned MaxSizeWithWaveCount
+ = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
+
+ // Program is possibly broken by using more local mem than available.
+ if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+ return false;
+
+ LocalMemLimit = MaxSizeWithWaveCount;
+
+ DEBUG(
+ dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
+ << " Rounding size to " << MaxSizeWithWaveCount
+ << " with a maximum occupancy of " << MaxOccupancy << '\n'
+ << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+ << " available for promotion\n"
+ );
+
+ BasicBlock &EntryBB = *F.begin();
+ for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
+ AllocaInst *AI = dyn_cast<AllocaInst>(I);
+
+ ++I;
+ if (AI)
+ handleAlloca(*AI);
+ }
+
+ return true;
+}
+
+std::pair<Value *, Value *>
+AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
+ if (!IsAMDHSA) {
+ Function *LocalSizeYFn
+ = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
+ Function *LocalSizeZFn
+ = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
+
+ CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
+ CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
+
+ LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+
+ return std::make_pair(LocalSizeY, LocalSizeZ);
+ }
+
+ // We must read the size out of the dispatch pointer.
+ assert(IsAMDGCN);
+
+ // We are indexing into this struct, and want to extract the workgroup_size_*
+ // fields.
+ //
+ // typedef struct hsa_kernel_dispatch_packet_s {
+ // uint16_t header;
+ // uint16_t setup;
+ // uint16_t workgroup_size_x;
+ // uint16_t workgroup_size_y;
+ // uint16_t workgroup_size_z;
+ // uint16_t reserved0;
+ // uint32_t grid_size_x;
+ // uint32_t grid_size_y;
+ // uint32_t grid_size_z;
+ //
+ // uint32_t private_segment_size;
+ // uint32_t group_segment_size;
+ // uint64_t kernel_object;
+ //
+ // #ifdef HSA_LARGE_MODEL
+ // void *kernarg_address;
+ // #elif defined HSA_LITTLE_ENDIAN
+ // void *kernarg_address;
+ // uint32_t reserved1;
+ // #else
+ // uint32_t reserved1;
+ // void *kernarg_address;
+ // #endif
+ // uint64_t reserved2;
+ // hsa_signal_t completion_signal; // uint64_t wrapper
+ // } hsa_kernel_dispatch_packet_t
+ //
+ Function *DispatchPtrFn
+ = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
+
+ CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
+ DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
+ DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+
+ // Size of the dispatch packet struct.
+ DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
+
+ Type *I32Ty = Type::getInt32Ty(Mod->getContext());
+ Value *CastDispatchPtr = Builder.CreateBitCast(
+ DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
+
+ // We could do a single 64-bit load here, but it's likely that the basic
+ // 32-bit and extract sequence is already present, and it is probably easier
+ // to CSE this. The loads should be mergable later anyway.
+ Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
+ LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
+
+ Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
+ LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
+
+ MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
+ LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
+ LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
+ LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+
+ // Extract y component. Upper half of LoadZU should be zero already.
+ Value *Y = Builder.CreateLShr(LoadXY, 16);
+
+ return std::make_pair(Y, LoadZU);
+}
+
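
Each of the two aligned loads in getLocalSizeYZ pulls a pair of adjacent
uint16_t dispatch-packet fields out of one 32-bit word. A standalone sketch
with hypothetical sizes (little-endian layout assumed, illustration only):

#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical workgroup sizes: x = 64, y = 4, z = 1. In the word at
  // byte offset 4 of the packet, workgroup_size_x occupies the low half
  // and workgroup_size_y the high half.
  uint32_t LoadXY = (uint32_t(4) << 16) | uint32_t(64);
  // In the next word, workgroup_size_z is the low half; the high half is
  // reserved0, which is expected to be zero.
  uint32_t LoadZU = uint32_t(1);

  uint32_t Y = LoadXY >> 16;    // matches the CreateLShr(LoadXY, 16) above
  uint32_t Z = LoadZU & 0xffff; // the pass uses LoadZU directly instead

  assert(Y == 4 && Z == 1);
  return 0;
}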
+Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
+ Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
+
+ switch (N) {
+ case 0:
+ IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
+ : Intrinsic::r600_read_tidig_x;
+ break;
+ case 1:
+ IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
+ : Intrinsic::r600_read_tidig_y;
+ break;
+
+ case 2:
+ IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
+ : Intrinsic::r600_read_tidig_z;
+ break;
+ default:
+ llvm_unreachable("invalid dimension");
+ }
+
+ Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
+ CallInst *CI = Builder.CreateCall(WorkitemIdFn);
+ CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+
+ return CI;
}
static VectorType *arrayTypeToVecType(Type *ArrayTy) {
@@ -151,17 +390,16 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
}
static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
- Type *AllocaTy = Alloca->getAllocatedType();
+ ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
- DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
+ DEBUG(dbgs() << "Alloca candidate for vectorization\n");
// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
- if (!AllocaTy->isArrayTy() ||
- AllocaTy->getArrayElementType()->isVectorTy() ||
- AllocaTy->getArrayNumElements() > 4) {
-
- DEBUG(dbgs() << " Cannot convert type to vector");
+ if (!AllocaTy ||
+ AllocaTy->getElementType()->isVectorTy() ||
+ AllocaTy->getNumElements() > 4) {
+ DEBUG(dbgs() << " Cannot convert type to vector\n");
return false;
}
@@ -200,9 +438,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
DEBUG(dbgs() << " Converting alloca to vector "
<< *AllocaTy << " -> " << *VectorTy << '\n');
- for (std::vector<Value*>::iterator I = WorkList.begin(),
- E = WorkList.end(); I != E; ++I) {
- Instruction *Inst = cast<Instruction>(*I);
+ for (Value *V : WorkList) {
+ Instruction *Inst = cast<Instruction>(V);
IRBuilder<> Builder(Inst);
switch (Inst->getOpcode()) {
case Instruction::Load: {
@@ -239,44 +476,163 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
return true;
}
-static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
- bool Success = true;
+static bool isCallPromotable(CallInst *CI) {
+ // TODO: We might be able to handle some cases where the callee is a
+ // constantexpr bitcast of a function.
+ if (!CI->getCalledFunction())
+ return false;
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ if (!II)
+ return false;
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::invariant_start:
+ case Intrinsic::invariant_end:
+ case Intrinsic::invariant_group_barrier:
+ case Intrinsic::objectsize:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
+ Value *Val,
+ Instruction *Inst,
+ int OpIdx0,
+ int OpIdx1) const {
+ // Figure out which operand is the one we might not be promoting.
+ Value *OtherOp = Inst->getOperand(OpIdx0);
+ if (Val == OtherOp)
+ OtherOp = Inst->getOperand(OpIdx1);
+
+ if (isa<ConstantPointerNull>(OtherOp))
+ return true;
+
+ Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
+ if (!isa<AllocaInst>(OtherObj))
+ return false;
+
+ // TODO: We should be able to replace undefs with the right pointer type.
+
+ // TODO: If we know the other base object is another promotable
+ // alloca, not necessarily this alloca, we can do this. The
+ // important part is both must have the same address space at
+ // the end.
+ if (OtherObj != BaseAlloca) {
+ DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
+ return false;
+ }
+
+ return true;
+}
+
+bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
+ Value *BaseAlloca,
+ Value *Val,
+ std::vector<Value*> &WorkList) const {
+
for (User *User : Val->users()) {
- if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
+ if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
continue;
+
if (CallInst *CI = dyn_cast<CallInst>(User)) {
- // TODO: We might be able to handle some cases where the callee is a
- // constantexpr bitcast of a function.
- if (!CI->getCalledFunction())
+ if (!isCallPromotable(CI))
return false;
WorkList.push_back(User);
continue;
}
- // FIXME: Correctly handle ptrtoint instructions.
- Instruction *UseInst = dyn_cast<Instruction>(User);
- if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
+ Instruction *UseInst = cast<Instruction>(User);
+ if (UseInst->getOpcode() == Instruction::PtrToInt)
return false;
- if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
+ if (LoadInst *LI = dyn_cast_or_null<LoadInst>(UseInst)) {
+ if (LI->isVolatile())
+ return false;
+
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
+ if (SI->isVolatile())
+ return false;
+
// Reject if the stored value is not the pointer operand.
if (SI->getPointerOperand() != Val)
return false;
+ } else if (AtomicRMWInst *RMW = dyn_cast_or_null<AtomicRMWInst>(UseInst)) {
+ if (RMW->isVolatile())
+ return false;
+ } else if (AtomicCmpXchgInst *CAS
+ = dyn_cast_or_null<AtomicCmpXchgInst>(UseInst)) {
+ if (CAS->isVolatile())
+ return false;
+ }
+
+ // Only promote an icmp if we know that the other icmp operand
+ // is from another pointer that will also be promoted.
+ if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
+ if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
+ return false;
+
+ // May need to rewrite constant operands.
+ WorkList.push_back(ICmp);
}
if (!User->getType()->isPointerTy())
continue;
- WorkList.push_back(User);
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
+ // Be conservative if an address could be computed outside the bounds of
+ // the alloca.
+ if (!GEP->isInBounds())
+ return false;
+ }
- Success &= collectUsesWithPtrTypes(User, WorkList);
+ // Only promote a select if we know that the other select operand is from
+ // another pointer that will also be promoted.
+ if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
+ if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
+ return false;
+ }
+
+ // Repeat for phis.
+ if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
+ // TODO: Handle more complex cases. We should be able to replace loops
+ // over arrays.
+ switch (Phi->getNumIncomingValues()) {
+ case 1:
+ break;
+ case 2:
+ if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
+ return false;
+ break;
+ default:
+ return false;
+ }
+ }
+
+ WorkList.push_back(User);
+ if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
+ return false;
}
- return Success;
+
+ return true;
}
-void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
- if (!I.isStaticAlloca())
+// FIXME: Should try to pick the most likely to be profitable allocas first.
+void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
+ // Array allocations are probably not worth handling, since an allocation of
+ // the array type is the canonical form.
+ if (!I.isStaticAlloca() || I.isArrayAllocation())
return;
IRBuilder<> Builder(&I);
@@ -286,95 +642,144 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
DEBUG(dbgs() << "Trying to promote " << I << '\n');
- if (tryPromoteAllocaToVector(&I))
+ if (tryPromoteAllocaToVector(&I)) {
+ DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
+ return;
+ }
+
+ const Function &ContainingFunction = *I.getParent()->getParent();
+
+ // Don't promote the alloca to LDS for shader calling conventions as the work
+ // item ID intrinsics are not supported for these calling conventions.
+ // Furthermore, not all LDS is available for some of the stages.
+ if (AMDGPU::isShader(ContainingFunction.getCallingConv()))
return;
- DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
+ // FIXME: We should also try to get this value from the reqd_work_group_size
+ // function attribute if it is available.
+ unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
- // FIXME: This is the maximum work group size. We should try to get
- // value from the reqd_work_group_size function attribute if it is
- // available.
- unsigned WorkGroupSize = 256;
- int AllocaSize =
- WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
+ const DataLayout &DL = Mod->getDataLayout();
- if (AllocaSize > LocalMemAvailable) {
- DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
+ unsigned Align = I.getAlignment();
+ if (Align == 0)
+ Align = DL.getABITypeAlignment(I.getAllocatedType());
+
+ // FIXME: This computed padding is likely wrong since it depends on inverse
+ // usage order.
+ //
+ // FIXME: It is also possible that if we're allowed to use all of the memory
+ // we could end up using more than the maximum due to alignment padding.
+
+ uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
+ uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
+ NewSize += AllocSize;
+
+ if (NewSize > LocalMemLimit) {
+ DEBUG(dbgs() << " " << AllocSize
+ << " bytes of local memory not available to promote\n");
return;
}
+ CurrentLocalMemUsage = NewSize;
+
std::vector<Value*> WorkList;
- if (!collectUsesWithPtrTypes(&I, WorkList)) {
+ if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
DEBUG(dbgs() << " Do not know how to convert all uses\n");
return;
}
DEBUG(dbgs() << "Promoting alloca to local memory\n");
- LocalMemAvailable -= AllocaSize;
- Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
+ Function *F = I.getParent()->getParent();
+
+ Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
GlobalVariable *GV = new GlobalVariable(
- *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0,
- GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
-
- FunctionType *FTy = FunctionType::get(
- Type::getInt32Ty(Mod->getContext()), false);
- AttributeSet AttrSet;
- AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);
-
- Value *ReadLocalSizeY = Mod->getOrInsertFunction(
- "llvm.r600.read.local.size.y", FTy, AttrSet);
- Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
- "llvm.r600.read.local.size.z", FTy, AttrSet);
- Value *ReadTIDIGX = Mod->getOrInsertFunction(
- "llvm.r600.read.tidig.x", FTy, AttrSet);
- Value *ReadTIDIGY = Mod->getOrInsertFunction(
- "llvm.r600.read.tidig.y", FTy, AttrSet);
- Value *ReadTIDIGZ = Mod->getOrInsertFunction(
- "llvm.r600.read.tidig.z", FTy, AttrSet);
-
- Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {});
- Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {});
- Value *TIdX = Builder.CreateCall(ReadTIDIGX, {});
- Value *TIdY = Builder.CreateCall(ReadTIDIGY, {});
- Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {});
-
- Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
+ *Mod, GVTy, false, GlobalValue::InternalLinkage,
+ UndefValue::get(GVTy),
+ Twine(F->getName()) + Twine('.') + I.getName(),
+ nullptr,
+ GlobalVariable::NotThreadLocal,
+ AMDGPUAS::LOCAL_ADDRESS);
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ GV->setAlignment(I.getAlignment());
+
+ Value *TCntY, *TCntZ;
+
+ std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
+ Value *TIdX = getWorkitemID(Builder, 0);
+ Value *TIdY = getWorkitemID(Builder, 1);
+ Value *TIdZ = getWorkitemID(Builder, 2);
+
+ Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
Tmp0 = Builder.CreateMul(Tmp0, TIdX);
- Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
+ Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
TID = Builder.CreateAdd(TID, TIdZ);
- std::vector<Value*> Indices;
- Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
- Indices.push_back(TID);
+ Value *Indices[] = {
+ Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
+ TID
+ };
- Value *Offset = Builder.CreateGEP(GVTy, GV, Indices);
+ Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
I.mutateType(Offset->getType());
I.replaceAllUsesWith(Offset);
I.eraseFromParent();
- for (std::vector<Value*>::iterator i = WorkList.begin(),
- e = WorkList.end(); i != e; ++i) {
- Value *V = *i;
+ for (Value *V : WorkList) {
CallInst *Call = dyn_cast<CallInst>(V);
if (!Call) {
- Type *EltTy = V->getType()->getPointerElementType();
- PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+ if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
+ Value *Src0 = CI->getOperand(0);
+ Type *EltTy = Src0->getType()->getPointerElementType();
+ PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+
+ if (isa<ConstantPointerNull>(CI->getOperand(0)))
+ CI->setOperand(0, ConstantPointerNull::get(NewTy));
+
+ if (isa<ConstantPointerNull>(CI->getOperand(1)))
+ CI->setOperand(1, ConstantPointerNull::get(NewTy));
+
+ continue;
+ }
// The operand's value should be corrected on its own.
if (isa<AddrSpaceCastInst>(V))
continue;
+ Type *EltTy = V->getType()->getPointerElementType();
+ PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+
// FIXME: It doesn't really make sense to try to do this for all
// instructions.
V->mutateType(NewTy);
+
+ // Adjust the types of any constant operands.
+ if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
+ if (isa<ConstantPointerNull>(SI->getOperand(1)))
+ SI->setOperand(1, ConstantPointerNull::get(NewTy));
+
+ if (isa<ConstantPointerNull>(SI->getOperand(2)))
+ SI->setOperand(2, ConstantPointerNull::get(NewTy));
+ } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
+ for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+ if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
+ Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
+ }
+ }
+
continue;
}
IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
if (!Intr) {
+ // FIXME: What is this for? It doesn't make sense to promote arbitrary
+ // function calls. If the call is to a defined function that can also be
+ // promoted, we should be able to do this once that function is also
+ // rewritten.
+
std::vector<Type*> ArgTypes;
for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
ArgIdx != ArgEnd; ++ArgIdx) {
@@ -405,6 +810,14 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
Intr->eraseFromParent();
continue;
}
+ case Intrinsic::memmove: {
+ MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
+ Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
+ MemMove->getLength(), MemMove->getAlignment(),
+ MemMove->isVolatile());
+ Intr->eraseFromParent();
+ continue;
+ }
case Intrinsic::memset: {
MemSetInst *MemSet = cast<MemSetInst>(Intr);
Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
@@ -413,6 +826,28 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
Intr->eraseFromParent();
continue;
}
+ case Intrinsic::invariant_start:
+ case Intrinsic::invariant_end:
+ case Intrinsic::invariant_group_barrier:
+ Intr->eraseFromParent();
+ // FIXME: I think the invariant marker should still theoretically apply,
+ // but the intrinsics need to be changed to accept pointers with any
+ // address space.
+ continue;
+ case Intrinsic::objectsize: {
+ Value *Src = Intr->getOperand(0);
+ Type *SrcTy = Src->getType()->getPointerElementType();
+ Function *ObjectSize = Intrinsic::getDeclaration(Mod,
+ Intrinsic::objectsize,
+ { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
+ );
+
+ CallInst *NewCall
+ = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
+ Intr->replaceAllUsesWith(NewCall);
+ Intr->eraseFromParent();
+ continue;
+ }
default:
Intr->dump();
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
@@ -420,6 +855,6 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
}
}
-FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
- return new AMDGPUPromoteAlloca(ST);
+FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
+ return new AMDGPUPromoteAlloca(TM);
}
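
The LDS budget check in handleAlloca aligns the running usage to the
alloca's alignment and then charges one copy of the allocated type per work
item. A worked standalone example with hypothetical sizes (illustration
only, not the pass itself):

#include <cassert>
#include <cstdint>

static uint32_t alignTo(uint32_t Value, uint32_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // Hypothetical state: 100 bytes of LDS already used by globals, a
  // 32768-byte limit, and a work group of 256 work items.
  uint32_t CurrentLocalMemUsage = 100;
  uint32_t LocalMemLimit = 32768;
  uint32_t WorkGroupSize = 256;

  // Promoting an 8-byte, 8-aligned alloca needs one copy per work item.
  uint32_t Align = 8;
  uint32_t TypeAllocSize = 8;
  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align); // 104
  NewSize += WorkGroupSize * TypeAllocSize;                // 104 + 2048

  assert(NewSize == 2152);
  assert(NewSize <= LocalMemLimit); // fits, so the alloca is promoted
  return 0;
}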
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index 3ca0eca3417f..941f2d8a468a 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -24,20 +24,14 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
// they are not supported at this time.
//===----------------------------------------------------------------------===//
-const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister;
+// Dummy to not crash RegisterClassInfo.
+static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
-const MCPhysReg*
-AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs(
+ const MachineFunction *) const {
return &CalleeSavedReg;
}
-void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
- int SPAdj,
- unsigned FIOperandNum,
- RegScavenger *RS) const {
- llvm_unreachable("Subroutines not supported yet");
-}
-
unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return AMDGPU::NoRegister;
}
@@ -54,10 +48,5 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
return SubRegs[Channel];
}
-unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const {
-
- return getSubRegFromChannel(IndirectIndex);
-}
-
#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index 0344834328f6..ef51aad95dce 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -13,10 +13,9 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H
-#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
-#include "llvm/ADT/BitVector.h"
#include "llvm/Target/TargetRegisterInfo.h"
#define GET_REGINFO_HEADER
@@ -29,30 +28,14 @@ class AMDGPUSubtarget;
class TargetInstrInfo;
struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
- static const MCPhysReg CalleeSavedReg;
-
AMDGPURegisterInfo();
- BitVector getReservedRegs(const MachineFunction &MF) const override {
- assert(!"Unimplemented"); return BitVector();
- }
-
- virtual unsigned getHWRegIndex(unsigned Reg) const {
- assert(!"Unimplemented"); return 0;
- }
-
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
unsigned getSubRegFromChannel(unsigned Channel) const;
const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
- void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
- unsigned FIOperandNum,
- RegScavenger *RS) const override;
unsigned getFrameRegister(const MachineFunction &MF) const override;
-
- unsigned getIndirectSubReg(unsigned IndirectIndex) const;
-
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h b/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
new file mode 100644
index 000000000000..40f639434507
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
@@ -0,0 +1,138 @@
+//===-- AMDGPURuntimeMetadata.h - AMDGPU Runtime Metadata -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// Enums and structure types used by runtime metadata.
+///
+/// The runtime requests certain information (metadata) about kernels in
+/// order to execute them and answer queries about them.
+/// The metadata is represented as a byte stream in an ELF section of a
+/// binary (code object). The byte stream consists of key-value pairs.
+/// Each key is an 8-bit unsigned integer. Each value can be an integer,
+/// a string, or a stream of key-value pairs. There are 3 levels of key-value
+/// pair streams. At the beginning of the ELF section is the top level
+/// key-value pair stream. A kernel-level key-value pair stream starts after
+/// encountering KeyKernelBegin and ends immediately before encountering
+/// KeyKernelEnd. A kernel-argument-level key-value pair stream starts
+/// after encountering KeyArgBegin and ends immediately before encountering
+/// KeyArgEnd. A kernel-level key-value pair stream can only appear in a top
+/// level key-value pair stream. A kernel-argument-level key-value pair stream
+/// can only appear in a kernel-level key-value pair stream.
+///
+/// The format should be kept backward compatible. New enum values and bit
+/// fields should be appended at the end. It is suggested to bump up the
+/// revision number whenever the format changes and to document the change
+/// in this header.
+///
+//
+//===----------------------------------------------------------------------===//
+//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
+
+#include <stdint.h>
+
+namespace AMDGPU {
+
+namespace RuntimeMD {
+
+ // Version and revision of runtime metadata
+ const unsigned char MDVersion = 1;
+ const unsigned char MDRevision = 0;
+
+ // ELF section name containing runtime metadata
+ const char SectionName[] = ".AMDGPU.runtime_metadata";
+
+ // Enumeration values of keys in runtime metadata.
+ enum Key {
+ KeyNull = 0, // Placeholder. Ignored when encountered
+ KeyMDVersion = 1, // Runtime metadata version
+ KeyLanguage = 2, // Language
+ KeyLanguageVersion = 3, // Language version
+ KeyKernelBegin = 4, // Beginning of kernel-level stream
+ KeyKernelEnd = 5, // End of kernel-level stream
+ KeyKernelName = 6, // Kernel name
+ KeyArgBegin = 7, // Beginning of kernel-arg-level stream
+ KeyArgEnd = 8, // End of kernel-arg-level stream
+ KeyArgSize = 9, // Kernel arg size
+ KeyArgAlign = 10, // Kernel arg alignment
+ KeyArgTypeName = 11, // Kernel argument type name
+ KeyArgName = 12, // Kernel argument name
+ KeyArgTypeKind = 13, // Kernel argument type kind
+ KeyArgValueType = 14, // Kernel argument value type
+ KeyArgAddrQual = 15, // Kernel argument address qualifier
+ KeyArgAccQual = 16, // Kernel argument access qualifier
+ KeyArgIsConst = 17, // Kernel argument is const qualified
+ KeyArgIsRestrict = 18, // Kernel argument is restrict qualified
+ KeyArgIsVolatile = 19, // Kernel argument is volatile qualified
+ KeyArgIsPipe = 20, // Kernel argument is pipe qualified
+ KeyReqdWorkGroupSize = 21, // Required work group size
+ KeyWorkGroupSizeHint = 22, // Work group size hint
+ KeyVecTypeHint = 23, // Vector type hint
+ KeyKernelIndex = 24, // Kernel index for device enqueue
+ KeySGPRs = 25, // Number of SGPRs
+ KeyVGPRs = 26, // Number of VGPRs
+ KeyMinWavesPerSIMD = 27, // Minimum number of waves per SIMD
+ KeyMaxWavesPerSIMD = 28, // Maximum number of waves per SIMD
+ KeyFlatWorkGroupSizeLimits = 29, // Flat work group size limits
+ KeyMaxWorkGroupSize = 30, // Maximum work group size
+ KeyNoPartialWorkGroups = 31, // No partial work groups
+ };
+
+ enum Language : uint8_t {
+ OpenCL_C = 0,
+ HCC = 1,
+ OpenMP = 2,
+ OpenCL_CPP = 3,
+ };
+
+ enum LanguageVersion : uint16_t {
+ V100 = 100,
+ V110 = 110,
+ V120 = 120,
+ V200 = 200,
+ V210 = 210,
+ };
+
+ namespace KernelArg {
+ enum TypeKind : uint8_t {
+ Value = 0,
+ Pointer = 1,
+ Image = 2,
+ Sampler = 3,
+ Queue = 4,
+ };
+
+ enum ValueType : uint16_t {
+ Struct = 0,
+ I8 = 1,
+ U8 = 2,
+ I16 = 3,
+ U16 = 4,
+ F16 = 5,
+ I32 = 6,
+ U32 = 7,
+ F32 = 8,
+ I64 = 9,
+ U64 = 10,
+ F64 = 11,
+ };
+
+ enum AccessQualifier : uint8_t {
+ None = 0,
+ ReadOnly = 1,
+ WriteOnly = 2,
+ ReadWrite = 3,
+ };
+ } // namespace KernelArg
+} // namespace RuntimeMD
+} // namespace AMDGPU
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
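The header above fixes the key numbering and the nesting rules, but not the byte-level encoding of values. Purely as an illustration, here is a minimal, self-contained sketch of a producer for such a stream; the little-endian integers and the 32-bit length prefix on strings are assumptions, not something the header specifies, and the kernel name is made up.

// Hypothetical emitter for the key-value byte stream described above.
// Assumed (not fixed by the header): integers are little-endian and
// strings carry a 32-bit length prefix.
#include "AMDGPURuntimeMetadata.h"
#include <cstdint>
#include <string>
#include <vector>

class MDStreamWriter {
  std::vector<uint8_t> Bytes;

public:
  void emitKey(uint8_t Key) { Bytes.push_back(Key); }

  void emitU8(uint8_t Key, uint8_t V) {
    emitKey(Key);
    Bytes.push_back(V);
  }

  void emitU32(uint8_t Key, uint32_t V) {
    emitKey(Key);
    for (unsigned I = 0; I != 4; ++I) // Little-endian by assumption.
      Bytes.push_back(uint8_t(V >> (8 * I)));
  }

  void emitString(uint8_t Key, const std::string &S) {
    emitU32(Key, uint32_t(S.size())); // Length prefix by assumption.
    Bytes.insert(Bytes.end(), S.begin(), S.end());
  }

  const std::vector<uint8_t> &bytes() const { return Bytes; }
};

int main() {
  using namespace AMDGPU::RuntimeMD;
  MDStreamWriter W;
  W.emitU8(KeyMDVersion, MDVersion);    // Top-level stream.
  W.emitKey(KeyKernelBegin);            // Opens a kernel-level stream.
  W.emitString(KeyKernelName, "scale"); // "scale" is a made-up kernel name.
  W.emitKey(KeyArgBegin);               // Opens a kernel-arg-level stream.
  W.emitU32(KeyArgSize, 4);
  W.emitU32(KeyArgAlign, 4);
  W.emitKey(KeyArgEnd);                 // Closes the arg-level stream.
  W.emitKey(KeyKernelEnd);              // Closes the kernel-level stream.
  return W.bytes().empty() ? 1 : 0;
}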
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 7d70fa73da29..10fa9cf46737 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -15,7 +15,6 @@
#include "AMDGPUSubtarget.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
-#include "R600MachineScheduler.h"
#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
@@ -32,6 +31,8 @@ using namespace llvm;
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"
+AMDGPUSubtarget::~AMDGPUSubtarget() {}
+
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS) {
@@ -44,14 +45,11 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
// for SI has the unhelpful behavior that it unsets everything else if you
// disable it.
- SmallString<256> FullFS("+promote-alloca,+fp64-denormals,");
+ SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
- FullFS += "+flat-for-global,";
+ FullFS += "+flat-for-global,+unaligned-buffer-access,";
FullFS += FS;
- if (GPU == "" && TT.getArch() == Triple::amdgcn)
- GPU = "SI";
-
ParseSubtargetFeatures(GPU, FullFS);
// FIXME: I don't think Evergreen has any useful support for
@@ -61,52 +59,142 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
FP32Denormals = false;
FP64Denormals = false;
}
+
+ // Set defaults if needed.
+ if (MaxPrivateElementSize == 0)
+ MaxPrivateElementSize = 4;
+
return *this;
}
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- TargetMachine &TM)
- : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false),
- DumpCode(false), R600ALUInst(false), HasVertexCache(false),
- TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
- FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
- CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false),
- EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true),
- EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false),
- EnableXNACK(false),
- WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
- EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
- GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
- IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
- EnableSIScheduler(false), FrameLowering(nullptr),
- InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
-
+ const TargetMachine &TM)
+ : AMDGPUGenSubtargetInfo(TT, GPU, FS),
+ TargetTriple(TT),
+ Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
+ IsaVersion(ISAVersion0_0_0),
+ WavefrontSize(64),
+ LocalMemorySize(0),
+ LDSBankCount(0),
+ MaxPrivateElementSize(0),
+
+ FastFMAF32(false),
+ HalfRate64Ops(false),
+
+ FP32Denormals(false),
+ FP64Denormals(false),
+ FPExceptions(false),
+ FlatForGlobal(false),
+ UnalignedBufferAccess(false),
+
+ EnableXNACK(false),
+ DebuggerInsertNops(false),
+ DebuggerReserveRegs(false),
+ DebuggerEmitPrologue(false),
+
+ EnableVGPRSpilling(false),
+ EnablePromoteAlloca(false),
+ EnableLoadStoreOpt(false),
+ EnableUnsafeDSOffsetFolding(false),
+ EnableSIScheduler(false),
+ DumpCode(false),
+
+ FP64(false),
+ IsGCN(false),
+ GCN1Encoding(false),
+ GCN3Encoding(false),
+ CIInsts(false),
+ SGPRInitBug(false),
+ HasSMemRealTime(false),
+ Has16BitInsts(false),
+ FlatAddressSpace(false),
+
+ R600ALUInst(false),
+ CaymanISA(false),
+ CFALUBug(false),
+ HasVertexCache(false),
+ TexVTXClauseSize(0),
+
+ FeatureDisable(false),
+ InstrItins(getInstrItineraryForCPU(GPU)) {
initializeSubtargetDependencies(TT, GPU, FS);
+}
- const unsigned MaxStackAlign = 64 * 16; // Maximum stack alignment (long16)
-
- if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- InstrInfo.reset(new R600InstrInfo(*this));
- TLInfo.reset(new R600TargetLowering(TM, *this));
-
- // FIXME: Should have R600 specific FrameLowering
- FrameLowering.reset(new AMDGPUFrameLowering(
- TargetFrameLowering::StackGrowsUp,
- MaxStackAlign,
- 0));
- } else {
- InstrInfo.reset(new SIInstrInfo(*this));
- TLInfo.reset(new SITargetLowering(TM, *this));
- FrameLowering.reset(new SIFrameLowering(
- TargetFrameLowering::StackGrowsUp,
- MaxStackAlign,
- 0));
+// FIXME: These limits are for SI. Did they change with the larger maximum LDS
+// size?
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
+ switch (NWaves) {
+ case 10:
+ return 1638;
+ case 9:
+ return 1820;
+ case 8:
+ return 2048;
+ case 7:
+ return 2340;
+ case 6:
+ return 2730;
+ case 5:
+ return 3276;
+ case 4:
+ return 4096;
+ case 3:
+ return 5461;
+ case 2:
+ return 8192;
+ default:
+ return getLocalMemorySize();
}
}
-unsigned AMDGPUSubtarget::getStackEntrySize() const {
- assert(getGeneration() <= NORTHERN_ISLANDS);
- switch(getWavefrontSize()) {
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
+ if (Bytes <= 1638)
+ return 10;
+
+ if (Bytes <= 1820)
+ return 9;
+
+ if (Bytes <= 2048)
+ return 8;
+
+ if (Bytes <= 2340)
+ return 7;
+
+ if (Bytes <= 2730)
+ return 6;
+
+ if (Bytes <= 3276)
+ return 5;
+
+ if (Bytes <= 4096)
+ return 4;
+
+ if (Bytes <= 5461)
+ return 3;
+
+ if (Bytes <= 8192)
+ return 2;
+
+ return 1;
+}
+
+R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const TargetMachine &TM) :
+ AMDGPUSubtarget(TT, GPU, FS, TM),
+ InstrInfo(*this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+ TLInfo(TM, *this) {}
+
+SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const TargetMachine &TM) :
+ AMDGPUSubtarget(TT, GPU, FS, TM),
+ InstrInfo(*this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+ TLInfo(TM, *this),
+ GISel() {}
+
+unsigned R600Subtarget::getStackEntrySize() const {
+ switch (getWavefrontSize()) {
case 16:
return 8;
case 32:
@@ -118,37 +206,36 @@ unsigned AMDGPUSubtarget::getStackEntrySize() const {
}
}
-unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const {
- switch(getGeneration()) {
- default: llvm_unreachable("ChipID unknown");
- case SEA_ISLANDS: return 12;
- }
-}
-
-AMDGPU::IsaVersion AMDGPUSubtarget::getIsaVersion() const {
- return AMDGPU::getIsaVersion(getFeatureBits());
+void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+ unsigned NumRegionInstrs) const {
+ // Track register pressure so the scheduler can try to decrease
+ // pressure once register usage is above the threshold defined by
+ // SIRegisterInfo::getRegPressureSetLimit()
+ Policy.ShouldTrackPressure = true;
+
+ // Enabling both top down and bottom up scheduling seems to give us less
+ // register spills than just using one of these approaches on its own.
+ Policy.OnlyTopDown = false;
+ Policy.OnlyBottomUp = false;
+
+ // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
+ if (!enableSIScheduler())
+ Policy.ShouldTrackLaneMasks = true;
}
-bool AMDGPUSubtarget::isVGPRSpillingEnabled(
- const SIMachineFunctionInfo *MFI) const {
- return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling;
+bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
+ return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}
-void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- MachineInstr *begin,
- MachineInstr *end,
- unsigned NumRegionInstrs) const {
- if (getGeneration() >= SOUTHERN_ISLANDS) {
-
- // Track register pressure so the scheduler can try to decrease
- // pressure once register usage is above the threshold defined by
- // SIRegisterInfo::getRegPressureSetLimit()
- Policy.ShouldTrackPressure = true;
-
- // Enabling both top down and bottom up scheduling seems to give us less
- // register spills than just using one of these approaches on its own.
- Policy.OnlyTopDown = false;
- Policy.OnlyBottomUp = false;
+unsigned SISubtarget::getAmdKernelCodeChipID() const {
+ switch (getGeneration()) {
+ case SEA_ISLANDS:
+ return 12;
+ default:
+ llvm_unreachable("ChipID unknown");
}
}
+AMDGPU::IsaVersion SISubtarget::getIsaVersion() const {
+ return AMDGPU::getIsaVersion(getFeatureBits());
+}
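The two LDS helpers added above encode the same table in both directions: getMaxLocalMemSizeWithWaveCount maps a wave count to its LDS budget, and getOccupancyWithLocalMemSize maps a usage back to an occupancy. A standalone sketch of that round-trip property, mirroring the constants from the switch (32768 stands in for getLocalMemorySize(), a plausible SI value):

// Sanity check that the wave-count/LDS tables above are mutual inverses.
#include <cassert>
#include <cstdint>

static unsigned maxLDSForWaves(unsigned NWaves) {
  switch (NWaves) {
  case 10: return 1638;
  case 9:  return 1820;
  case 8:  return 2048;
  case 7:  return 2340;
  case 6:  return 2730;
  case 5:  return 3276;
  case 4:  return 4096;
  case 3:  return 5461;
  case 2:  return 8192;
  default: return 32768; // Stand-in for getLocalMemorySize().
  }
}

static unsigned occupancyForLDS(uint32_t Bytes) {
  for (unsigned Waves = 10; Waves >= 2; --Waves)
    if (Bytes <= maxLDSForWaves(Waves))
      return Waves;
  return 1;
}

int main() {
  // Using exactly the budget for N waves gives back an occupancy of N.
  for (unsigned Waves = 2; Waves <= 10; ++Waves)
    assert(occupancyForLDS(maxLDSForWaves(Waves)) == Waves);
  // One byte over the 10-wave budget drops the occupancy to 9 waves.
  assert(occupancyForLDS(1639) == 9);
  return 0;
}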
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 49c94f1eceb8..3fe61aa449e0 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -16,12 +16,14 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#include "AMDGPU.h"
-#include "AMDGPUFrameLowering.h"
-#include "AMDGPUInstrInfo.h"
-#include "AMDGPUISelLowering.h"
-#include "AMDGPUSubtarget.h"
+#include "R600InstrInfo.h"
+#include "R600ISelLowering.h"
+#include "R600FrameLowering.h"
+#include "SIInstrInfo.h"
+#include "SIISelLowering.h"
+#include "SIFrameLowering.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#define GET_SUBTARGETINFO_HEADER
@@ -30,9 +32,9 @@
namespace llvm {
class SIMachineFunctionInfo;
+class StringRef;
class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
-
public:
enum Generation {
R600 = 0,
@@ -45,10 +47,6 @@ public:
};
enum {
- FIXED_SGPR_COUNT_FOR_INIT_BUG = 80
- };
-
- enum {
ISAVersion0_0_0,
ISAVersion7_0_0,
ISAVersion7_0_1,
@@ -57,114 +55,116 @@ public:
ISAVersion8_0_3
};
-private:
- std::string DevName;
- bool Is64bit;
- bool DumpCode;
- bool R600ALUInst;
- bool HasVertexCache;
- short TexVTXClauseSize;
+protected:
+ // Basic subtarget description.
+ Triple TargetTriple;
Generation Gen;
- bool FP64;
- bool FP64Denormals;
- bool FP32Denormals;
+ unsigned IsaVersion;
+ unsigned WavefrontSize;
+ int LocalMemorySize;
+ int LDSBankCount;
+ unsigned MaxPrivateElementSize;
+
+ // Possibly statically set by tablegen, but may want to be overridden.
bool FastFMAF32;
- bool CaymanISA;
- bool FlatAddressSpace;
+ bool HalfRate64Ops;
+
+ // Dynamically set bits that enable features.
+ bool FP32Denormals;
+ bool FP64Denormals;
+ bool FPExceptions;
bool FlatForGlobal;
- bool EnableIRStructurizer;
+ bool UnalignedBufferAccess;
+ bool EnableXNACK;
+ bool DebuggerInsertNops;
+ bool DebuggerReserveRegs;
+ bool DebuggerEmitPrologue;
+
+ // Used as options.
+ bool EnableVGPRSpilling;
bool EnablePromoteAlloca;
- bool EnableIfCvt;
bool EnableLoadStoreOpt;
bool EnableUnsafeDSOffsetFolding;
- bool EnableXNACK;
- unsigned WavefrontSize;
- bool CFALUBug;
- int LocalMemorySize;
- bool EnableVGPRSpilling;
- bool SGPRInitBug;
+ bool EnableSIScheduler;
+ bool DumpCode;
+
+ // Subtarget properties statically set by tablegen.
+ bool FP64;
bool IsGCN;
bool GCN1Encoding;
bool GCN3Encoding;
bool CIInsts;
+ bool SGPRInitBug;
+ bool HasSMemRealTime;
+ bool Has16BitInsts;
+ bool FlatAddressSpace;
+ bool R600ALUInst;
+ bool CaymanISA;
+ bool CFALUBug;
+ bool HasVertexCache;
+ short TexVTXClauseSize;
+
+ // Dummy feature to use for assembler in tablegen.
bool FeatureDisable;
- int LDSBankCount;
- unsigned IsaVersion;
- bool EnableHugeScratchBuffer;
- bool EnableSIScheduler;
- std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
- std::unique_ptr<AMDGPUTargetLowering> TLInfo;
- std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
InstrItineraryData InstrItins;
- Triple TargetTriple;
public:
- AMDGPUSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
- TargetMachine &TM);
+ AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const TargetMachine &TM);
+ virtual ~AMDGPUSubtarget();
AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS);
- const AMDGPUFrameLowering *getFrameLowering() const override {
- return FrameLowering.get();
- }
- const AMDGPUInstrInfo *getInstrInfo() const override {
- return InstrInfo.get();
- }
- const AMDGPURegisterInfo *getRegisterInfo() const override {
- return &InstrInfo->getRegisterInfo();
- }
- AMDGPUTargetLowering *getTargetLowering() const override {
- return TLInfo.get();
- }
+ const AMDGPUInstrInfo *getInstrInfo() const override;
+ const AMDGPUFrameLowering *getFrameLowering() const override;
+ const AMDGPUTargetLowering *getTargetLowering() const override;
+ const AMDGPURegisterInfo *getRegisterInfo() const override;
+
const InstrItineraryData *getInstrItineraryData() const override {
return &InstrItins;
}
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
- bool is64bit() const {
- return Is64bit;
- }
-
- bool hasVertexCache() const {
- return HasVertexCache;
- }
-
- short getTexVTXClauseSize() const {
- return TexVTXClauseSize;
+ bool isAmdHsaOS() const {
+ return TargetTriple.getOS() == Triple::AMDHSA;
}
Generation getGeneration() const {
return Gen;
}
- bool hasHWFP64() const {
- return FP64;
+ unsigned getWavefrontSize() const {
+ return WavefrontSize;
}
- bool hasCaymanISA() const {
- return CaymanISA;
+ int getLocalMemorySize() const {
+ return LocalMemorySize;
}
- bool hasFP32Denormals() const {
- return FP32Denormals;
+ int getLDSBankCount() const {
+ return LDSBankCount;
}
- bool hasFP64Denormals() const {
- return FP64Denormals;
+ unsigned getMaxPrivateElementSize() const {
+ return MaxPrivateElementSize;
+ }
+
+ bool hasHWFP64() const {
+ return FP64;
}
bool hasFastFMAF32() const {
return FastFMAF32;
}
- bool hasFlatAddressSpace() const {
- return FlatAddressSpace;
+ bool hasHalfRate64Ops() const {
+ return HalfRate64Ops;
}
- bool useFlatForGlobal() const {
- return FlatForGlobal;
+ bool hasAddr64() const {
+ return (getGeneration() < VOLCANIC_ISLANDS);
}
bool hasBFE() const {
@@ -214,116 +214,249 @@ public:
return (getGeneration() >= EVERGREEN);
}
- bool IsIRStructurizerEnabled() const {
- return EnableIRStructurizer;
+ bool hasCaymanISA() const {
+ return CaymanISA;
}
bool isPromoteAllocaEnabled() const {
return EnablePromoteAlloca;
}
- bool isIfCvtEnabled() const {
- return EnableIfCvt;
+ bool unsafeDSOffsetFoldingEnabled() const {
+ return EnableUnsafeDSOffsetFolding;
}
- bool loadStoreOptEnabled() const {
- return EnableLoadStoreOpt;
+ bool dumpCode() const {
+ return DumpCode;
}
- bool unsafeDSOffsetFoldingEnabled() const {
- return EnableUnsafeDSOffsetFolding;
+ /// Return the amount of LDS that can be used without restricting the
+ /// occupancy to fewer than WaveCount waves.
+ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
+
+ /// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wave
+ /// count if the given LDS memory size is the only constraint.
+ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+
+ bool hasFP32Denormals() const {
+ return FP32Denormals;
}
- unsigned getWavefrontSize() const {
- return WavefrontSize;
+ bool hasFP64Denormals() const {
+ return FP64Denormals;
}
- unsigned getStackEntrySize() const;
+ bool hasFPExceptions() const {
+ return FPExceptions;
+ }
- bool hasCFAluBug() const {
- assert(getGeneration() <= NORTHERN_ISLANDS);
- return CFALUBug;
+ bool useFlatForGlobal() const {
+ return FlatForGlobal;
}
- int getLocalMemorySize() const {
- return LocalMemorySize;
+ bool hasUnalignedBufferAccess() const {
+ return UnalignedBufferAccess;
}
- bool hasSGPRInitBug() const {
- return SGPRInitBug;
+ bool isXNACKEnabled() const {
+ return EnableXNACK;
}
- int getLDSBankCount() const {
- return LDSBankCount;
+ unsigned getMaxWavesPerCU() const {
+ if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ return 10;
+
+ // FIXME: Not sure what this is for other subtargets.
+ return 8;
}
- unsigned getAmdKernelCodeChipID() const;
+ /// \brief Returns the offset in bytes from the start of the input buffer
+ /// of the first explicit kernel argument.
+ unsigned getExplicitKernelArgOffset() const {
+ return isAmdHsaOS() ? 0 : 36;
+ }
- AMDGPU::IsaVersion getIsaVersion() const;
+ unsigned getStackAlignment() const {
+ // Scratch is allocated in blocks of 256 dwords per wave.
+ return 4 * 256 / getWavefrontSize();
+ }
bool enableMachineScheduler() const override {
return true;
}
- void overrideSchedPolicy(MachineSchedPolicy &Policy,
- MachineInstr *begin, MachineInstr *end,
- unsigned NumRegionInstrs) const override;
+ bool enableSubRegLiveness() const override {
+ return true;
+ }
+};
- // Helper functions to simplify if statements
- bool isTargetELF() const {
- return false;
+class R600Subtarget final : public AMDGPUSubtarget {
+private:
+ R600InstrInfo InstrInfo;
+ R600FrameLowering FrameLowering;
+ R600TargetLowering TLInfo;
+
+public:
+ R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM);
+
+ const R600InstrInfo *getInstrInfo() const override {
+ return &InstrInfo;
}
- StringRef getDeviceName() const {
- return DevName;
+ const R600FrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
}
- bool enableHugeScratchBuffer() const {
- return EnableHugeScratchBuffer;
+ const R600TargetLowering *getTargetLowering() const override {
+ return &TLInfo;
}
- bool enableSIScheduler() const {
- return EnableSIScheduler;
+ const R600RegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
}
- bool dumpCode() const {
- return DumpCode;
+ bool hasCFAluBug() const {
+ return CFALUBug;
}
- bool r600ALUEncoding() const {
- return R600ALUInst;
+
+ bool hasVertexCache() const {
+ return HasVertexCache;
}
- bool isAmdHsaOS() const {
- return TargetTriple.getOS() == Triple::AMDHSA;
+
+ short getTexVTXClauseSize() const {
+ return TexVTXClauseSize;
}
- bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const;
- bool isXNACKEnabled() const {
- return EnableXNACK;
+ unsigned getStackEntrySize() const;
+};
+
+class SISubtarget final : public AMDGPUSubtarget {
+public:
+ enum {
+ FIXED_SGPR_COUNT_FOR_INIT_BUG = 80
+ };
+
+private:
+ SIInstrInfo InstrInfo;
+ SIFrameLowering FrameLowering;
+ SITargetLowering TLInfo;
+ std::unique_ptr<GISelAccessor> GISel;
+
+public:
+ SISubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM);
+
+ const SIInstrInfo *getInstrInfo() const override {
+ return &InstrInfo;
}
- unsigned getMaxWavesPerCU() const {
- if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return 10;
+ const SIFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
- // FIXME: Not sure what this is for other subtagets.
- llvm_unreachable("do not know max waves per CU for this subtarget.");
+ const SITargetLowering *getTargetLowering() const override {
+ return &TLInfo;
}
- bool enableSubRegLiveness() const override {
- return true;
+ const CallLowering *getCallLowering() const override {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getCallLowering();
}
- /// \brief Returns the offset in bytes from the start of the input buffer
- /// of the first explicit kernel argument.
- unsigned getExplicitKernelArgOffset() const {
- return isAmdHsaOS() ? 0 : 36;
+ const SIRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ void setGISelAccessor(GISelAccessor &GISel) {
+ this->GISel.reset(&GISel);
}
+ void overrideSchedPolicy(MachineSchedPolicy &Policy,
+ unsigned NumRegionInstrs) const override;
+
+ bool isVGPRSpillingEnabled(const Function &F) const;
+
+ unsigned getAmdKernelCodeChipID() const;
+
+ AMDGPU::IsaVersion getIsaVersion() const;
+
unsigned getMaxNumUserSGPRs() const {
return 16;
}
+
+ bool hasFlatAddressSpace() const {
+ return FlatAddressSpace;
+ }
+
+ bool hasSMemRealTime() const {
+ return HasSMemRealTime;
+ }
+
+ bool has16BitInsts() const {
+ return Has16BitInsts;
+ }
+
+ bool enableSIScheduler() const {
+ return EnableSIScheduler;
+ }
+
+ bool debuggerSupported() const {
+ return debuggerInsertNops() && debuggerReserveRegs() &&
+ debuggerEmitPrologue();
+ }
+
+ bool debuggerInsertNops() const {
+ return DebuggerInsertNops;
+ }
+
+ bool debuggerReserveRegs() const {
+ return DebuggerReserveRegs;
+ }
+
+ bool debuggerEmitPrologue() const {
+ return DebuggerEmitPrologue;
+ }
+
+ bool loadStoreOptEnabled() const {
+ return EnableLoadStoreOpt;
+ }
+
+ bool hasSGPRInitBug() const {
+ return SGPRInitBug;
+ }
};
+
+inline const AMDGPUInstrInfo *AMDGPUSubtarget::getInstrInfo() const {
+ if (getGeneration() >= SOUTHERN_ISLANDS)
+ return static_cast<const SISubtarget *>(this)->getInstrInfo();
+
+ return static_cast<const R600Subtarget *>(this)->getInstrInfo();
+}
+
+inline const AMDGPUFrameLowering *AMDGPUSubtarget::getFrameLowering() const {
+ if (getGeneration() >= SOUTHERN_ISLANDS)
+ return static_cast<const SISubtarget *>(this)->getFrameLowering();
+
+ return static_cast<const R600Subtarget *>(this)->getFrameLowering();
+}
+
+inline const AMDGPUTargetLowering *AMDGPUSubtarget::getTargetLowering() const {
+ if (getGeneration() >= SOUTHERN_ISLANDS)
+ return static_cast<const SISubtarget *>(this)->getTargetLowering();
+
+ return static_cast<const R600Subtarget *>(this)->getTargetLowering();
+}
+
+inline const AMDGPURegisterInfo *AMDGPUSubtarget::getRegisterInfo() const {
+ if (getGeneration() >= SOUTHERN_ISLANDS)
+ return static_cast<const SISubtarget *>(this)->getRegisterInfo();
+
+ return static_cast<const R600Subtarget *>(this)->getRegisterInfo();
+}
+
} // End namespace llvm
#endif
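After this split, GCN-only code asks for the derived subtarget directly instead of testing the generation at every query. A minimal sketch of such a call site (the surrounding pass scaffolding is assumed; the accessors are the ones declared above):

// Sketch: querying SI-specific properties from a MachineFunction.
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineFunction.h"

using namespace llvm;

static unsigned scratchAlignForMF(const MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  // Scratch comes in blocks of 256 dwords per wave, so with the usual
  // 64-lane wavefront getStackAlignment() yields 4 * 256 / 64 = 16 bytes.
  return ST.getStackAlignment();
}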
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 519ae5cc748d..3e53f52c689f 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -14,19 +14,23 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetMachine.h"
-#include "AMDGPUTargetObjectFile.h"
#include "AMDGPU.h"
+#include "AMDGPUCallLowering.h"
+#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
+
#include "llvm/Analysis/Passes.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/IR/LegacyPassManager.h"
@@ -34,10 +38,35 @@
#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
-#include <llvm/CodeGen/Passes.h>
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Vectorize.h"
using namespace llvm;
+static cl::opt<bool> EnableR600StructurizeCFG(
+ "r600-ir-structurize",
+ cl::desc("Use StructurizeCFG IR pass"),
+ cl::init(true));
+
+static cl::opt<bool> EnableSROA(
+ "amdgpu-sroa",
+ cl::desc("Run SROA after promote alloca pass"),
+ cl::ReallyHidden,
+ cl::init(true));
+
+static cl::opt<bool> EnableR600IfConvert(
+ "r600-if-convert",
+ cl::desc("Use if conversion pass"),
+ cl::ReallyHidden,
+ cl::init(true));
+
+// Option to disable vectorizer for tests.
+static cl::opt<bool> EnableLoadStoreVectorizer(
+ "amdgpu-load-store-vectorizer",
+ cl::desc("Enable load store vectorizer"),
+ cl::init(false),
+ cl::Hidden);
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
@@ -47,17 +76,22 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSILowerI1CopiesPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFoldOperandsPass(*PR);
- initializeSIFixSGPRLiveRangesPass(*PR);
+ initializeSIShrinkInstructionsPass(*PR);
initializeSIFixControlFlowLiveIntervalsPass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
+ initializeAMDGPUPromoteAllocaPass(*PR);
+ initializeAMDGPUCodeGenPreparePass(*PR);
+ initializeSIAnnotateControlFlowPass(*PR);
+ initializeSIDebuggerInsertNopsPass(*PR);
+ initializeSIInsertWaitsPass(*PR);
+ initializeSIWholeQuadModePass(*PR);
+ initializeSILowerControlFlowPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
- if (TT.getOS() == Triple::AMDHSA)
- return make_unique<AMDGPUHSATargetObjectFile>();
-
return make_unique<AMDGPUTargetObjectFile>();
}
@@ -73,60 +107,156 @@ static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
createSIMachineScheduler);
-static std::string computeDataLayout(const Triple &TT) {
- std::string Ret = "e-p:32:32";
-
- if (TT.getArch() == Triple::amdgcn) {
- // 32-bit private, local, and region pointers. 64-bit global and constant.
- Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
+static StringRef computeDataLayout(const Triple &TT) {
+ if (TT.getArch() == Triple::r600) {
+ // 32-bit pointers.
+ return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}
- Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
- "-v512:512-v1024:1024-v2048:2048-n32:64";
+ // 32-bit private, local, and region pointers. 64-bit global, constant and
+ // flat.
+ return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+ "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+}
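The layout strings above can be checked in isolation: constructing a DataLayout from the GCN string and querying the per-address-space pointer widths should reproduce the comment, assuming the usual AMDGPU numbering (0 private, 1 global, 2 constant, 3 local, 4 flat, 5 region). A small standalone check, not part of the patch:

// Verifies the pointer widths encoded by the amdgcn data layout string.
#include "llvm/IR/DataLayout.h"
#include <cassert>

using namespace llvm;

int main() {
  DataLayout DL("e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
                "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
                "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64");
  assert(DL.getPointerSizeInBits(0) == 32); // Private.
  assert(DL.getPointerSizeInBits(1) == 64); // Global.
  assert(DL.getPointerSizeInBits(3) == 32); // Local.
  assert(DL.getPointerSizeInBits(4) == 64); // Flat.
  return 0;
}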
+
+LLVM_READNONE
+static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
+ if (!GPU.empty())
+ return GPU;
- return Ret;
+ // HSA only supports CI+, so change the default GPU to a CI for HSA.
+ if (TT.getArch() == Triple::amdgcn)
+ return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";
+
+ return "r600";
+}
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ // The AMDGPU toolchain only supports generating shared objects, so we
+ // must always use PIC.
+ return Reloc::PIC_;
}
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
- TargetOptions Options, Reloc::Model RM,
+ TargetOptions Options,
+ Optional<Reloc::Model> RM,
CodeModel::Model CM,
CodeGenOpt::Level OptLevel)
- : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM,
- OptLevel),
- TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this),
- IntrinsicInfo() {
+ : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
+ FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
+ TLOF(createTLOF(getTargetTriple())),
+ IntrinsicInfo() {
setRequiresStructuredCFG(true);
initAsmInfo();
}
AMDGPUTargetMachine::~AMDGPUTargetMachine() { }
+StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
+ Attribute GPUAttr = F.getFnAttribute("target-cpu");
+ return GPUAttr.hasAttribute(Attribute::None) ?
+ getTargetCPU() : GPUAttr.getValueAsString();
+}
+
+StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ return FSAttr.hasAttribute(Attribute::None) ?
+ getTargetFeatureString() :
+ FSAttr.getValueAsString();
+}
+
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//
R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
- StringRef FS, StringRef CPU,
- TargetOptions Options, Reloc::Model RM,
+ StringRef CPU, StringRef FS,
+ TargetOptions Options,
+ Optional<Reloc::Model> RM,
CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {}
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+const R600Subtarget *R600TargetMachine::getSubtargetImpl(
+ const Function &F) const {
+ StringRef GPU = getGPUName(F);
+ StringRef FS = getFeatureString(F);
+
+ SmallString<128> SubtargetKey(GPU);
+ SubtargetKey.append(FS);
+
+ auto &I = SubtargetMap[SubtargetKey];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
+ }
+
+ return I.get();
+}
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+struct SIGISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ const AMDGPUCallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+};
+} // End anonymous namespace.
+#endif
+
GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
- StringRef FS, StringRef CPU,
- TargetOptions Options, Reloc::Model RM,
+ StringRef CPU, StringRef FS,
+ TargetOptions Options,
+ Optional<Reloc::Model> RM,
CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {}
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
+ StringRef GPU = getGPUName(F);
+ StringRef FS = getFeatureString(F);
+
+ SmallString<128> SubtargetKey(GPU);
+ SubtargetKey.append(FS);
+
+ auto &I = SubtargetMap[SubtargetKey];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
+ GISel->CallLoweringInfo.reset(
+ new AMDGPUCallLowering(*I->getTargetLowering()));
+#endif
+
+ I->setGISelAccessor(*GISel);
+ }
+
+ return I.get();
+}
//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//
namespace {
+
class AMDGPUPassConfig : public TargetPassConfig {
public:
AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
@@ -142,16 +272,8 @@ public:
return getTM<AMDGPUTargetMachine>();
}
- ScheduleDAGInstrs *
- createMachineScheduler(MachineSchedContext *C) const override {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
- if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
- return createR600MachineScheduler(C);
- else if (ST.enableSIScheduler())
- return createSIMachineScheduler(C);
- return nullptr;
- }
-
+ void addEarlyCSEOrGVNPass();
+ void addStraightLineScalarOptimizationPasses();
void addIRPasses() override;
void addCodeGenPrepare() override;
bool addPreISel() override;
@@ -159,27 +281,44 @@ public:
bool addGCPasses() override;
};
-class R600PassConfig : public AMDGPUPassConfig {
+class R600PassConfig final : public AMDGPUPassConfig {
public:
R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
: AMDGPUPassConfig(TM, PM) { }
+ ScheduleDAGInstrs *createMachineScheduler(
+ MachineSchedContext *C) const override {
+ return createR600MachineScheduler(C);
+ }
+
bool addPreISel() override;
void addPreRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
};
-class GCNPassConfig : public AMDGPUPassConfig {
+class GCNPassConfig final : public AMDGPUPassConfig {
public:
GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
: AMDGPUPassConfig(TM, PM) { }
+
+ GCNTargetMachine &getGCNTargetMachine() const {
+ return getTM<GCNTargetMachine>();
+ }
+
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override;
+
bool addPreISel() override;
+ void addMachineSSAOptimization() override;
bool addInstSelector() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+ bool addIRTranslator() override;
+ bool addRegBankSelect() override;
+#endif
void addFastRegAlloc(FunctionPass *RegAllocPass) override;
void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
void addPreRegAlloc() override;
- void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
};
@@ -188,12 +327,39 @@ public:
TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(
- AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
+ return TargetTransformInfo(AMDGPUTTIImpl(this, F));
});
}
+void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
+ if (getOptLevel() == CodeGenOpt::Aggressive)
+ addPass(createGVNPass());
+ else
+ addPass(createEarlyCSEPass());
+}
+
+void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+ addPass(createSeparateConstOffsetFromGEPPass());
+ addPass(createSpeculativeExecutionPass());
+ // ReassociateGEPs exposes more opportunities for SLSR. See
+ // the example in reassociate-geps-and-slsr.ll.
+ addPass(createStraightLineStrengthReducePass());
+ // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
+ // EarlyCSE can reuse.
+ addEarlyCSEOrGVNPass();
+ // Run NaryReassociate after EarlyCSE/GVN to be more effective.
+ addPass(createNaryReassociatePass());
+ // NaryReassociate on GEPs creates redundant common expressions, so run
+ // EarlyCSE after it.
+ addPass(createEarlyCSEPass());
+}
+
void AMDGPUPassConfig::addIRPasses() {
+ // There is no reason to run these.
+ disablePass(&StackMapLivenessID);
+ disablePass(&FuncletLayoutID);
+ disablePass(&PatchableFunctionID);
+
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerPass());
@@ -207,24 +373,43 @@ void AMDGPUPassConfig::addIRPasses() {
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+ const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+ if (TM.getOptLevel() > CodeGenOpt::None) {
+ addPass(createAMDGPUPromoteAlloca(&TM));
+
+ if (EnableSROA)
+ addPass(createSROAPass());
+ }
+
+ addStraightLineScalarOptimizationPasses();
+
TargetPassConfig::addIRPasses();
+
+ // EarlyCSE is not always strong enough to clean up what LSR produces. For
+ // example, GVN can combine
+ //
+ // %0 = add %a, %b
+ // %1 = add %b, %a
+ //
+ // and
+ //
+ // %0 = shl nsw %a, 2
+ // %1 = shl %a, 2
+ //
+ // but EarlyCSE can do neither of them.
+ if (getOptLevel() != CodeGenOpt::None)
+ addEarlyCSEOrGVNPass();
}
void AMDGPUPassConfig::addCodeGenPrepare() {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
- if (ST.isPromoteAllocaEnabled()) {
- addPass(createAMDGPUPromoteAlloca(ST));
- addPass(createSROAPass());
- }
TargetPassConfig::addCodeGenPrepare();
+
+ if (EnableLoadStoreVectorizer)
+ addPass(createLoadStoreVectorizerPass());
}
-bool
-AMDGPUPassConfig::addPreISel() {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
+bool AMDGPUPassConfig::addPreISel() {
addPass(createFlattenCFGPass());
- if (ST.IsIRStructurizerEnabled())
- addPass(createStructurizeCFGPass());
return false;
}
@@ -244,7 +429,9 @@ bool AMDGPUPassConfig::addGCPasses() {
bool R600PassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
- addPass(createR600TextureIntrinsicsReplacer());
+
+ if (EnableR600StructurizeCFG)
+ addPass(createStructurizeCFGPass());
return false;
}
@@ -253,9 +440,8 @@ void R600PassConfig::addPreRegAlloc() {
}
void R600PassConfig::addPreSched2() {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
addPass(createR600EmitClauseMarkers(), false);
- if (ST.isIfCvtEnabled())
+ if (EnableR600IfConvert)
addPass(&IfConverterID, false);
addPass(createR600ClauseMergePass(*TM), false);
}
@@ -276,32 +462,62 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
// GCN Pass Setup
//===----------------------------------------------------------------------===//
+ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
+ MachineSchedContext *C) const {
+ const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
+ if (ST.enableSIScheduler())
+ return createSIMachineScheduler(C);
+ return nullptr;
+}
+
bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.
addPass(&AMDGPUAnnotateKernelFeaturesID);
-
+ addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
addPass(createSinkingPass());
addPass(createSITypeRewriter());
- addPass(createSIAnnotateControlFlowPass());
addPass(createAMDGPUAnnotateUniformValues());
+ addPass(createSIAnnotateControlFlowPass());
return false;
}
+void GCNPassConfig::addMachineSSAOptimization() {
+ TargetPassConfig::addMachineSSAOptimization();
+
+ // We want to fold operands after PeepholeOptimizer has run (or as part of
+ // it), because it will eliminate extra copies making it easier to fold the
+ // real source operand. We want to eliminate dead instructions after, so that
+ // we see fewer uses of the copies. We then need to clean up the dead
+ // instructions leftover after the operands are folded as well.
+ //
+ // XXX - Can we get away without running DeadMachineInstructionElim again?
+ addPass(&SIFoldOperandsID);
+ addPass(&DeadMachineInstructionElimID);
+}
+
bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(createSILowerI1CopiesPass());
addPass(&SIFixSGPRCopiesID);
- addPass(createSIFoldOperandsPass());
return false;
}
-void GCNPassConfig::addPreRegAlloc() {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool GCNPassConfig::addIRTranslator() {
+ addPass(new IRTranslator());
+ return false;
+}
+bool GCNPassConfig::addRegBankSelect() {
+ return false;
+}
+#endif
+
+void GCNPassConfig::addPreRegAlloc() {
// This needs to be run directly before register allocation because
// earlier passes might recompute live intervals.
// TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
@@ -309,42 +525,48 @@ void GCNPassConfig::addPreRegAlloc() {
insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
}
- if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
+ if (getOptLevel() > CodeGenOpt::None) {
// Don't do this with no optimizations since it throws away debug info by
// merging nonadjacent loads.
// This should be run after scheduling, but before register allocation. It
// also needs extra copies to the address operand to be eliminated.
+
+ // FIXME: Move pre-RA and remove extra reg coalescer run.
insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
insertPass(&MachineSchedulerID, &RegisterCoalescerID);
}
- addPass(createSIShrinkInstructionsPass(), false);
+
+ addPass(createSIShrinkInstructionsPass());
+ addPass(createSIWholeQuadModePass());
}
void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
- addPass(&SIFixSGPRLiveRangesID);
TargetPassConfig::addFastRegAlloc(RegAllocPass);
}
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
- // We want to run this after LiveVariables is computed to avoid computing them
- // twice.
- // FIXME: We shouldn't disable the verifier here. r249087 introduced a failure
- // that needs to be fixed.
- insertPass(&LiveVariablesID, &SIFixSGPRLiveRangesID, /*VerifyAfter=*/false);
TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
-void GCNPassConfig::addPostRegAlloc() {
- addPass(createSIShrinkInstructionsPass(), false);
-}
-
void GCNPassConfig::addPreSched2() {
}
void GCNPassConfig::addPreEmitPass() {
- addPass(createSIInsertWaits(*TM), false);
- addPass(createSILowerControlFlowPass(*TM), false);
+ // The hazard recognizer that runs as part of the post-ra scheduler does not
+ // guarantee to be able to handle all hazards correctly. This is because if there
+ // are multiple scheduling regions in a basic block, the regions are scheduled
+ // bottom up, so when we begin to schedule a region we don't know what
+ // instructions were emitted directly before it.
+ //
+ // Here we add a stand-alone hazard recognizer pass which can handle all
+ // cases.
+ addPass(&PostRAHazardRecognizerID);
+
+ addPass(createSIInsertWaitsPass());
+ addPass(createSIShrinkInstructionsPass());
+ addPass(createSILowerControlFlowPass());
+ addPass(createSIDebuggerInsertNopsPass());
}
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
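Both new getSubtargetImpl overrides memoize on the concatenated GPU name and feature string, so two functions with identical target-cpu and target-features attributes share one subtarget while any difference creates a new one. The same pattern, reduced to standard C++ (a sketch; the struct and class names are illustrative):

// The per-function subtarget cache used above, stripped of LLVM scaffolding.
#include <map>
#include <memory>
#include <string>

struct Subtarget {
  std::string GPU, FS;
  Subtarget(std::string G, std::string F)
      : GPU(std::move(G)), FS(std::move(F)) {}
};

class TargetMachineCache {
  mutable std::map<std::string, std::unique_ptr<Subtarget>> SubtargetMap;

public:
  const Subtarget *getSubtargetImpl(const std::string &GPU,
                                    const std::string &FS) const {
    auto &I = SubtargetMap[GPU + FS]; // Same key scheme as SubtargetKey above.
    if (!I) // First request for this combination: build and cache it.
      I = std::make_unique<Subtarget>(GPU, FS);
    return I.get();
  }
};

int main() {
  TargetMachineCache TM;
  const Subtarget *A = TM.getSubtargetImpl("tahiti", "+fp64-denormals");
  const Subtarget *B = TM.getSubtargetImpl("tahiti", "+fp64-denormals");
  const Subtarget *C = TM.getSubtargetImpl("tahiti", "");
  return (A == B && A != C) ? 0 : 1; // Shared vs. distinct subtargets.
}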
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 236e3f824030..b0eb3a9a15f7 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -12,15 +12,11 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H
-#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
-#include "AMDGPUFrameLowering.h"
-#include "AMDGPUInstrInfo.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
-#include "R600ISelLowering.h"
-#include "llvm/IR/DataLayout.h"
namespace llvm {
@@ -29,23 +25,23 @@ namespace llvm {
//===----------------------------------------------------------------------===//
class AMDGPUTargetMachine : public LLVMTargetMachine {
-private:
-
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- AMDGPUSubtarget Subtarget;
AMDGPUIntrinsicInfo IntrinsicInfo;
+ StringRef getGPUName(const Function &F) const;
+ StringRef getFeatureString(const Function &F) const;
+
public:
- AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef FS,
- StringRef CPU, TargetOptions Options, Reloc::Model RM,
- CodeModel::Model CM, CodeGenOpt::Level OL);
+ AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, TargetOptions Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
~AMDGPUTargetMachine();
- const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; }
- const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override {
- return &Subtarget;
- }
+ const AMDGPUSubtarget *getSubtargetImpl() const;
+ const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override;
+
const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
return &IntrinsicInfo;
}
@@ -60,30 +56,47 @@ public:
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//
-class R600TargetMachine : public AMDGPUTargetMachine {
+class R600TargetMachine final : public AMDGPUTargetMachine {
+private:
+ mutable StringMap<std::unique_ptr<R600Subtarget>> SubtargetMap;
public:
- R600TargetMachine(const Target &T, const Triple &TT, StringRef FS,
- StringRef CPU, TargetOptions Options, Reloc::Model RM,
- CodeModel::Model CM, CodeGenOpt::Level OL);
+ R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, TargetOptions Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ const R600Subtarget *getSubtargetImpl(const Function &) const override;
};
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
-class GCNTargetMachine : public AMDGPUTargetMachine {
+class GCNTargetMachine final : public AMDGPUTargetMachine {
+private:
+ mutable StringMap<std::unique_ptr<SISubtarget>> SubtargetMap;
public:
- GCNTargetMachine(const Target &T, const Triple &TT, StringRef FS,
- StringRef CPU, TargetOptions Options, Reloc::Model RM,
- CodeModel::Model CM, CodeGenOpt::Level OL);
+ GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, TargetOptions Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ const SISubtarget *getSubtargetImpl(const Function &) const override;
};
+inline const AMDGPUSubtarget *AMDGPUTargetMachine::getSubtargetImpl(
+ const Function &F) const {
+ if (getTargetTriple().getArch() == Triple::amdgcn)
+ return static_cast<const GCNTargetMachine *>(this)->getSubtargetImpl(F);
+ return static_cast<const R600TargetMachine *>(this)->getSubtargetImpl(F);
+}
+
} // End namespace llvm
#endif
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index e050f21091ba..03d1e2c764de 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -29,59 +29,3 @@ MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM);
}
-
-//===----------------------------------------------------------------------===//
-// HSA Object File
-//===----------------------------------------------------------------------===//
-
-
-void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM){
- TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
-
- TextSection = AMDGPU::getHSATextSection(Ctx);
-
- DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx);
- DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx);
-
- RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx);
-}
-
-bool AMDGPUHSATargetObjectFile::isAgentAllocationSection(
- const char *SectionName) const {
- return cast<MCSectionELF>(DataGlobalAgentSection)
- ->getSectionName()
- .equals(SectionName);
-}
-
-bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const {
- // Read-only segments can only have agent allocation.
- return AMDGPU::isReadOnlySegment(GV) ||
- (AMDGPU::isGlobalSegment(GV) && GV->hasSection() &&
- isAgentAllocationSection(GV->getSection()));
-}
-
-bool AMDGPUHSATargetObjectFile::isProgramAllocation(
- const GlobalValue *GV) const {
- // The default for global segments is program allocation.
- return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV);
-}
-
-MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal(
- const GlobalValue *GV, SectionKind Kind,
- Mangler &Mang,
- const TargetMachine &TM) const {
- if (Kind.isText() && !GV->hasComdat())
- return getTextSection();
-
- if (AMDGPU::isGlobalSegment(GV)) {
- if (isAgentAllocation(GV))
- return DataGlobalAgentSection;
-
- if (isProgramAllocation(GV))
- return DataGlobalProgramSection;
- }
-
- return AMDGPUTargetObjectFile::SelectSectionForGlobal(GV, Kind, Mang, TM);
-}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index 921341ebb897..f530e0952a74 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -28,24 +28,6 @@ class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF {
const TargetMachine &TM) const override;
};
-class AMDGPUHSATargetObjectFile final : public AMDGPUTargetObjectFile {
-private:
- MCSection *DataGlobalAgentSection;
- MCSection *DataGlobalProgramSection;
- MCSection *RodataReadonlyAgentSection;
-
- bool isAgentAllocationSection(const char *SectionName) const;
- bool isAgentAllocation(const GlobalValue *GV) const;
- bool isProgramAllocation(const GlobalValue *GV) const;
-
-public:
- void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
-
- MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
- Mangler &Mang,
- const TargetMachine &TM) const override;
-};
-
} // end namespace llvm
#endif
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 54a003d6a9cf..3d630fe3ea9d 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
@@ -28,6 +29,7 @@ using namespace llvm;
#define DEBUG_TYPE "AMDGPUtti"
+
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
TTI::UnrollingPreferences &UP) {
UP.Threshold = 300; // Twice the default.
@@ -78,11 +80,127 @@ unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
return Vector ? 0 : 32;
}
+unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) {
+ switch (AddrSpace) {
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ case AMDGPUAS::FLAT_ADDRESS:
+ return 128;
+ case AMDGPUAS::LOCAL_ADDRESS:
+ case AMDGPUAS::REGION_ADDRESS:
+ return 64;
+ case AMDGPUAS::PRIVATE_ADDRESS:
+ return 8 * ST->getMaxPrivateElementSize();
+ default:
+ if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
+ (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
+ AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
+ (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
+ AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
+ return 128;
+ llvm_unreachable("unhandled address space");
+ }
+}
+
unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Semi-arbitrary large amount.
return 64;
}
+int AMDGPUTTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+ TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo) {
+
+ EVT OrigTy = TLI->getValueType(DL, Ty);
+ if (!OrigTy.isSimple()) {
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
+ }
+
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+ // Because we don't have any legal vector operations, but do have legal
+ // vector types, we need to account for split vectors.
+ unsigned NElts = LT.second.isVector() ?
+ LT.second.getVectorNumElements() : 1;
+
+ MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
+
+ switch (ISD) {
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA: {
+ if (SLT == MVT::i64)
+ return get64BitInstrCost() * LT.first * NElts;
+
+ // i32
+ return getFullRateInstrCost() * LT.first * NElts;
+ }
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ if (SLT == MVT::i64) {
+ // and, or and xor are typically split into 2 VALU instructions.
+ return 2 * getFullRateInstrCost() * LT.first * NElts;
+ }
+
+ return LT.first * NElts * getFullRateInstrCost();
+ }
+ case ISD::MUL: {
+ const int QuarterRateCost = getQuarterRateInstrCost();
+ if (SLT == MVT::i64) {
+ const int FullRateCost = getFullRateInstrCost();
+ return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
+ }
+
+ // i32
+ return QuarterRateCost * NElts * LT.first;
+ }
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ if (SLT == MVT::f64)
+ return LT.first * NElts * get64BitInstrCost();
+
+ if (SLT == MVT::f32 || SLT == MVT::f16)
+ return LT.first * NElts * getFullRateInstrCost();
+ break;
+
+ case ISD::FDIV:
+ case ISD::FREM:
+ // FIXME: frem should be handled separately. The fdiv in it is most of it,
+ // but the current lowering is also not entirely correct.
+ if (SLT == MVT::f64) {
+ int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
+
+ // Add cost of workaround.
+ if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ Cost += 3 * getFullRateInstrCost();
+
+ return LT.first * Cost * NElts;
+ }
+
+ // Assuming no fp32 denormals lowering.
+ if (SLT == MVT::f32 || SLT == MVT::f16) {
+ assert(!ST->hasFP32Denormals() && "will change when supported");
+ int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
+ return LT.first * NElts * Cost;
+ }
+
+ break;
+ default:
+ break;
+ }
+
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
+}
+
unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
// XXX - For some reason this isn't called for switch.
switch (Opcode) {
@@ -98,6 +216,11 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
unsigned Index) {
switch (Opcode) {
case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ // Extracts are just reads of a subregister, so are free. Inserts are
+ // considered free because we don't want to have any cost for scalarizing
+ // operations, and we don't have to copy into a different register class.
+
// Dynamic indexing isn't free and is best avoided.
return Index == ~0u ? 2 : 0;
default:
@@ -115,6 +238,9 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
// IntrinsicsAMDGPU.td
break;
+ case Intrinsic::amdgcn_workitem_id_x:
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::amdgcn_interp_p1:
case Intrinsic::amdgcn_interp_p2:
case Intrinsic::amdgcn_mbcnt_hi:
@@ -122,6 +248,31 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
case Intrinsic::r600_read_tidig_x:
case Intrinsic::r600_read_tidig_y:
case Intrinsic::r600_read_tidig_z:
+ case Intrinsic::amdgcn_image_atomic_swap:
+ case Intrinsic::amdgcn_image_atomic_add:
+ case Intrinsic::amdgcn_image_atomic_sub:
+ case Intrinsic::amdgcn_image_atomic_smin:
+ case Intrinsic::amdgcn_image_atomic_umin:
+ case Intrinsic::amdgcn_image_atomic_smax:
+ case Intrinsic::amdgcn_image_atomic_umax:
+ case Intrinsic::amdgcn_image_atomic_and:
+ case Intrinsic::amdgcn_image_atomic_or:
+ case Intrinsic::amdgcn_image_atomic_xor:
+ case Intrinsic::amdgcn_image_atomic_inc:
+ case Intrinsic::amdgcn_image_atomic_dec:
+ case Intrinsic::amdgcn_image_atomic_cmpswap:
+ case Intrinsic::amdgcn_buffer_atomic_swap:
+ case Intrinsic::amdgcn_buffer_atomic_add:
+ case Intrinsic::amdgcn_buffer_atomic_sub:
+ case Intrinsic::amdgcn_buffer_atomic_smin:
+ case Intrinsic::amdgcn_buffer_atomic_umin:
+ case Intrinsic::amdgcn_buffer_atomic_smax:
+ case Intrinsic::amdgcn_buffer_atomic_umax:
+ case Intrinsic::amdgcn_buffer_atomic_and:
+ case Intrinsic::amdgcn_buffer_atomic_or:
+ case Intrinsic::amdgcn_buffer_atomic_xor:
+ case Intrinsic::amdgcn_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_ps_live:
return true;
}
@@ -129,18 +280,17 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) {
default:
return false;
- case AMDGPUIntrinsic::SI_tid:
case AMDGPUIntrinsic::SI_fs_interp:
+ case AMDGPUIntrinsic::SI_fs_constant:
return true;
}
}
static bool isArgPassedInSGPR(const Argument *A) {
const Function *F = A->getParent();
- unsigned ShaderType = AMDGPU::getShaderType(*F);
// Arguments to compute shaders are never a source of divergence.
- if (ShaderType == ShaderType::COMPUTE)
+ if (!AMDGPU::isShader(F->getCallingConv()))
return true;
// For non-compute shaders, SGPR inputs are marked with either inreg or byval.
@@ -169,6 +319,13 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
if (const LoadInst *Load = dyn_cast<LoadInst>(V))
return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+  // Atomics are divergent because they are executed sequentially: when an
+  // atomic operation refers to the same address in each thread, each thread
+  // after the first sees the value written by the previous thread as the
+  // original value.
+ if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
+ return true;
+
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
const TargetMachine &TM = getTLI()->getTargetMachine();
return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic);
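
For context, a minimal sketch of how an analysis client consults this hook
through the TargetTransformInfo wrapper (hypothetical caller, not part of this
patch):

  // Bail out of a uniformity-based transform when the value may diverge.
  static bool isWaveUniform(const llvm::TargetTransformInfo &TTI,
                            const llvm::Value *V) {
    return !TTI.isSourceOfDivergence(V); // private loads, atomics, ids, ...
  }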
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 976afb03443b..a82a07458086 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -14,18 +14,18 @@
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H
-#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
-#include "llvm/Target/TargetLowering.h"
namespace llvm {
+class AMDGPUTargetLowering;
-class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> {
+class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
typedef BasicTTIImplBase<AMDGPUTTIImpl> BaseT;
typedef TargetTransformInfo TTI;
friend BaseT;
@@ -36,10 +36,33 @@ class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> {
const AMDGPUSubtarget *getST() const { return ST; }
const AMDGPUTargetLowering *getTLI() const { return TLI; }
+
+ static inline int getFullRateInstrCost() {
+ return TargetTransformInfo::TCC_Basic;
+ }
+
+ static inline int getHalfRateInstrCost() {
+ return 2 * TargetTransformInfo::TCC_Basic;
+ }
+
+  // TODO: The size is usually 8 bytes, but these operations take 4x as many
+  // cycles. Maybe the cost should be 2 or 4.
+ static inline int getQuarterRateInstrCost() {
+ return 3 * TargetTransformInfo::TCC_Basic;
+ }
+
+  // On some parts, normal fp64 operations are half rate; on others, quarter
+  // rate. This also applies to some integer operations.
+ inline int get64BitInstrCost() const {
+ return ST->hasHalfRate64Ops() ?
+ getHalfRateInstrCost() : getQuarterRateInstrCost();
+ }
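
Assuming TCC_Basic == 1 (its current value in TargetTransformInfo), the
helpers above evaluate as follows:

  // Hypothetical sanity check of the rate helpers:
  //   getFullRateInstrCost()    == 1
  //   getHalfRateInstrCost()    == 2
  //   getQuarterRateInstrCost() == 3
  //   get64BitInstrCost()       == 2 with half-rate 64-bit ops, else 3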
+
public:
- explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const DataLayout &DL)
- : BaseT(TM, DL), ST(TM->getSubtargetImpl()),
- TLI(ST->getTargetLowering()) {}
+ explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
// Provide value semantics. MSVC requires that we spell all of these out.
AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg)
@@ -54,17 +77,27 @@ public:
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
- return ST->hasBCNT(TyWidth) ? TTI::PSK_FastHardware : TTI::PSK_Software;
+ return TTI::PSK_FastHardware;
}
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
+ unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace);
unsigned getMaxInterleaveFactor(unsigned VF);
+ int getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+
unsigned getCFInstrCost(unsigned Opcode);
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
bool isSourceOfDivergence(const Value *V) const;
+
+ unsigned getVectorSplitCost() { return 0; }
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 917efd149e00..21de76396b16 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -50,8 +50,6 @@ STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern "
"matched");
STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern "
"matched");
-STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue "
- "pattern matched");
STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks");
STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions");
@@ -162,7 +160,7 @@ public:
bool prepare();
bool runOnMachineFunction(MachineFunction &MF) override {
- TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ TII = MF.getSubtarget<R600Subtarget>().getInstrInfo();
TRI = &TII->getRegisterInfo();
DEBUG(MF.dump(););
OrderedBlks.clear();
@@ -213,7 +211,6 @@ protected:
int getSCCNum(MachineBasicBlock *MBB) const;
MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const;
bool hasBackEdge(MachineBasicBlock *MBB) const;
- static unsigned getLoopDepth(MachineLoop *LoopRep);
bool isRetiredBlock(MachineBasicBlock *MBB) const;
bool isActiveLoophead(MachineBasicBlock *MBB) const;
PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
@@ -229,16 +226,15 @@ protected:
// Function originally from CFGStructTraits
void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
- DebugLoc DL = DebugLoc());
+ const DebugLoc &DL = DebugLoc());
MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
- DebugLoc DL = DebugLoc());
+ const DebugLoc &DL = DebugLoc());
MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode);
void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode,
- DebugLoc DL);
+ const DebugLoc &DL);
void insertCondBranchBefore(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
- DebugLoc DL);
- void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum);
+ MachineBasicBlock::iterator I, int NewOpcode,
+ int RegNum, const DebugLoc &DL);
static int getBranchNzeroOpcode(int OldOpcode);
static int getBranchZeroOpcode(int OldOpcode);
static int getContinueNzeroOpcode(int OldOpcode);
@@ -257,7 +253,6 @@ protected:
 /// instruction. Such a move instruction "belongs to" the loop backward-edge.
MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
- static MachineInstr *getContinueInstr(MachineBasicBlock *MBB);
static bool isReturnBlock(MachineBasicBlock *MBB);
static void cloneSuccessorList(MachineBasicBlock *DstMBB,
MachineBasicBlock *SrcMBB) ;
@@ -276,11 +271,7 @@ protected:
int ifPatternMatch(MachineBasicBlock *MBB);
int loopendPatternMatch();
int mergeLoop(MachineLoop *LoopRep);
- int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader);
- void handleLoopcontBlock(MachineBasicBlock *ContingMBB,
- MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
- MachineLoop *ContLoop);
/// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
/// the same loop with LoopLandInfo without explicitly keeping track of
/// loopContBlks and loopBreakBlks, this is a method to get the information.
@@ -337,13 +328,7 @@ protected:
MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
void retireBlock(MachineBasicBlock *MBB);
- void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr);
- MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
- /// This is work around solution for findNearestCommonDominator not available
- /// to post dom a proper fix should go to Dominators.h.
- MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1,
- MachineBasicBlock *MBB2);
private:
MBBInfoMap BlockInfoMap;
@@ -376,10 +361,6 @@ bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
return MBB->isSuccessor(LoopHeader);
}
-unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) {
- return LoopRep ? LoopRep->getLoopDepth() : 0;
-}
-
bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
if (It == BlockInfoMap.end())
@@ -442,7 +423,8 @@ bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
void AMDGPUCFGStructurizer::reversePredicateSetter(
MachineBasicBlock::iterator I) {
- while (I--) {
+ assert(static_cast<MachineInstr *>(I) && "Expected valid iterator");
+ for (;; --I) {
if (I->getOpcode() == AMDGPU::PRED_X) {
switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
case OPCODE_IS_ZERO_INT:
@@ -469,16 +451,17 @@ void AMDGPUCFGStructurizer::reversePredicateSetter(
}
void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
- int NewOpcode, DebugLoc DL) {
- MachineInstr *MI = MBB->getParent()
- ->CreateMachineInstr(TII->get(NewOpcode), DL);
+ int NewOpcode, const DebugLoc &DL) {
+ MachineInstr *MI =
+ MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
MBB->push_back(MI);
//assume the instruction doesn't take any reg operand ...
SHOWNEWINSTR(MI);
}
MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
- int NewOpcode, DebugLoc DL) {
+ int NewOpcode,
+ const DebugLoc &DL) {
MachineInstr *MI =
MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
if (MBB->begin() != MBB->end())
@@ -502,7 +485,7 @@ MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
}
void AMDGPUCFGStructurizer::insertCondBranchBefore(
- MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) {
+ MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) {
MachineInstr *OldMI = &(*I);
MachineBasicBlock *MBB = OldMI->getParent();
MachineFunction *MF = MBB->getParent();
@@ -514,9 +497,9 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore(
//erase later oldInstr->eraseFromParent();
}
-void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk,
- MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
- DebugLoc DL) {
+void AMDGPUCFGStructurizer::insertCondBranchBefore(
+ MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode,
+ int RegNum, const DebugLoc &DL) {
MachineFunction *MF = blk->getParent();
MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
//insert before
@@ -525,16 +508,6 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk,
SHOWNEWINSTR(NewInstr);
}
-void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB,
- int NewOpcode, int RegNum) {
- MachineFunction *MF = MBB->getParent();
- MachineInstr *NewInstr =
- MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
- MBB->push_back(NewInstr);
- MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
- SHOWNEWINSTR(NewInstr);
-}
-
int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
switch(OldOpcode) {
case AMDGPU::JUMP_COND:
@@ -664,16 +637,6 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
return nullptr;
}
-MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) {
- MachineBasicBlock::reverse_iterator It = MBB->rbegin();
- if (It != MBB->rend()) {
- MachineInstr *MI = &(*It);
- if (MI->getOpcode() == AMDGPU::CONTINUE)
- return MI;
- }
- return nullptr;
-}
-
bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
MachineInstr *MI = getReturnInstr(MBB);
bool IsReturn = (MBB->succ_size() == 0);
@@ -697,11 +660,8 @@ MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
MachineFunction *Func = MBB->getParent();
MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock();
Func->push_back(NewMBB); //insert to function
- for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end();
- It != E; ++It) {
- MachineInstr *MI = Func->CloneMachineInstr(It);
- NewMBB->push_back(MI);
- }
+ for (const MachineInstr &It : *MBB)
+ NewMBB->push_back(Func->CloneMachineInstr(&It));
return NewMBB;
}
@@ -727,7 +687,7 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
while (It != E) {
if (Pre->getOpcode() == AMDGPU::CONTINUE
&& It->getOpcode() == AMDGPU::ENDLOOP)
- ContInstr.push_back(Pre);
+ ContInstr.push_back(&*Pre);
Pre = It;
++It;
}
@@ -923,7 +883,7 @@ bool AMDGPUCFGStructurizer::run() {
if (!Finish) {
DEBUG(FuncRep->viewCFG());
- llvm_unreachable("IRREDUCIBLE_CFG");
+ report_fatal_error("IRREDUCIBLE_CFG");
}
return true;
@@ -1145,34 +1105,6 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
return 1;
}
-int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep,
- MachineBasicBlock *LoopHeader) {
- int NumCont = 0;
- SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> ContMBB;
- typedef GraphTraits<Inverse<MachineBasicBlock *> > GTIM;
- GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader),
- E = GTIM::child_end(LoopHeader);
- for (; It != E; ++It) {
- MachineBasicBlock *MBB = *It;
- if (LoopRep->contains(MBB)) {
- handleLoopcontBlock(MBB, MLI->getLoopFor(MBB),
- LoopHeader, LoopRep);
- ContMBB.push_back(MBB);
- ++NumCont;
- }
- }
-
- for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(),
- E = ContMBB.end(); It != E; ++It) {
- (*It)->removeSuccessor(LoopHeader, true);
- }
-
- numLoopcontPatternMatch += NumCont;
-
- return NumCont;
-}
-
-
bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
if (Src1MBB->succ_size() == 0) {
@@ -1413,10 +1345,10 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);
if (LandBlkHasOtherPred) {
- llvm_unreachable("Extra register needed to handle CFG");
+ report_fatal_error("Extra register needed to handle CFG");
unsigned CmpResReg =
HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
- llvm_unreachable("Extra compare instruction needed to handle CFG");
+ report_fatal_error("Extra compare instruction needed to handle CFG");
insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
CmpResReg, DebugLoc());
}
@@ -1433,7 +1365,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
 // need to unconditionally insert the assignment to ensure a path from its
 // predecessor rather than headBlk has a valid value in initReg if
// (initVal != 1).
- llvm_unreachable("Extra register needed to handle CFG");
+ report_fatal_error("Extra register needed to handle CFG");
}
insertInstrBefore(I, AMDGPU::ELSE);
@@ -1442,7 +1374,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
 // need to unconditionally insert the assignment to ensure a path from its
 // predecessor rather than headBlk has a valid value in initReg if
// (initVal != 0)
- llvm_unreachable("Extra register needed to handle CFG");
+ report_fatal_error("Extra register needed to handle CFG");
}
if (LandBlkHasOtherPred) {
@@ -1454,7 +1386,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
PE = LandBlk->pred_end(); PI != PE; ++PI) {
MachineBasicBlock *MBB = *PI;
if (MBB != TrueMBB && MBB != FalseMBB)
- llvm_unreachable("Extra register needed to handle CFG");
+ report_fatal_error("Extra register needed to handle CFG");
}
}
DEBUG(
@@ -1468,17 +1400,6 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
return NumNewBlk;
}
-void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB,
- MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
- MachineLoop *ContLoop) {
- DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber()
- << " header = BB" << ContMBB->getNumber() << "\n";
- dbgs() << "Trying to continue loop-depth = "
- << getLoopDepth(ContLoop)
- << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";);
- settleLoopcontBlock(ContingMBB, ContMBB);
-}
-
void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
MachineBasicBlock *SrcMBB) {
DEBUG(
@@ -1809,76 +1730,6 @@ void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
&& "can't retire block yet");
}
-void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep,
- MachineBasicBlock *MBB) {
- MachineBasicBlock *&TheEntry = LLInfoMap[loopRep];
- if (!MBB) {
- MBB = FuncRep->CreateMachineBasicBlock();
- FuncRep->push_back(MBB); //insert to function
- SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: ");
- }
- TheEntry = MBB;
- DEBUG(
- dbgs() << "setLoopLandBlock loop-header = BB"
- << loopRep->getHeader()->getNumber()
- << " landing-block = BB" << MBB->getNumber() << "\n";
- );
-}
-
-MachineBasicBlock *
-AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
- MachineBasicBlock *MBB2) {
-
- if (PDT->dominates(MBB1, MBB2))
- return MBB1;
- if (PDT->dominates(MBB2, MBB1))
- return MBB2;
-
- MachineDomTreeNode *Node1 = PDT->getNode(MBB1);
- MachineDomTreeNode *Node2 = PDT->getNode(MBB2);
-
- // Handle newly cloned node.
- if (!Node1 && MBB1->succ_size() == 1)
- return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2);
- if (!Node2 && MBB2->succ_size() == 1)
- return findNearestCommonPostDom(MBB1, *MBB2->succ_begin());
-
- if (!Node1 || !Node2)
- return nullptr;
-
- Node1 = Node1->getIDom();
- while (Node1) {
- if (PDT->dominates(Node1, Node2))
- return Node1->getBlock();
- Node1 = Node1->getIDom();
- }
-
- return nullptr;
-}
-
-MachineBasicBlock *
-AMDGPUCFGStructurizer::findNearestCommonPostDom(
- std::set<MachineBasicBlock *> &MBBs) {
- MachineBasicBlock *CommonDom;
- std::set<MachineBasicBlock *>::const_iterator It = MBBs.begin();
- std::set<MachineBasicBlock *>::const_iterator E = MBBs.end();
- for (CommonDom = *It; It != E && CommonDom; ++It) {
- MachineBasicBlock *MBB = *It;
- if (MBB != CommonDom)
- CommonDom = findNearestCommonPostDom(MBB, CommonDom);
- }
-
- DEBUG(
- dbgs() << "Common post dominator for exit blocks is ";
- if (CommonDom)
- dbgs() << "BB" << CommonDom->getNumber() << "\n";
- else
- dbgs() << "NULL\n";
- );
-
- return CommonDom;
-}
-
char AMDGPUCFGStructurizer::ID = 0;
} // end anonymous namespace
diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h
index a9ba60c8cbad..5d243e949fd3 100644
--- a/lib/Target/AMDGPU/AMDKernelCodeT.h
+++ b/lib/Target/AMDGPU/AMDKernelCodeT.h
@@ -44,6 +44,15 @@ enum amd_code_version_t {
AMD_CODE_VERSION_MINOR = 1
};
+// Sets the bits selected by the specified mask in the packed instance dst to val.
+#define AMD_HSA_BITS_SET(dst, mask, val) \
+ dst &= (~(1 << mask ## _SHIFT) & ~mask); \
+ dst |= (((val) << mask ## _SHIFT) & mask)
+
+// Gets the bits selected by the specified mask from the packed instance src.
+#define AMD_HSA_BITS_GET(src, mask) \
+  ((src & mask) >> mask ## _SHIFT)
+
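
For illustration, a usage sketch of the two macros (hypothetical values, not
part of this patch):

  uint32_t props = 0;
  AMD_HSA_BITS_SET(props, AMD_CODE_PROPERTY_IS_PTR64, 1);
  bool IsPtr64 = AMD_HSA_BITS_GET(props, AMD_CODE_PROPERTY_IS_PTR64); // true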
/// The values used to define the number of bytes to use for the
/// swizzle element size.
enum amd_element_byte_size_t {
@@ -118,10 +127,14 @@ enum amd_code_property_mask_t {
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,
+ AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10,
+ AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6,
+ AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT,
+
/// Control wave ID base counter for GDS ordered-append. Used to set
/// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if
/// ORDERED_APPEND_MODE also needs to be settable)
- AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10,
+ AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16,
AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,
@@ -146,7 +159,7 @@ enum amd_code_property_mask_t {
/// is generally DWORD.
///
 /// Use values from the amd_element_byte_size_t enum.
- AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11,
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17,
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,
@@ -155,7 +168,7 @@ enum amd_code_property_mask_t {
/// HSA_MACHINE_LARGE. Must also match
/// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
/// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
- AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13,
+ AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19,
AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,
@@ -167,18 +180,22 @@ enum amd_code_property_mask_t {
/// workitem_private_segment_byte_size only specifies the statically
 /// known private segment size, and additional space must be added
/// for the call stack.
- AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14,
+ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20,
AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,
/// Indicate if code generated has support for debugging.
- AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15,
+ AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21,
AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT,
- AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 15,
+ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22,
AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1,
- AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT
+ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT,
+
+ AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23,
+ AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9,
+ AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT
};
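
To make the renumbering concrete, the affected masks work out as follows
(worked values, assuming 32-bit masks):

  //   RESERVED1 = ((1 << 6) - 1) << 10 = 0x0000FC00  (bits 10-15)
  //   IS_PTR64  = ((1 << 1) - 1) << 19 = 0x00080000  (bit 19)
  //   RESERVED2 = ((1 << 9) - 1) << 23 = 0xFF800000  (bits 23-31)

The two reserved fields therefore exactly cover the gap vacated by the shifted
properties and the remaining high bits.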
/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index d9f753f40133..efcf1b23adaa 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===//
+//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ---------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,15 +7,17 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDKernelCodeT.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "AMDKernelCodeT.h"
#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDKernelCodeTUtils.h"
+#include "Utils/AMDGPUAsmUtils.h"
#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
@@ -25,16 +27,17 @@
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
@@ -42,6 +45,8 @@ namespace {
struct OptionalOperand;
+enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_TTMP, IS_SPECIAL };
+
class AMDGPUOperand : public MCParsedAsmOperand {
enum KindTy {
Token,
@@ -55,19 +60,74 @@ class AMDGPUOperand : public MCParsedAsmOperand {
public:
AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {}
- MCContext *Ctx;
+ typedef std::unique_ptr<AMDGPUOperand> Ptr;
+
+ struct Modifiers {
+ bool Abs;
+ bool Neg;
+ bool Sext;
+
+ bool hasFPModifiers() const { return Abs || Neg; }
+ bool hasIntModifiers() const { return Sext; }
+ bool hasModifiers() const { return hasFPModifiers() || hasIntModifiers(); }
+
+ int64_t getFPModifiersOperand() const {
+ int64_t Operand = 0;
+ Operand |= Abs ? SISrcMods::ABS : 0;
+ Operand |= Neg ? SISrcMods::NEG : 0;
+ return Operand;
+ }
+
+ int64_t getIntModifiersOperand() const {
+ int64_t Operand = 0;
+ Operand |= Sext ? SISrcMods::SEXT : 0;
+ return Operand;
+ }
+
+ int64_t getModifiersOperand() const {
+ assert(!(hasFPModifiers() && hasIntModifiers())
+ && "fp and int modifiers should not be used simultaneously");
+ if (hasFPModifiers()) {
+ return getFPModifiersOperand();
+ } else if (hasIntModifiers()) {
+ return getIntModifiersOperand();
+ } else {
+ return 0;
+ }
+ }
+
+ friend raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods);
+ };
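
A short sketch of how the struct encodes into the src-modifier operand
(hypothetical usage; the SISrcMods names come from SIDefines.h):

  // e.g. the source "-|v0|" carries both FP modifiers:
  AMDGPUOperand::Modifiers Mods = {/*Abs=*/true, /*Neg=*/true, /*Sext=*/false};
  assert(Mods.hasFPModifiers() && !Mods.hasIntModifiers());
  int64_t Enc = Mods.getModifiersOperand(); // SISrcMods::ABS | SISrcMods::NEG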
enum ImmTy {
ImmTyNone,
- ImmTyDSOffset0,
- ImmTyDSOffset1,
ImmTyGDS,
+ ImmTyOffen,
+ ImmTyIdxen,
+ ImmTyAddr64,
ImmTyOffset,
+ ImmTyOffset0,
+ ImmTyOffset1,
ImmTyGLC,
ImmTySLC,
ImmTyTFE,
- ImmTyClamp,
- ImmTyOMod
+ ImmTyClampSI,
+ ImmTyOModSI,
+ ImmTyDppCtrl,
+ ImmTyDppRowMask,
+ ImmTyDppBankMask,
+ ImmTyDppBoundCtrl,
+ ImmTySdwaDstSel,
+ ImmTySdwaSrc0Sel,
+ ImmTySdwaSrc1Sel,
+ ImmTySdwaDstUnused,
+ ImmTyDMask,
+ ImmTyUNorm,
+ ImmTyDA,
+ ImmTyR128,
+ ImmTyLWE,
+ ImmTyHwreg,
+ ImmTySendMsg,
};
struct TokOp {
@@ -79,11 +139,12 @@ public:
bool IsFPImm;
ImmTy Type;
int64_t Val;
+ Modifiers Mods;
};
struct RegOp {
unsigned RegNo;
- int Modifiers;
+ Modifiers Mods;
const MCRegisterInfo *TRI;
const MCSubtargetInfo *STI;
bool IsForcedVOP3;
@@ -96,175 +157,323 @@ public:
const MCExpr *Expr;
};
- void addImmOperands(MCInst &Inst, unsigned N) const {
- Inst.addOperand(MCOperand::createImm(getImm()));
+ bool isToken() const override {
+ if (Kind == Token)
+ return true;
+
+ if (Kind != Expression || !Expr)
+ return false;
+
+ // When parsing operands, we can't always tell if something was meant to be
+ // a token, like 'gds', or an expression that references a global variable.
+ // In this case, we assume the string is an expression, and if we need to
+    // interpret it as a token, then we treat the symbol name as the token.
+ return isa<MCSymbolRefExpr>(Expr);
}
- StringRef getToken() const {
- return StringRef(Tok.Data, Tok.Length);
+ bool isImm() const override {
+ return Kind == Immediate;
}
- void addRegOperands(MCInst &Inst, unsigned N) const {
- Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI)));
+ bool isInlinableImm() const {
+ if (!isImmTy(ImmTyNone)) {
+      // Only plain immediates are inlinable (e.g. the "clamp" attribute is not).
+ return false;
+ }
+ // TODO: We should avoid using host float here. It would be better to
+    // check the float bit values, which is what a few other places do.
+ // We've had bot failures before due to weird NaN support on mips hosts.
+ const float F = BitsToFloat(Imm.Val);
+ // TODO: Add 1/(2*pi) for VI
+ return (Imm.Val <= 64 && Imm.Val >= -16) ||
+ (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 ||
+ F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0);
}
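
Some concrete cases for the predicate above (worked examples; the f32 bit
patterns are plain IEEE-754):

  //   Imm.Val == 63          -> true  (integer in [-16, 64])
  //   Imm.Val == 0x3f800000  -> true  (bit pattern of 1.0f)
  //   Imm.Val == 0xbf000000  -> true  (bit pattern of -0.5f)
  //   Imm.Val == 65          -> false (neither inline integer nor inline float)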
- void addRegOrImmOperands(MCInst &Inst, unsigned N) const {
- if (isReg())
- addRegOperands(Inst, N);
- else
- addImmOperands(Inst, N);
+ bool isRegKind() const {
+ return Kind == Register;
}
- void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const {
- Inst.addOperand(MCOperand::createImm(
- Reg.Modifiers == -1 ? 0 : Reg.Modifiers));
- addRegOperands(Inst, N);
+ bool isReg() const override {
+ return isRegKind() && !Reg.Mods.hasModifiers();
+ }
+
+ bool isRegOrImmWithInputMods() const {
+ return isRegKind() || isInlinableImm();
+ }
+
+ bool isImmTy(ImmTy ImmT) const {
+ return isImm() && Imm.Type == ImmT;
+ }
+
+ bool isImmModifier() const {
+ return isImm() && Imm.Type != ImmTyNone;
+ }
+
+ bool isClampSI() const { return isImmTy(ImmTyClampSI); }
+ bool isOModSI() const { return isImmTy(ImmTyOModSI); }
+ bool isDMask() const { return isImmTy(ImmTyDMask); }
+ bool isUNorm() const { return isImmTy(ImmTyUNorm); }
+ bool isDA() const { return isImmTy(ImmTyDA); }
+  bool isR128() const { return isImmTy(ImmTyR128); }
+ bool isLWE() const { return isImmTy(ImmTyLWE); }
+ bool isOffen() const { return isImmTy(ImmTyOffen); }
+ bool isIdxen() const { return isImmTy(ImmTyIdxen); }
+ bool isAddr64() const { return isImmTy(ImmTyAddr64); }
+ bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); }
+ bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<16>(getImm()); }
+ bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); }
+ bool isGDS() const { return isImmTy(ImmTyGDS); }
+ bool isGLC() const { return isImmTy(ImmTyGLC); }
+ bool isSLC() const { return isImmTy(ImmTySLC); }
+ bool isTFE() const { return isImmTy(ImmTyTFE); }
+ bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
+ bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
+ bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
+ bool isSDWADstSel() const { return isImmTy(ImmTySdwaDstSel); }
+ bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); }
+ bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); }
+ bool isSDWADstUnused() const { return isImmTy(ImmTySdwaDstUnused); }
+
+ bool isMod() const {
+ return isClampSI() || isOModSI();
}
- void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const {
- if (isImm())
- addImmOperands(Inst, N);
- else {
- assert(isExpr());
- Inst.addOperand(MCOperand::createExpr(Expr));
- }
+ bool isRegOrImm() const {
+ return isReg() || isImm();
}
- bool defaultTokenHasSuffix() const {
- StringRef Token(Tok.Data, Tok.Length);
+ bool isRegClass(unsigned RCID) const {
+ return isReg() && Reg.TRI->getRegClass(RCID).contains(getReg());
+ }
- return Token.endswith("_e32") || Token.endswith("_e64");
+ bool isSCSrc32() const {
+ return isInlinableImm() || isRegClass(AMDGPU::SReg_32RegClassID);
}
- bool isToken() const override {
- return Kind == Token;
+ bool isSCSrc64() const {
+ return isInlinableImm() || isRegClass(AMDGPU::SReg_64RegClassID);
}
- bool isImm() const override {
- return Kind == Immediate;
+ bool isSSrc32() const {
+ return isImm() || isSCSrc32() || isExpr();
}
- bool isInlineImm() const {
- float F = BitsToFloat(Imm.Val);
- // TODO: Add 0.5pi for VI
- return isImm() && ((Imm.Val <= 64 && Imm.Val >= -16) ||
- (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 ||
- F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0));
+ bool isSSrc64() const {
+ // TODO: Find out how SALU supports extension of 32-bit literals to 64 bits.
+ // See isVSrc64().
+ return isImm() || isSCSrc64();
}
- bool isDSOffset0() const {
- assert(isImm());
- return Imm.Type == ImmTyDSOffset0;
+ bool isVCSrc32() const {
+ return isInlinableImm() || isRegClass(AMDGPU::VS_32RegClassID);
}
- bool isDSOffset1() const {
- assert(isImm());
- return Imm.Type == ImmTyDSOffset1;
+ bool isVCSrc64() const {
+ return isInlinableImm() || isRegClass(AMDGPU::VS_64RegClassID);
}
- int64_t getImm() const {
- return Imm.Val;
+ bool isVSrc32() const {
+ return isImm() || isVCSrc32();
}
- enum ImmTy getImmTy() const {
- assert(isImm());
- return Imm.Type;
+ bool isVSrc64() const {
+ // TODO: Check if the 64-bit value (coming from assembly source) can be
+    // narrowed to 32 bits (in the instruction stream). That requires knowledge
+ // of instruction type (unsigned/signed, floating or "untyped"/B64),
+ // see [AMD GCN3 ISA 6.3.1].
+    // TODO: How are 64-bit values formed from 32-bit literals in _B64 insns?
+ return isImm() || isVCSrc64();
}
- bool isRegKind() const {
- return Kind == Register;
+ bool isMem() const override {
+ return false;
}
- bool isReg() const override {
- return Kind == Register && Reg.Modifiers == -1;
+ bool isExpr() const {
+ return Kind == Expression;
}
- bool isRegWithInputMods() const {
- return Kind == Register && (Reg.IsForcedVOP3 || Reg.Modifiers != -1);
+ bool isSoppBrTarget() const {
+ return isExpr() || isImm();
}
- void setModifiers(unsigned Mods) {
- assert(isReg());
- Reg.Modifiers = Mods;
+ bool isSWaitCnt() const;
+ bool isHwreg() const;
+ bool isSendMsg() const;
+ bool isSMRDOffset() const;
+ bool isSMRDLiteralOffset() const;
+ bool isDPPCtrl() const;
+
+ StringRef getExpressionAsToken() const {
+ assert(isExpr());
+ const MCSymbolRefExpr *S = cast<MCSymbolRefExpr>(Expr);
+ return S->getSymbol().getName();
}
- bool hasModifiers() const {
- assert(isRegKind());
- return Reg.Modifiers != -1;
+
+ StringRef getToken() const {
+ assert(isToken());
+
+ if (Kind == Expression)
+ return getExpressionAsToken();
+
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ int64_t getImm() const {
+ assert(isImm());
+ return Imm.Val;
+ }
+
+ enum ImmTy getImmTy() const {
+ assert(isImm());
+ return Imm.Type;
}
unsigned getReg() const override {
return Reg.RegNo;
}
- bool isRegOrImm() const {
- return isReg() || isImm();
+ SMLoc getStartLoc() const override {
+ return StartLoc;
}
- bool isRegClass(unsigned RCID) const {
- return Reg.TRI->getRegClass(RCID).contains(getReg());
+ SMLoc getEndLoc() const override {
+ return EndLoc;
}
- bool isSCSrc32() const {
- return isInlineImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID));
+ Modifiers getModifiers() const {
+ assert(isRegKind() || isImmTy(ImmTyNone));
+ return isRegKind() ? Reg.Mods : Imm.Mods;
}
- bool isSSrc32() const {
- return isImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID));
+ void setModifiers(Modifiers Mods) {
+ assert(isRegKind() || isImmTy(ImmTyNone));
+ if (isRegKind())
+ Reg.Mods = Mods;
+ else
+ Imm.Mods = Mods;
}
- bool isSSrc64() const {
- return isImm() || isInlineImm() ||
- (isReg() && isRegClass(AMDGPU::SReg_64RegClassID));
+ bool hasModifiers() const {
+ return getModifiers().hasModifiers();
}
-
- bool isSCSrc64() const {
- return (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)) || isInlineImm();
+
+ bool hasFPModifiers() const {
+ return getModifiers().hasFPModifiers();
}
- bool isVCSrc32() const {
- return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID));
+ bool hasIntModifiers() const {
+ return getModifiers().hasIntModifiers();
}
- bool isVCSrc64() const {
- return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID));
+ void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const {
+ if (isImmTy(ImmTyNone) && ApplyModifiers && Imm.Mods.hasFPModifiers()) {
+ // Apply modifiers to immediate value
+ int64_t Val = Imm.Val;
+ bool Negate = Imm.Mods.Neg; // Only negate can get here
+ if (Imm.IsFPImm) {
+ APFloat F(BitsToFloat(Val));
+ if (Negate) {
+ F.changeSign();
+ }
+ Val = F.bitcastToAPInt().getZExtValue();
+ } else {
+ Val = Negate ? -Val : Val;
+ }
+ Inst.addOperand(MCOperand::createImm(Val));
+ } else {
+ Inst.addOperand(MCOperand::createImm(getImm()));
+ }
}
- bool isVSrc32() const {
- return isImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID));
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI)));
}
- bool isVSrc64() const {
- return isImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID));
+ void addRegOrImmOperands(MCInst &Inst, unsigned N) const {
+ if (isRegKind())
+ addRegOperands(Inst, N);
+ else if (isExpr())
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ else
+ addImmOperands(Inst, N);
}
- bool isMem() const override {
- return false;
+ void addRegOrImmWithInputModsOperands(MCInst &Inst, unsigned N) const {
+ Modifiers Mods = getModifiers();
+ Inst.addOperand(MCOperand::createImm(Mods.getModifiersOperand()));
+ if (isRegKind()) {
+ addRegOperands(Inst, N);
+ } else {
+ addImmOperands(Inst, N, false);
+ }
}
- bool isExpr() const {
- return Kind == Expression;
+ void addRegOrImmWithFPInputModsOperands(MCInst &Inst, unsigned N) const {
+ assert(!hasIntModifiers());
+ addRegOrImmWithInputModsOperands(Inst, N);
}
- bool isSoppBrTarget() const {
- return isExpr() || isImm();
+ void addRegOrImmWithIntInputModsOperands(MCInst &Inst, unsigned N) const {
+ assert(!hasFPModifiers());
+ addRegOrImmWithInputModsOperands(Inst, N);
}
- SMLoc getStartLoc() const override {
- return StartLoc;
+ void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const {
+ if (isImm())
+ addImmOperands(Inst, N);
+ else {
+ assert(isExpr());
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ }
}
- SMLoc getEndLoc() const override {
- return EndLoc;
+ void printImmTy(raw_ostream& OS, ImmTy Type) const {
+ switch (Type) {
+ case ImmTyNone: OS << "None"; break;
+ case ImmTyGDS: OS << "GDS"; break;
+ case ImmTyOffen: OS << "Offen"; break;
+ case ImmTyIdxen: OS << "Idxen"; break;
+ case ImmTyAddr64: OS << "Addr64"; break;
+ case ImmTyOffset: OS << "Offset"; break;
+ case ImmTyOffset0: OS << "Offset0"; break;
+ case ImmTyOffset1: OS << "Offset1"; break;
+ case ImmTyGLC: OS << "GLC"; break;
+ case ImmTySLC: OS << "SLC"; break;
+ case ImmTyTFE: OS << "TFE"; break;
+ case ImmTyClampSI: OS << "ClampSI"; break;
+ case ImmTyOModSI: OS << "OModSI"; break;
+ case ImmTyDppCtrl: OS << "DppCtrl"; break;
+ case ImmTyDppRowMask: OS << "DppRowMask"; break;
+ case ImmTyDppBankMask: OS << "DppBankMask"; break;
+ case ImmTyDppBoundCtrl: OS << "DppBoundCtrl"; break;
+ case ImmTySdwaDstSel: OS << "SdwaDstSel"; break;
+ case ImmTySdwaSrc0Sel: OS << "SdwaSrc0Sel"; break;
+ case ImmTySdwaSrc1Sel: OS << "SdwaSrc1Sel"; break;
+ case ImmTySdwaDstUnused: OS << "SdwaDstUnused"; break;
+ case ImmTyDMask: OS << "DMask"; break;
+ case ImmTyUNorm: OS << "UNorm"; break;
+ case ImmTyDA: OS << "DA"; break;
+ case ImmTyR128: OS << "R128"; break;
+ case ImmTyLWE: OS << "LWE"; break;
+ case ImmTyHwreg: OS << "Hwreg"; break;
+ case ImmTySendMsg: OS << "SendMsg"; break;
+ }
}
void print(raw_ostream &OS) const override {
switch (Kind) {
case Register:
- OS << "<register " << getReg() << " mods: " << Reg.Modifiers << '>';
+ OS << "<register " << getReg() << " mods: " << Reg.Mods << '>';
break;
case Immediate:
- OS << getImm();
+ OS << '<' << getImm();
+ if (getImmTy() != ImmTyNone) {
+ OS << " type: "; printImmTy(OS, getImmTy());
+ }
+ OS << " mods: " << Imm.Mods << '>';
break;
case Token:
OS << '\'' << getToken() << '\'';
@@ -275,20 +484,21 @@ public:
}
}
- static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val, SMLoc Loc,
- enum ImmTy Type = ImmTyNone,
- bool IsFPImm = false) {
+ static AMDGPUOperand::Ptr CreateImm(int64_t Val, SMLoc Loc,
+ enum ImmTy Type = ImmTyNone,
+ bool IsFPImm = false) {
auto Op = llvm::make_unique<AMDGPUOperand>(Immediate);
Op->Imm.Val = Val;
Op->Imm.IsFPImm = IsFPImm;
Op->Imm.Type = Type;
+ Op->Imm.Mods = {false, false, false};
Op->StartLoc = Loc;
Op->EndLoc = Loc;
return Op;
}
- static std::unique_ptr<AMDGPUOperand> CreateToken(StringRef Str, SMLoc Loc,
- bool HasExplicitEncodingSize = true) {
+ static AMDGPUOperand::Ptr CreateToken(StringRef Str, SMLoc Loc,
+ bool HasExplicitEncodingSize = true) {
auto Res = llvm::make_unique<AMDGPUOperand>(Token);
Res->Tok.Data = Str.data();
Res->Tok.Length = Str.size();
@@ -297,43 +507,43 @@ public:
return Res;
}
- static std::unique_ptr<AMDGPUOperand> CreateReg(unsigned RegNo, SMLoc S,
- SMLoc E,
- const MCRegisterInfo *TRI,
- const MCSubtargetInfo *STI,
- bool ForceVOP3) {
+ static AMDGPUOperand::Ptr CreateReg(unsigned RegNo, SMLoc S,
+ SMLoc E,
+ const MCRegisterInfo *TRI,
+ const MCSubtargetInfo *STI,
+ bool ForceVOP3) {
auto Op = llvm::make_unique<AMDGPUOperand>(Register);
Op->Reg.RegNo = RegNo;
Op->Reg.TRI = TRI;
Op->Reg.STI = STI;
- Op->Reg.Modifiers = -1;
+ Op->Reg.Mods = {false, false, false};
Op->Reg.IsForcedVOP3 = ForceVOP3;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static std::unique_ptr<AMDGPUOperand> CreateExpr(const class MCExpr *Expr, SMLoc S) {
+ static AMDGPUOperand::Ptr CreateExpr(const class MCExpr *Expr, SMLoc S) {
auto Op = llvm::make_unique<AMDGPUOperand>(Expression);
Op->Expr = Expr;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
-
- bool isDSOffset() const;
- bool isDSOffset01() const;
- bool isSWaitCnt() const;
- bool isMubufOffset() const;
- bool isSMRDOffset() const;
- bool isSMRDLiteralOffset() const;
};
+raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) {
+ OS << "abs:" << Mods.Abs << " neg: " << Mods.Neg << " sext:" << Mods.Sext;
+ return OS;
+}
+
class AMDGPUAsmParser : public MCTargetAsmParser {
const MCInstrInfo &MII;
MCAsmParser &Parser;
unsigned ForcedEncodingSize;
+ bool ForcedDPP;
+ bool ForcedSDWA;
bool isSI() const {
return AMDGPU::isSI(getSTI());
@@ -373,9 +583,11 @@ private:
bool ParseSectionDirectiveHSADataGlobalAgent();
bool ParseSectionDirectiveHSADataGlobalProgram();
bool ParseSectionDirectiveHSARodataReadonlyAgent();
+ bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum);
+ bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth);
+ void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsAtomicReturn);
public:
-public:
enum AMDGPUMatchResultTy {
Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY
};
@@ -384,7 +596,9 @@ public:
const MCInstrInfo &MII,
const MCTargetOptions &Options)
: MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser),
- ForcedEncodingSize(0) {
+ ForcedEncodingSize(0),
+ ForcedDPP(false),
+ ForcedSDWA(false) {
MCAsmParserExtension::Initialize(Parser);
if (getSTI().getFeatureBits().none()) {
@@ -393,6 +607,21 @@ public:
}
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+
+ {
+    // TODO: make these pre-defined variables read-only.
+    // Currently there is no suitable machinery in core llvm-mc for this.
+    // MCSymbol::isRedefinable is intended for another purpose, and
+    // AsmParser::parseDirectiveSet() cannot be specialized for a specific target.
+ AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits());
+ MCContext &Ctx = getContext();
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
+ Sym->setVariableValue(MCConstantExpr::create(Isa.Major, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
+ Sym->setVariableValue(MCConstantExpr::create(Isa.Minor, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
+ Sym->setVariableValue(MCConstantExpr::create(Isa.Stepping, Ctx));
+ }
}
AMDGPUTargetStreamer &getTargetStreamer() {
@@ -400,84 +629,117 @@ public:
return static_cast<AMDGPUTargetStreamer &>(TS);
}
- unsigned getForcedEncodingSize() const {
- return ForcedEncodingSize;
- }
-
- void setForcedEncodingSize(unsigned Size) {
- ForcedEncodingSize = Size;
- }
+ void setForcedEncodingSize(unsigned Size) { ForcedEncodingSize = Size; }
+ void setForcedDPP(bool ForceDPP_) { ForcedDPP = ForceDPP_; }
+ void setForcedSDWA(bool ForceSDWA_) { ForcedSDWA = ForceSDWA_; }
- bool isForcedVOP3() const {
- return ForcedEncodingSize == 64;
- }
+ unsigned getForcedEncodingSize() const { return ForcedEncodingSize; }
+ bool isForcedVOP3() const { return ForcedEncodingSize == 64; }
+ bool isForcedDPP() const { return ForcedDPP; }
+ bool isForcedSDWA() const { return ForcedSDWA; }
+ std::unique_ptr<AMDGPUOperand> parseRegister();
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) override;
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) override;
bool ParseDirective(AsmToken DirectiveID) override;
OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic);
+ StringRef parseMnemonicSuffix(StringRef Name);
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
- OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int,
- int64_t Default = 0);
+ OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int);
OperandMatchResultTy parseIntWithPrefix(const char *Prefix,
OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy =
- AMDGPUOperand::ImmTyNone);
+ enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+ bool (*ConvertResult)(int64_t&) = 0);
OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy =
- AMDGPUOperand::ImmTyNone);
- OperandMatchResultTy parseOptionalOps(
- const ArrayRef<OptionalOperand> &OptionalOps,
- OperandVector &Operands);
+ enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
+ OperandMatchResultTy parseStringWithPrefix(StringRef Prefix, StringRef &Value);
+ OperandMatchResultTy parseImm(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrImm(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands);
void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
void cvtDS(MCInst &Inst, const OperandVector &Operands);
- OperandMatchResultTy parseDSOptionalOps(OperandVector &Operands);
- OperandMatchResultTy parseDSOff01OptionalOps(OperandVector &Operands);
- OperandMatchResultTy parseDSOffsetOptional(OperandVector &Operands);
bool parseCnt(int64_t &IntVal);
OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands);
- OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
+ OperandMatchResultTy parseHwreg(OperandVector &Operands);
- OperandMatchResultTy parseFlatOptionalOps(OperandVector &Operands);
- OperandMatchResultTy parseFlatAtomicOptionalOps(OperandVector &Operands);
- void cvtFlat(MCInst &Inst, const OperandVector &Operands);
+private:
+ struct OperandInfoTy {
+ int64_t Id;
+ bool IsSymbolic;
+ OperandInfoTy(int64_t Id_) : Id(Id_), IsSymbolic(false) { }
+ };
- void cvtMubuf(MCInst &Inst, const OperandVector &Operands);
- OperandMatchResultTy parseOffset(OperandVector &Operands);
- OperandMatchResultTy parseMubufOptionalOps(OperandVector &Operands);
- OperandMatchResultTy parseGLC(OperandVector &Operands);
- OperandMatchResultTy parseSLC(OperandVector &Operands);
- OperandMatchResultTy parseTFE(OperandVector &Operands);
+ bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId);
+ bool parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width);
+public:
+ OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
- OperandMatchResultTy parseDMask(OperandVector &Operands);
- OperandMatchResultTy parseUNorm(OperandVector &Operands);
- OperandMatchResultTy parseR128(OperandVector &Operands);
+ OperandMatchResultTy parseSendMsgOp(OperandVector &Operands);
+ OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
+ void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
+ void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
+ void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
+ AMDGPUOperand::Ptr defaultGLC() const;
+ AMDGPUOperand::Ptr defaultSLC() const;
+ AMDGPUOperand::Ptr defaultTFE() const;
+
+ AMDGPUOperand::Ptr defaultDMask() const;
+ AMDGPUOperand::Ptr defaultUNorm() const;
+ AMDGPUOperand::Ptr defaultDA() const;
+ AMDGPUOperand::Ptr defaultR128() const;
+ AMDGPUOperand::Ptr defaultLWE() const;
+ AMDGPUOperand::Ptr defaultSMRDOffset() const;
+ AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
+
+ OperandMatchResultTy parseOModOperand(OperandVector &Operands);
+
+ void cvtId(MCInst &Inst, const OperandVector &Operands);
+ void cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
- OperandMatchResultTy parseVOP3OptionalOps(OperandVector &Operands);
+
+ void cvtMIMG(MCInst &Inst, const OperandVector &Operands);
+ void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
+
+ OperandMatchResultTy parseDPPCtrl(OperandVector &Operands);
+ AMDGPUOperand::Ptr defaultRowMask() const;
+ AMDGPUOperand::Ptr defaultBankMask() const;
+ AMDGPUOperand::Ptr defaultBoundCtrl() const;
+ void cvtDPP(MCInst &Inst, const OperandVector &Operands);
+
+ OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix,
+ AMDGPUOperand::ImmTy Type);
+ OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands);
+ void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands);
+ void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands);
+ void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands);
+ void cvtSDWA(MCInst &Inst, const OperandVector &Operands,
+ uint64_t BasicInstType);
};
struct OptionalOperand {
const char *Name;
AMDGPUOperand::ImmTy Type;
bool IsBit;
- int64_t Default;
bool (*ConvertResult)(int64_t&);
};
}
-static int getRegClass(bool IsVgpr, unsigned RegWidth) {
- if (IsVgpr) {
+static int getRegClass(RegisterKind Is, unsigned RegWidth) {
+ if (Is == IS_VGPR) {
switch (RegWidth) {
default: return -1;
case 1: return AMDGPU::VGPR_32RegClassID;
@@ -487,109 +749,379 @@ static int getRegClass(bool IsVgpr, unsigned RegWidth) {
case 8: return AMDGPU::VReg_256RegClassID;
case 16: return AMDGPU::VReg_512RegClassID;
}
+ } else if (Is == IS_TTMP) {
+ switch (RegWidth) {
+ default: return -1;
+ case 1: return AMDGPU::TTMP_32RegClassID;
+ case 2: return AMDGPU::TTMP_64RegClassID;
+ case 4: return AMDGPU::TTMP_128RegClassID;
+ }
+ } else if (Is == IS_SGPR) {
+ switch (RegWidth) {
+ default: return -1;
+ case 1: return AMDGPU::SGPR_32RegClassID;
+ case 2: return AMDGPU::SGPR_64RegClassID;
+ case 4: return AMDGPU::SGPR_128RegClassID;
+ case 8: return AMDGPU::SReg_256RegClassID;
+ case 16: return AMDGPU::SReg_512RegClassID;
+ }
}
-
- switch (RegWidth) {
- default: return -1;
- case 1: return AMDGPU::SGPR_32RegClassID;
- case 2: return AMDGPU::SGPR_64RegClassID;
- case 4: return AMDGPU::SReg_128RegClassID;
- case 8: return AMDGPU::SReg_256RegClassID;
- case 16: return AMDGPU::SReg_512RegClassID;
- }
+ return -1;
}
-static unsigned getRegForName(StringRef RegName) {
-
+static unsigned getSpecialRegForName(StringRef RegName) {
return StringSwitch<unsigned>(RegName)
.Case("exec", AMDGPU::EXEC)
.Case("vcc", AMDGPU::VCC)
.Case("flat_scratch", AMDGPU::FLAT_SCR)
.Case("m0", AMDGPU::M0)
.Case("scc", AMDGPU::SCC)
+ .Case("tba", AMDGPU::TBA)
+ .Case("tma", AMDGPU::TMA)
.Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
.Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
.Case("vcc_lo", AMDGPU::VCC_LO)
.Case("vcc_hi", AMDGPU::VCC_HI)
.Case("exec_lo", AMDGPU::EXEC_LO)
.Case("exec_hi", AMDGPU::EXEC_HI)
+ .Case("tma_lo", AMDGPU::TMA_LO)
+ .Case("tma_hi", AMDGPU::TMA_HI)
+ .Case("tba_lo", AMDGPU::TBA_LO)
+ .Case("tba_hi", AMDGPU::TBA_HI)
.Default(0);
}
bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
- const AsmToken Tok = Parser.getTok();
- StartLoc = Tok.getLoc();
- EndLoc = Tok.getEndLoc();
+ auto R = parseRegister();
+ if (!R) return true;
+ assert(R->isReg());
+ RegNo = R->getReg();
+ StartLoc = R->getStartLoc();
+ EndLoc = R->getEndLoc();
+ return false;
+}
+
+bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum)
+{
+ switch (RegKind) {
+ case IS_SPECIAL:
+ if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { Reg = AMDGPU::EXEC; RegWidth = 2; return true; }
+ if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) { Reg = AMDGPU::FLAT_SCR; RegWidth = 2; return true; }
+ if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { Reg = AMDGPU::VCC; RegWidth = 2; return true; }
+ if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) { Reg = AMDGPU::TBA; RegWidth = 2; return true; }
+ if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) { Reg = AMDGPU::TMA; RegWidth = 2; return true; }
+ return false;
+ case IS_VGPR:
+ case IS_SGPR:
+ case IS_TTMP:
+ if (Reg1 != Reg + RegWidth) { return false; }
+ RegWidth++;
+ return true;
+ default:
+ assert(false); return false;
+ }
+}
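
To illustrate the merging logic, a hypothetical trace for the list syntax
(derived from the code above, not additional code):

  // Input "[s0, s1, s2, s3]":
  //   start:   Reg = S0, RegWidth = 1
  //   s1:      Reg1 == Reg + RegWidth -> RegWidth = 2
  //   s2, s3:  RegWidth = 3, then 4  -> one s[0:3] quadruple
  // Input "[exec_lo, exec_hi]": the IS_SPECIAL pairing yields EXEC, RegWidth = 2.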
+
+bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth)
+{
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ if (getLexer().is(AsmToken::Identifier)) {
+ StringRef RegName = Parser.getTok().getString();
+ if ((Reg = getSpecialRegForName(RegName))) {
+ Parser.Lex();
+ RegKind = IS_SPECIAL;
+ } else {
+ unsigned RegNumIndex = 0;
+ if (RegName[0] == 'v') {
+ RegNumIndex = 1;
+ RegKind = IS_VGPR;
+ } else if (RegName[0] == 's') {
+ RegNumIndex = 1;
+ RegKind = IS_SGPR;
+ } else if (RegName.startswith("ttmp")) {
+ RegNumIndex = strlen("ttmp");
+ RegKind = IS_TTMP;
+ } else {
+ return false;
+ }
+ if (RegName.size() > RegNumIndex) {
+ // Single 32-bit register: vXX.
+ if (RegName.substr(RegNumIndex).getAsInteger(10, RegNum))
+ return false;
+ Parser.Lex();
+ RegWidth = 1;
+ } else {
+ // Range of registers: v[XX:YY]. ":YY" is optional.
+ Parser.Lex();
+ int64_t RegLo, RegHi;
+ if (getLexer().isNot(AsmToken::LBrac))
+ return false;
+ Parser.Lex();
+
+ if (getParser().parseAbsoluteExpression(RegLo))
+ return false;
+
+ const bool isRBrace = getLexer().is(AsmToken::RBrac);
+ if (!isRBrace && getLexer().isNot(AsmToken::Colon))
+ return false;
+ Parser.Lex();
+
+ if (isRBrace) {
+ RegHi = RegLo;
+ } else {
+ if (getParser().parseAbsoluteExpression(RegHi))
+ return false;
+
+ if (getLexer().isNot(AsmToken::RBrac))
+ return false;
+ Parser.Lex();
+ }
+ RegNum = (unsigned) RegLo;
+ RegWidth = (RegHi - RegLo) + 1;
+ }
+ }
+ } else if (getLexer().is(AsmToken::LBrac)) {
+ // List of consecutive registers: [s0,s1,s2,s3]
+ Parser.Lex();
+ if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth))
+ return false;
+ if (RegWidth != 1)
+ return false;
+ RegisterKind RegKind1;
+ unsigned Reg1, RegNum1, RegWidth1;
+ do {
+ if (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex();
+ } else if (getLexer().is(AsmToken::RBrac)) {
+ Parser.Lex();
+ break;
+ } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1)) {
+ if (RegWidth1 != 1) {
+ return false;
+ }
+ if (RegKind1 != RegKind) {
+ return false;
+ }
+ if (!AddNextRegisterToList(Reg, RegWidth, RegKind1, Reg1, RegNum1)) {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ } while (true);
+ } else {
+ return false;
+ }
+ switch (RegKind) {
+ case IS_SPECIAL:
+ RegNum = 0;
+ RegWidth = 1;
+ break;
+ case IS_VGPR:
+ case IS_SGPR:
+ case IS_TTMP:
+ {
+ unsigned Size = 1;
+ if (RegKind == IS_SGPR || RegKind == IS_TTMP) {
+        // SGPR and TTMP registers must be aligned. The maximum required alignment is 4 dwords.
+ Size = std::min(RegWidth, 4u);
+ }
+ if (RegNum % Size != 0)
+ return false;
+ RegNum = RegNum / Size;
+ int RCID = getRegClass(RegKind, RegWidth);
+ if (RCID == -1)
+ return false;
+ const MCRegisterClass RC = TRI->getRegClass(RCID);
+ if (RegNum >= RC.getNumRegs())
+ return false;
+ Reg = RC.getRegister(RegNum);
+ break;
+ }
+
+ default:
+ assert(false); return false;
+ }
+
+ if (!subtargetHasRegister(*TRI, Reg))
+ return false;
+ return true;
+}
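
For reference, the register grammar this routine accepts (examples derived
from the code above):

  //   v0, s101, ttmp4              single 32-bit registers
  //   v[0:3], s[2:3], ttmp[4:7]    ranges; SGPR/TTMP bases must be size-aligned
  //   [s0, s1, s2, s3]             lists of consecutive 32-bit registers
  //   vcc, exec, flat_scratch, tba, tma, m0, scc   special names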
+
+std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
+ const auto &Tok = Parser.getTok();
+ SMLoc StartLoc = Tok.getLoc();
+ SMLoc EndLoc = Tok.getEndLoc();
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
- StringRef RegName = Tok.getString();
- RegNo = getRegForName(RegName);
+ RegisterKind RegKind;
+ unsigned Reg, RegNum, RegWidth;
- if (RegNo) {
+ if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) {
+ return nullptr;
+ }
+ return AMDGPUOperand::CreateReg(Reg, StartLoc, EndLoc,
+ TRI, &getSTI(), false);
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseImm(OperandVector &Operands) {
+ bool Minus = false;
+ if (getLexer().getKind() == AsmToken::Minus) {
+ Minus = true;
Parser.Lex();
- return !subtargetHasRegister(*TRI, RegNo);
}
- // Match vgprs and sgprs
- if (RegName[0] != 's' && RegName[0] != 'v')
- return true;
+ SMLoc S = Parser.getTok().getLoc();
+ switch(getLexer().getKind()) {
+ case AsmToken::Integer: {
+ int64_t IntVal;
+ if (getParser().parseAbsoluteExpression(IntVal))
+ return MatchOperand_ParseFail;
+ if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) {
+ Error(S, "invalid immediate: only 32-bit values are legal");
+ return MatchOperand_ParseFail;
+ }
- bool IsVgpr = RegName[0] == 'v';
- unsigned RegWidth;
- unsigned RegIndexInClass;
- if (RegName.size() > 1) {
- // We have a 32-bit register
- RegWidth = 1;
- if (RegName.substr(1).getAsInteger(10, RegIndexInClass))
- return true;
+ if (Minus)
+ IntVal *= -1;
+ Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S));
+ return MatchOperand_Success;
+ }
+ case AsmToken::Real: {
+ // FIXME: We should emit an error if a double precisions floating-point
+ // value is used. I'm not sure the best way to detect this.
+ int64_t IntVal;
+ if (getParser().parseAbsoluteExpression(IntVal))
+ return MatchOperand_ParseFail;
+
+ APFloat F((float)BitsToDouble(IntVal));
+ if (Minus)
+ F.changeSign();
+ Operands.push_back(
+ AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S,
+ AMDGPUOperand::ImmTyNone, true));
+ return MatchOperand_Success;
+ }
+ default:
+ return Minus ? MatchOperand_ParseFail : MatchOperand_NoMatch;
+ }
+}
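+
+// Illustrative examples: "-42" yields a 32-bit signed immediate, while "1.5"
+// is parsed as a double, narrowed to single precision, and stored as the
+// 32-bit pattern 0x3fc00000, with the trailing 'true' flagging it as a
+// floating-point literal.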
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
+ auto res = parseImm(Operands);
+ if (res != MatchOperand_NoMatch) {
+ return res;
+ }
+
+ if (auto R = parseRegister()) {
+ assert(R->isReg());
+ R->Reg.IsForcedVOP3 = isForcedVOP3();
+ Operands.push_back(std::move(R));
+ return MatchOperand_Success;
+ }
+ return MatchOperand_ParseFail;
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) {
+ // XXX: During parsing we can't determine if minus sign means
+ // negate-modifier or negative immediate value.
+ // By default we suppose it is modifier.
+ bool Negate = false, Abs = false, Abs2 = false;
+
+ if (getLexer().getKind() == AsmToken::Minus) {
Parser.Lex();
- } else {
- // We have a register greater than 32-bits.
+ Negate = true;
+ }
- int64_t RegLo, RegHi;
+ if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "abs") {
Parser.Lex();
- if (getLexer().isNot(AsmToken::LBrac))
- return true;
+ Abs2 = true;
+ if (getLexer().isNot(AsmToken::LParen)) {
+ Error(Parser.getTok().getLoc(), "expected left paren after abs");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex();
+ }
+ if (getLexer().getKind() == AsmToken::Pipe) {
+ if (Abs2) {
+ Error(Parser.getTok().getLoc(), "expected register or immediate");
+ return MatchOperand_ParseFail;
+ }
Parser.Lex();
- if (getParser().parseAbsoluteExpression(RegLo))
- return true;
+ Abs = true;
+ }
- if (getLexer().isNot(AsmToken::Colon))
- return true;
+ auto Res = parseRegOrImm(Operands);
+ if (Res != MatchOperand_Success) {
+ return Res;
+ }
+ AMDGPUOperand::Modifiers Mods = {false, false, false};
+ if (Negate) {
+ Mods.Neg = true;
+ }
+ if (Abs) {
+ if (getLexer().getKind() != AsmToken::Pipe) {
+ Error(Parser.getTok().getLoc(), "expected vertical bar");
+ return MatchOperand_ParseFail;
+ }
Parser.Lex();
- if (getParser().parseAbsoluteExpression(RegHi))
- return true;
+ Mods.Abs = true;
+ }
+ if (Abs2) {
+ if (getLexer().isNot(AsmToken::RParen)) {
+ Error(Parser.getTok().getLoc(), "expected closing parentheses");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex();
+ Mods.Abs = true;
+ }
- if (getLexer().isNot(AsmToken::RBrac))
- return true;
+ if (Mods.hasFPModifiers()) {
+ AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
+ Op.setModifiers(Mods);
+ }
+ return MatchOperand_Success;
+}
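+
+// Accepted spellings for the FP input modifiers (operands illustrative):
+//   -v0        neg
+//   |v1|       abs
+//   abs(v2)    abs, alternate spelling
+//   -|v3|      neg of abs; "-abs(v3)" also parses, but "abs(|v3|)" is
+//              rejected with "expected register or immediate"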
- Parser.Lex();
- RegWidth = (RegHi - RegLo) + 1;
- if (IsVgpr) {
- // VGPR registers aren't aligned.
- RegIndexInClass = RegLo;
- } else {
- // SGPR registers are aligned. Max alignment is 4 dwords.
- unsigned Size = std::min(RegWidth, 4u);
- if (RegLo % Size != 0)
- return true;
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) {
+ bool Sext = false;
- RegIndexInClass = RegLo / Size;
+ if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") {
+ Parser.Lex();
+ Sext = true;
+ if (getLexer().isNot(AsmToken::LParen)) {
+ Error(Parser.getTok().getLoc(), "expected left paren after sext");
+ return MatchOperand_ParseFail;
}
+ Parser.Lex();
}
- int RCID = getRegClass(IsVgpr, RegWidth);
- if (RCID == -1)
- return true;
-
- const MCRegisterClass RC = TRI->getRegClass(RCID);
- if (RegIndexInClass >= RC.getNumRegs())
- return true;
+ auto Res = parseRegOrImm(Operands);
+ if (Res != MatchOperand_Success) {
+ return Res;
+ }
- RegNo = RC.getRegister(RegIndexInClass);
- return !subtargetHasRegister(*TRI, RegNo);
+ AMDGPUOperand::Modifiers Mods = {false, false, false};
+ if (Sext) {
+ if (getLexer().isNot(AsmToken::RParen)) {
+ Error(Parser.getTok().getLoc(), "expected closing parentheses");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex();
+ Mods.Sext = true;
+ }
+
+ if (Mods.hasIntModifiers()) {
+ AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
+ Op.setModifiers(Mods);
+ }
+ return MatchOperand_Success;
}
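+
+// The only integer input modifier is sign extension, e.g. "sext(v0)", which
+// sets Mods.Sext on the operand just parsed.
+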
unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
@@ -597,7 +1129,9 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) ||
- (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3)))
+ (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3)) ||
+ (isForcedDPP() && !(TSFlags & SIInstrFlags::DPP)) ||
+ (isForcedSDWA() && !(TSFlags & SIInstrFlags::SDWA)))
return Match_InvalidOperand;
if ((TSFlags & SIInstrFlags::VOP3) &&
@@ -608,7 +1142,6 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_Success;
}
-
bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
@@ -632,31 +1165,8 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SMLoc ErrorLoc = IDLoc;
if (ErrorInfo != ~0ULL) {
if (ErrorInfo >= Operands.size()) {
- if (isForcedVOP3()) {
- // If 64-bit encoding has been forced we can end up with no
- // clamp or omod operands if none of the registers have modifiers,
- // so we need to add these to the operand list.
- AMDGPUOperand &LastOp =
- ((AMDGPUOperand &)*Operands[Operands.size() - 1]);
- if (LastOp.isRegKind() ||
- (LastOp.isImm() &&
- LastOp.getImmTy() != AMDGPUOperand::ImmTyNone)) {
- SMLoc S = Parser.getTok().getLoc();
- Operands.push_back(AMDGPUOperand::CreateImm(0, S,
- AMDGPUOperand::ImmTyClamp));
- Operands.push_back(AMDGPUOperand::CreateImm(0, S,
- AMDGPUOperand::ImmTyOMod));
- bool Res = MatchAndEmitInstruction(IDLoc, Opcode, Operands,
- Out, ErrorInfo,
- MatchingInlineAsm);
- if (!Res)
- return Res;
- }
-
- }
return Error(IDLoc, "too few operands for instruction");
}
-
ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
@@ -762,164 +1272,12 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
amd_kernel_code_t &Header) {
-
- if (getLexer().isNot(AsmToken::Equal))
- return TokError("expected '='");
- Lex();
-
- if (getLexer().isNot(AsmToken::Integer))
- return TokError("amd_kernel_code_t values must be integers");
-
- uint64_t Value = getLexer().getTok().getIntVal();
+ SmallString<40> ErrStr;
+ raw_svector_ostream Err(ErrStr);
+ if (!parseAmdKernelCodeField(ID, getParser(), Header, Err)) {
+ return TokError(Err.str());
+ }
Lex();
-
- if (ID == "kernel_code_version_major")
- Header.amd_kernel_code_version_major = Value;
- else if (ID == "kernel_code_version_minor")
- Header.amd_kernel_code_version_minor = Value;
- else if (ID == "machine_kind")
- Header.amd_machine_kind = Value;
- else if (ID == "machine_version_major")
- Header.amd_machine_version_major = Value;
- else if (ID == "machine_version_minor")
- Header.amd_machine_version_minor = Value;
- else if (ID == "machine_version_stepping")
- Header.amd_machine_version_stepping = Value;
- else if (ID == "kernel_code_entry_byte_offset")
- Header.kernel_code_entry_byte_offset = Value;
- else if (ID == "kernel_code_prefetch_byte_size")
- Header.kernel_code_prefetch_byte_size = Value;
- else if (ID == "max_scratch_backing_memory_byte_size")
- Header.max_scratch_backing_memory_byte_size = Value;
- else if (ID == "compute_pgm_rsrc1_vgprs")
- Header.compute_pgm_resource_registers |= S_00B848_VGPRS(Value);
- else if (ID == "compute_pgm_rsrc1_sgprs")
- Header.compute_pgm_resource_registers |= S_00B848_SGPRS(Value);
- else if (ID == "compute_pgm_rsrc1_priority")
- Header.compute_pgm_resource_registers |= S_00B848_PRIORITY(Value);
- else if (ID == "compute_pgm_rsrc1_float_mode")
- Header.compute_pgm_resource_registers |= S_00B848_FLOAT_MODE(Value);
- else if (ID == "compute_pgm_rsrc1_priv")
- Header.compute_pgm_resource_registers |= S_00B848_PRIV(Value);
- else if (ID == "compute_pgm_rsrc1_dx10_clamp")
- Header.compute_pgm_resource_registers |= S_00B848_DX10_CLAMP(Value);
- else if (ID == "compute_pgm_rsrc1_debug_mode")
- Header.compute_pgm_resource_registers |= S_00B848_DEBUG_MODE(Value);
- else if (ID == "compute_pgm_rsrc1_ieee_mode")
- Header.compute_pgm_resource_registers |= S_00B848_IEEE_MODE(Value);
- else if (ID == "compute_pgm_rsrc2_scratch_en")
- Header.compute_pgm_resource_registers |= (S_00B84C_SCRATCH_EN(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_user_sgpr")
- Header.compute_pgm_resource_registers |= (S_00B84C_USER_SGPR(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_tgid_x_en")
- Header.compute_pgm_resource_registers |= (S_00B84C_TGID_X_EN(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_tgid_y_en")
- Header.compute_pgm_resource_registers |= (S_00B84C_TGID_Y_EN(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_tgid_z_en")
- Header.compute_pgm_resource_registers |= (S_00B84C_TGID_Z_EN(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_tg_size_en")
- Header.compute_pgm_resource_registers |= (S_00B84C_TG_SIZE_EN(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_tidig_comp_cnt")
- Header.compute_pgm_resource_registers |=
- (S_00B84C_TIDIG_COMP_CNT(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_excp_en_msb")
- Header.compute_pgm_resource_registers |=
- (S_00B84C_EXCP_EN_MSB(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_lds_size")
- Header.compute_pgm_resource_registers |= (S_00B84C_LDS_SIZE(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_excp_en")
- Header.compute_pgm_resource_registers |= (S_00B84C_EXCP_EN(Value) << 32);
- else if (ID == "compute_pgm_resource_registers")
- Header.compute_pgm_resource_registers = Value;
- else if (ID == "enable_sgpr_private_segment_buffer")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT);
- else if (ID == "enable_sgpr_dispatch_ptr")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT);
- else if (ID == "enable_sgpr_queue_ptr")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT);
- else if (ID == "enable_sgpr_kernarg_segment_ptr")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT);
- else if (ID == "enable_sgpr_dispatch_id")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT);
- else if (ID == "enable_sgpr_flat_scratch_init")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT);
- else if (ID == "enable_sgpr_private_segment_size")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT);
- else if (ID == "enable_sgpr_grid_workgroup_count_x")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT);
- else if (ID == "enable_sgpr_grid_workgroup_count_y")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT);
- else if (ID == "enable_sgpr_grid_workgroup_count_z")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT);
- else if (ID == "enable_ordered_append_gds")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT);
- else if (ID == "private_element_size")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT);
- else if (ID == "is_ptr64")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_IS_PTR64_SHIFT);
- else if (ID == "is_dynamic_callstack")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT);
- else if (ID == "is_debug_enabled")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT);
- else if (ID == "is_xnack_enabled")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT);
- else if (ID == "workitem_private_segment_byte_size")
- Header.workitem_private_segment_byte_size = Value;
- else if (ID == "workgroup_group_segment_byte_size")
- Header.workgroup_group_segment_byte_size = Value;
- else if (ID == "gds_segment_byte_size")
- Header.gds_segment_byte_size = Value;
- else if (ID == "kernarg_segment_byte_size")
- Header.kernarg_segment_byte_size = Value;
- else if (ID == "workgroup_fbarrier_count")
- Header.workgroup_fbarrier_count = Value;
- else if (ID == "wavefront_sgpr_count")
- Header.wavefront_sgpr_count = Value;
- else if (ID == "workitem_vgpr_count")
- Header.workitem_vgpr_count = Value;
- else if (ID == "reserved_vgpr_first")
- Header.reserved_vgpr_first = Value;
- else if (ID == "reserved_vgpr_count")
- Header.reserved_vgpr_count = Value;
- else if (ID == "reserved_sgpr_first")
- Header.reserved_sgpr_first = Value;
- else if (ID == "reserved_sgpr_count")
- Header.reserved_sgpr_count = Value;
- else if (ID == "debug_wavefront_private_segment_offset_sgpr")
- Header.debug_wavefront_private_segment_offset_sgpr = Value;
- else if (ID == "debug_private_segment_buffer_sgpr")
- Header.debug_private_segment_buffer_sgpr = Value;
- else if (ID == "kernarg_segment_alignment")
- Header.kernarg_segment_alignment = Value;
- else if (ID == "group_segment_alignment")
- Header.group_segment_alignment = Value;
- else if (ID == "private_segment_alignment")
- Header.private_segment_alignment = Value;
- else if (ID == "wavefront_size")
- Header.wavefront_size = Value;
- else if (ID == "call_convention")
- Header.call_convention = Value;
- else if (ID == "runtime_loader_kernel_symbol")
- Header.runtime_loader_kernel_symbol = Value;
- else
- return TokError("amd_kernel_code_t value not recognized.");
-
return false;
}
@@ -930,9 +1288,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
while (true) {
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("amd_kernel_code_t values must begin on a new line");
-
// Lex EndOfStatement. This is in a while loop, because lexing a comment
// will set the current token to EndOfStatement.
while(getLexer().is(AsmToken::EndOfStatement))
@@ -1026,7 +1381,7 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".amd_kernel_code_t")
return ParseDirectiveAMDKernelCodeT();
- if (IDVal == ".hsatext" || IDVal == ".text")
+ if (IDVal == ".hsatext")
return ParseSectionDirectiveHSAText();
if (IDVal == ".amdgpu_hsa_kernel")
@@ -1078,19 +1433,6 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
return true;
}
-static bool operandsHaveModifiers(const OperandVector &Operands) {
-
- for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
- const AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]);
- if (Op.isRegKind() && Op.hasModifiers())
- return true;
- if (Op.isImm() && (Op.getImmTy() == AMDGPUOperand::ImmTyOMod ||
- Op.getImmTy() == AMDGPUOperand::ImmTyClamp))
- return true;
- }
- return false;
-}
-
AMDGPUAsmParser::OperandMatchResultTy
AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
@@ -1107,113 +1449,59 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
getLexer().is(AsmToken::EndOfStatement))
return ResTy;
- bool Negate = false, Abs = false;
- if (getLexer().getKind()== AsmToken::Minus) {
- Parser.Lex();
- Negate = true;
- }
+ ResTy = parseRegOrImm(Operands);
- if (getLexer().getKind() == AsmToken::Pipe) {
- Parser.Lex();
- Abs = true;
- }
-
- switch(getLexer().getKind()) {
- case AsmToken::Integer: {
- SMLoc S = Parser.getTok().getLoc();
- int64_t IntVal;
- if (getParser().parseAbsoluteExpression(IntVal))
- return MatchOperand_ParseFail;
- if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) {
- Error(S, "invalid immediate: only 32-bit values are legal");
- return MatchOperand_ParseFail;
- }
-
- if (Negate)
- IntVal *= -1;
- Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S));
- return MatchOperand_Success;
- }
- case AsmToken::Real: {
- // FIXME: We should emit an error if a double precisions floating-point
- // value is used. I'm not sure the best way to detect this.
- SMLoc S = Parser.getTok().getLoc();
- int64_t IntVal;
- if (getParser().parseAbsoluteExpression(IntVal))
- return MatchOperand_ParseFail;
+ if (ResTy == MatchOperand_Success)
+ return ResTy;
- APFloat F((float)BitsToDouble(IntVal));
- if (Negate)
- F.changeSign();
- Operands.push_back(
- AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S));
+ if (getLexer().getKind() == AsmToken::Identifier) {
+ // If this identifier is a symbol, we want to create an expression for it.
+ // It is a little difficult to distinguish between a symbol name and
+ // an instruction flag like 'gds'. In order to do this, we parse
+ // all tokens as expressions and then treat the symbol name as the token
+ // string when we want to interpret the operand as a token.
+ const auto &Tok = Parser.getTok();
+ SMLoc S = Tok.getLoc();
+ const MCExpr *Expr = nullptr;
+ if (!Parser.parseExpression(Expr)) {
+ Operands.push_back(AMDGPUOperand::CreateExpr(Expr, S));
return MatchOperand_Success;
}
- case AsmToken::Identifier: {
- SMLoc S, E;
- unsigned RegNo;
- if (!ParseRegister(RegNo, S, E)) {
-
- bool HasModifiers = operandsHaveModifiers(Operands);
- unsigned Modifiers = 0;
- if (Negate)
- Modifiers |= 0x1;
-
- if (Abs) {
- if (getLexer().getKind() != AsmToken::Pipe)
- return MatchOperand_ParseFail;
- Parser.Lex();
- Modifiers |= 0x2;
- }
-
- if (Modifiers && !HasModifiers) {
- // We are adding a modifier to src1 or src2 and previous sources
- // don't have modifiers, so we need to go back and empty modifers
- // for each previous source.
- for (unsigned PrevRegIdx = Operands.size() - 1; PrevRegIdx > 1;
- --PrevRegIdx) {
-
- AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[PrevRegIdx]);
- RegOp.setModifiers(0);
- }
- }
-
-
- Operands.push_back(AMDGPUOperand::CreateReg(
- RegNo, S, E, getContext().getRegisterInfo(), &getSTI(),
- isForcedVOP3()));
-
- if (HasModifiers || Modifiers) {
- AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[Operands.size() - 1]);
- RegOp.setModifiers(Modifiers);
-
- }
- } else {
- Operands.push_back(AMDGPUOperand::CreateToken(Parser.getTok().getString(),
- S));
- Parser.Lex();
- }
- return MatchOperand_Success;
- }
- default:
- return MatchOperand_NoMatch;
+ Operands.push_back(AMDGPUOperand::CreateToken(Tok.getString(), Tok.getLoc()));
+ Parser.Lex();
+ return MatchOperand_Success;
}
+ return MatchOperand_NoMatch;
}
-bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
- StringRef Name,
- SMLoc NameLoc, OperandVector &Operands) {
-
+StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) {
// Clear any forced encodings from the previous instruction.
setForcedEncodingSize(0);
+ setForcedDPP(false);
+ setForcedSDWA(false);
- if (Name.endswith("_e64"))
+ if (Name.endswith("_e64")) {
setForcedEncodingSize(64);
- else if (Name.endswith("_e32"))
+ return Name.substr(0, Name.size() - 4);
+ } else if (Name.endswith("_e32")) {
setForcedEncodingSize(32);
+ return Name.substr(0, Name.size() - 4);
+ } else if (Name.endswith("_dpp")) {
+ setForcedDPP(true);
+ return Name.substr(0, Name.size() - 4);
+ } else if (Name.endswith("_sdwa")) {
+ setForcedSDWA(true);
+ return Name.substr(0, Name.size() - 5);
+ }
+ return Name;
+}
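+
+// Examples of the suffix stripping (mnemonics illustrative):
+//   "v_add_f32_e32"  -> "v_add_f32", forced 32-bit encoding
+//   "v_add_f32_e64"  -> "v_add_f32", forced 64-bit (VOP3) encoding
+//   "v_mov_b32_dpp"  -> "v_mov_b32", forced DPP
+//   "v_mov_b32_sdwa" -> "v_mov_b32", forced SDWA
+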
+bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+ StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) {
// Add the instruction mnemonic
+ Name = parseMnemonicSuffix(Name);
Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc));
while (!getLexer().is(AsmToken::EndOfStatement)) {
@@ -1225,20 +1513,21 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
switch (Res) {
case MatchOperand_Success: break;
- case MatchOperand_ParseFail: return Error(getLexer().getLoc(),
- "failed parsing operand.");
- case MatchOperand_NoMatch: return Error(getLexer().getLoc(),
- "not a valid operand.");
+ case MatchOperand_ParseFail:
+ Error(getLexer().getLoc(), "failed parsing operand.");
+ while (!getLexer().is(AsmToken::EndOfStatement)) {
+ Parser.Lex();
+ }
+ return true;
+ case MatchOperand_NoMatch:
+ Error(getLexer().getLoc(), "not a valid operand.");
+ while (!getLexer().is(AsmToken::EndOfStatement)) {
+ Parser.Lex();
+ }
+ return true;
}
}
- // Once we reach end of statement, continue parsing so we can add default
- // values for optional arguments.
- AMDGPUAsmParser::OperandMatchResultTy Res;
- while ((Res = parseOperand(Operands, Name)) != MatchOperand_NoMatch) {
- if (Res != MatchOperand_Success)
- return Error(getLexer().getLoc(), "failed parsing operand.");
- }
return false;
}
@@ -1247,22 +1536,14 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
//===----------------------------------------------------------------------===//
AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int,
- int64_t Default) {
-
- // We are at the end of the statement, and this is a default argument, so
- // use a default value.
- if (getLexer().is(AsmToken::EndOfStatement)) {
- Int = Default;
- return MatchOperand_Success;
- }
-
+AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) {
switch(getLexer().getKind()) {
default: return MatchOperand_NoMatch;
case AsmToken::Identifier: {
- StringRef OffsetName = Parser.getTok().getString();
- if (!OffsetName.equals(Prefix))
+ StringRef Name = Parser.getTok().getString();
+ if (!Name.equals(Prefix)) {
return MatchOperand_NoMatch;
+ }
Parser.Lex();
if (getLexer().isNot(AsmToken::Colon))
@@ -1282,16 +1563,21 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int,
AMDGPUAsmParser::OperandMatchResultTy
AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy) {
+ enum AMDGPUOperand::ImmTy ImmTy,
+ bool (*ConvertResult)(int64_t&)) {
SMLoc S = Parser.getTok().getLoc();
- int64_t Offset = 0;
+ int64_t Value = 0;
- AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Offset);
+ AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value);
if (Res != MatchOperand_Success)
return Res;
- Operands.push_back(AMDGPUOperand::CreateImm(Offset, S, ImmTy));
+ if (ConvertResult && !ConvertResult(Value)) {
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(AMDGPUOperand::CreateImm(Value, S, ImmTy));
return MatchOperand_Success;
}
@@ -1327,101 +1613,52 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
return MatchOperand_Success;
}
-static bool operandsHasOptionalOp(const OperandVector &Operands,
- const OptionalOperand &OOp) {
- for (unsigned i = 0; i < Operands.size(); i++) {
- const AMDGPUOperand &ParsedOp = ((const AMDGPUOperand &)*Operands[i]);
- if ((ParsedOp.isImm() && ParsedOp.getImmTy() == OOp.Type) ||
- (ParsedOp.isToken() && ParsedOp.getToken() == OOp.Name))
- return true;
+typedef std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap;
+void addOptionalImmOperand(MCInst& Inst, const OperandVector& Operands,
+ OptionalImmIndexMap& OptionalIdx,
+ enum AMDGPUOperand::ImmTy ImmT, int64_t Default = 0) {
+ auto i = OptionalIdx.find(ImmT);
+ if (i != OptionalIdx.end()) {
+ unsigned Idx = i->second;
+ ((AMDGPUOperand &)*Operands[Idx]).addImmOperands(Inst, 1);
+ } else {
+ Inst.addOperand(MCOperand::createImm(Default));
}
- return false;
}
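+
+// Usage sketch, mirroring the converters below: record the operand index of
+// each optional immediate by type while walking the parsed operands, then
+// emit them in encoding order with a default for anything not written:
+//
+//   OptionalImmIndexMap OptionalIdx;
+//   for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+//     AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+//     if (Op.isImm())
+//       OptionalIdx[Op.getImmTy()] = i;
+//   }
+//   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
+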
AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseOptionalOps(const ArrayRef<OptionalOperand> &OptionalOps,
- OperandVector &Operands) {
- SMLoc S = Parser.getTok().getLoc();
- for (const OptionalOperand &Op : OptionalOps) {
- if (operandsHasOptionalOp(Operands, Op))
- continue;
- AMDGPUAsmParser::OperandMatchResultTy Res;
- int64_t Value;
- if (Op.IsBit) {
- Res = parseNamedBit(Op.Name, Operands, Op.Type);
- if (Res == MatchOperand_NoMatch)
- continue;
- return Res;
- }
-
- Res = parseIntWithPrefix(Op.Name, Value, Op.Default);
-
- if (Res == MatchOperand_NoMatch)
- continue;
-
- if (Res != MatchOperand_Success)
- return Res;
+AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) {
+ if (getLexer().isNot(AsmToken::Identifier)) {
+ return MatchOperand_NoMatch;
+ }
+ StringRef Tok = Parser.getTok().getString();
+ if (Tok != Prefix) {
+ return MatchOperand_NoMatch;
+ }
- if (Op.ConvertResult && !Op.ConvertResult(Value)) {
- return MatchOperand_ParseFail;
- }
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Colon)) {
+ return MatchOperand_ParseFail;
+ }
- Operands.push_back(AMDGPUOperand::CreateImm(Value, S, Op.Type));
- return MatchOperand_Success;
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Identifier)) {
+ return MatchOperand_ParseFail;
}
- return MatchOperand_NoMatch;
+
+ Value = Parser.getTok().getString();
+ return MatchOperand_Success;
}
//===----------------------------------------------------------------------===//
// ds
//===----------------------------------------------------------------------===//
-static const OptionalOperand DSOptionalOps [] = {
- {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr},
- {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr}
-};
-
-static const OptionalOperand DSOptionalOpsOff01 [] = {
- {"offset0", AMDGPUOperand::ImmTyDSOffset0, false, 0, nullptr},
- {"offset1", AMDGPUOperand::ImmTyDSOffset1, false, 0, nullptr},
- {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr}
-};
-
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseDSOptionalOps(OperandVector &Operands) {
- return parseOptionalOps(DSOptionalOps, Operands);
-}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseDSOff01OptionalOps(OperandVector &Operands) {
- return parseOptionalOps(DSOptionalOpsOff01, Operands);
-}
-
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseDSOffsetOptional(OperandVector &Operands) {
- SMLoc S = Parser.getTok().getLoc();
- AMDGPUAsmParser::OperandMatchResultTy Res =
- parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset);
- if (Res == MatchOperand_NoMatch) {
- Operands.push_back(AMDGPUOperand::CreateImm(0, S,
- AMDGPUOperand::ImmTyOffset));
- Res = MatchOperand_Success;
- }
- return Res;
-}
-
-bool AMDGPUOperand::isDSOffset() const {
- return isImm() && isUInt<16>(getImm());
-}
-
-bool AMDGPUOperand::isDSOffset01() const {
- return isImm() && isUInt<8>(getImm());
-}
-
void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
const OperandVector &Operands) {
- std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
+ OptionalImmIndexMap OptionalIdx;
for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
@@ -1436,13 +1673,10 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
OptionalIdx[Op.getImmTy()] = i;
}
- unsigned Offset0Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset0];
- unsigned Offset1Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset1];
- unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS];
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset0);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset1);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
- ((AMDGPUOperand &)*Operands[Offset0Idx]).addImmOperands(Inst, 1); // offset0
- ((AMDGPUOperand &)*Operands[Offset1Idx]).addImmOperands(Inst, 1); // offset1
- ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds
Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
}
@@ -1469,12 +1703,11 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
OptionalIdx[Op.getImmTy()] = i;
}
- unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset];
- ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); // offset
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
if (!GDSOnly) {
- unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS];
- ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
}
Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
}
@@ -1516,7 +1749,7 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
CntMask = 0x7;
CntShift = 4;
} else if (CntName == "lgkmcnt") {
- CntMask = 0x7;
+ CntMask = 0xf;
CntShift = 8;
} else {
return true;
@@ -1532,8 +1765,8 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
// Disable all counters by default.
// vmcnt [3:0]
// expcnt [6:4]
- // lgkmcnt [10:8]
- int64_t CntVal = 0x77f;
+ // lgkmcnt [11:8]
+ int64_t CntVal = 0xf7f;
SMLoc S = Parser.getTok().getLoc();
switch(getLexer().getKind()) {
@@ -1555,10 +1788,298 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
return MatchOperand_Success;
}
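+
+// Worked example for the s_waitcnt fields above: "vmcnt(0) lgkmcnt(0)" starts
+// from the all-disabled value 0xf7f, clears bits [3:0] for vmcnt and bits
+// [11:8] for lgkmcnt, and encodes as 0x070 (expcnt stays disabled at 0x7).
+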
+bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width) {
+ using namespace llvm::AMDGPU::Hwreg;
+
+ if (Parser.getTok().getString() != "hwreg")
+ return true;
+ Parser.Lex();
+
+ if (getLexer().isNot(AsmToken::LParen))
+ return true;
+ Parser.Lex();
+
+ if (getLexer().is(AsmToken::Identifier)) {
+ HwReg.IsSymbolic = true;
+ HwReg.Id = ID_UNKNOWN_;
+ const StringRef tok = Parser.getTok().getString();
+ for (int i = ID_SYMBOLIC_FIRST_; i < ID_SYMBOLIC_LAST_; ++i) {
+ if (tok == IdSymbolic[i]) {
+ HwReg.Id = i;
+ break;
+ }
+ }
+ Parser.Lex();
+ } else {
+ HwReg.IsSymbolic = false;
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(HwReg.Id))
+ return true;
+ }
+
+ if (getLexer().is(AsmToken::RParen)) {
+ Parser.Lex();
+ return false;
+ }
+
+ // optional params
+ if (getLexer().isNot(AsmToken::Comma))
+ return true;
+ Parser.Lex();
+
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(Offset))
+ return true;
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return true;
+ Parser.Lex();
+
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(Width))
+ return true;
+
+ if (getLexer().isNot(AsmToken::RParen))
+ return true;
+ Parser.Lex();
+
+ return false;
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
+ using namespace llvm::AMDGPU::Hwreg;
+
+ int64_t Imm16Val = 0;
+ SMLoc S = Parser.getTok().getLoc();
+
+ switch(getLexer().getKind()) {
+ default: return MatchOperand_NoMatch;
+ case AsmToken::Integer:
+ // The operand can be an integer value.
+ if (getParser().parseAbsoluteExpression(Imm16Val))
+ return MatchOperand_NoMatch;
+ if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) {
+ Error(S, "invalid immediate: only 16-bit values are legal");
+ // Do not return an error code, but create an imm operand anyway and proceed
+ // to the next operand, if any. That avoids unnecessary error messages.
+ }
+ break;
+
+ case AsmToken::Identifier: {
+ OperandInfoTy HwReg(ID_UNKNOWN_);
+ int64_t Offset = OFFSET_DEFAULT_;
+ int64_t Width = WIDTH_M1_DEFAULT_ + 1;
+ if (parseHwregConstruct(HwReg, Offset, Width))
+ return MatchOperand_ParseFail;
+ if (HwReg.Id < 0 || !isUInt<ID_WIDTH_>(HwReg.Id)) {
+ if (HwReg.IsSymbolic)
+ Error(S, "invalid symbolic name of hardware register");
+ else
+ Error(S, "invalid code of hardware register: only 6-bit values are legal");
+ }
+ if (Offset < 0 || !isUInt<OFFSET_WIDTH_>(Offset))
+ Error(S, "invalid bit offset: only 5-bit values are legal");
+ if ((Width-1) < 0 || !isUInt<WIDTH_M1_WIDTH_>(Width-1))
+ Error(S, "invalid bitfield width: only values from 1 to 32 are legal");
+ Imm16Val = (HwReg.Id << ID_SHIFT_) | (Offset << OFFSET_SHIFT_) | ((Width-1) << WIDTH_M1_SHIFT_);
+ }
+ break;
+ }
+ Operands.push_back(AMDGPUOperand::CreateImm(Imm16Val, S, AMDGPUOperand::ImmTyHwreg));
+ return MatchOperand_Success;
+}
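+
+// Worked example, assuming the usual SI layout id[5:0], offset[10:6],
+// width-1[15:11]: hwreg(6, 7, 8) encodes as
+//   6 | (7 << 6) | ((8 - 1) << 11) = 0x39c6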
+
bool AMDGPUOperand::isSWaitCnt() const {
return isImm();
}
+bool AMDGPUOperand::isHwreg() const {
+ return isImmTy(ImmTyHwreg);
+}
+
+bool AMDGPUAsmParser::parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId) {
+ using namespace llvm::AMDGPU::SendMsg;
+
+ if (Parser.getTok().getString() != "sendmsg")
+ return true;
+ Parser.Lex();
+
+ if (getLexer().isNot(AsmToken::LParen))
+ return true;
+ Parser.Lex();
+
+ if (getLexer().is(AsmToken::Identifier)) {
+ Msg.IsSymbolic = true;
+ Msg.Id = ID_UNKNOWN_;
+ const std::string tok = Parser.getTok().getString();
+ for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) {
+ switch(i) {
+ default: continue; // Omit gaps.
+ case ID_INTERRUPT: case ID_GS: case ID_GS_DONE: case ID_SYSMSG: break;
+ }
+ if (tok == IdSymbolic[i]) {
+ Msg.Id = i;
+ break;
+ }
+ }
+ Parser.Lex();
+ } else {
+ Msg.IsSymbolic = false;
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(Msg.Id))
+ return true;
+ if (getLexer().is(AsmToken::Integer))
+ if (getParser().parseAbsoluteExpression(Msg.Id))
+ Msg.Id = ID_UNKNOWN_;
+ }
+ if (Msg.Id == ID_UNKNOWN_) // Don't know how to parse the rest.
+ return false;
+
+ if (!(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG)) {
+ if (getLexer().isNot(AsmToken::RParen))
+ return true;
+ Parser.Lex();
+ return false;
+ }
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return true;
+ Parser.Lex();
+
+ assert(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG);
+ Operation.Id = ID_UNKNOWN_;
+ if (getLexer().is(AsmToken::Identifier)) {
+ Operation.IsSymbolic = true;
+ const char* const *S = (Msg.Id == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic;
+ const int F = (Msg.Id == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_;
+ const int L = (Msg.Id == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_;
+ const StringRef Tok = Parser.getTok().getString();
+ for (int i = F; i < L; ++i) {
+ if (Tok == S[i]) {
+ Operation.Id = i;
+ break;
+ }
+ }
+ Parser.Lex();
+ } else {
+ Operation.IsSymbolic = false;
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(Operation.Id))
+ return true;
+ }
+
+ if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) {
+ // Stream id is optional.
+ if (getLexer().is(AsmToken::RParen)) {
+ Parser.Lex();
+ return false;
+ }
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return true;
+ Parser.Lex();
+
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(StreamId))
+ return true;
+ }
+
+ if (getLexer().isNot(AsmToken::RParen))
+ return true;
+ Parser.Lex();
+ return false;
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
+ using namespace llvm::AMDGPU::SendMsg;
+
+ int64_t Imm16Val = 0;
+ SMLoc S = Parser.getTok().getLoc();
+
+ switch(getLexer().getKind()) {
+ default:
+ return MatchOperand_NoMatch;
+ case AsmToken::Integer:
+ // The operand can be an integer value.
+ if (getParser().parseAbsoluteExpression(Imm16Val))
+ return MatchOperand_NoMatch;
+ if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) {
+ Error(S, "invalid immediate: only 16-bit values are legal");
+ // Do not return an error code, but create an imm operand anyway and proceed
+ // to the next operand, if any. That avoids unnecessary error messages.
+ }
+ break;
+ case AsmToken::Identifier: {
+ OperandInfoTy Msg(ID_UNKNOWN_);
+ OperandInfoTy Operation(OP_UNKNOWN_);
+ int64_t StreamId = STREAM_ID_DEFAULT_;
+ if (parseSendMsgConstruct(Msg, Operation, StreamId))
+ return MatchOperand_ParseFail;
+ do {
+ // Validate and encode message ID.
+ if (! ((ID_INTERRUPT <= Msg.Id && Msg.Id <= ID_GS_DONE)
+ || Msg.Id == ID_SYSMSG)) {
+ if (Msg.IsSymbolic)
+ Error(S, "invalid/unsupported symbolic name of message");
+ else
+ Error(S, "invalid/unsupported code of message");
+ break;
+ }
+ Imm16Val = (Msg.Id << ID_SHIFT_);
+ // Validate and encode operation ID.
+ if (Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) {
+ if (! (OP_GS_FIRST_ <= Operation.Id && Operation.Id < OP_GS_LAST_)) {
+ if (Operation.IsSymbolic)
+ Error(S, "invalid symbolic name of GS_OP");
+ else
+ Error(S, "invalid code of GS_OP: only 2-bit values are legal");
+ break;
+ }
+ if (Operation.Id == OP_GS_NOP
+ && Msg.Id != ID_GS_DONE) {
+ Error(S, "invalid GS_OP: NOP is for GS_DONE only");
+ break;
+ }
+ Imm16Val |= (Operation.Id << OP_SHIFT_);
+ }
+ if (Msg.Id == ID_SYSMSG) {
+ if (! (OP_SYS_FIRST_ <= Operation.Id && Operation.Id < OP_SYS_LAST_)) {
+ if (Operation.IsSymbolic)
+ Error(S, "invalid/unsupported symbolic name of SYSMSG_OP");
+ else
+ Error(S, "invalid/unsupported code of SYSMSG_OP");
+ break;
+ }
+ Imm16Val |= (Operation.Id << OP_SHIFT_);
+ }
+ // Validate and encode stream ID.
+ if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) {
+ if (! (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_)) {
+ Error(S, "invalid stream id: only 2-bit values are legal");
+ break;
+ }
+ Imm16Val |= (StreamId << STREAM_ID_SHIFT_);
+ }
+ } while (0);
+ }
+ break;
+ }
+ Operands.push_back(AMDGPUOperand::CreateImm(Imm16Val, S, AMDGPUOperand::ImmTySendMsg));
+ return MatchOperand_Success;
+}
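+
+// Worked example, assuming the usual SI layout id[3:0], op[6:4], stream[9:8]
+// with ID_GS = 2 and OP_GS_EMIT = 2: sendmsg(MSG_GS, GS_OP_EMIT, 0) encodes
+// as 2 | (2 << 4) | (0 << 8) = 0x22.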
+
+bool AMDGPUOperand::isSendMsg() const {
+ return isImmTy(ImmTySendMsg);
+}
+
//===----------------------------------------------------------------------===//
// sopp branch targets
//===----------------------------------------------------------------------===//
@@ -1587,33 +2108,26 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
}
//===----------------------------------------------------------------------===//
-// flat
+// mubuf
//===----------------------------------------------------------------------===//
-static const OptionalOperand FlatOptionalOps [] = {
- {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr},
- {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr},
- {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr}
-};
-
-static const OptionalOperand FlatAtomicOptionalOps [] = {
- {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr},
- {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr}
-};
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyGLC);
+}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseFlatOptionalOps(OperandVector &Operands) {
- return parseOptionalOps(FlatOptionalOps, Operands);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTySLC);
}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseFlatAtomicOptionalOps(OperandVector &Operands) {
- return parseOptionalOps(FlatAtomicOptionalOps, Operands);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultTFE() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyTFE);
}
-void AMDGPUAsmParser::cvtFlat(MCInst &Inst,
- const OperandVector &Operands) {
- std::map<AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
+void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
+ const OperandVector &Operands,
+ bool IsAtomic, bool IsAtomicReturn) {
+ OptionalImmIndexMap OptionalIdx;
+ assert(IsAtomicReturn ? IsAtomic : true);
for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
@@ -1624,129 +2138,128 @@ void AMDGPUAsmParser::cvtFlat(MCInst &Inst,
continue;
}
- // Handle 'glc' token which is sometimes hard-coded into the
+ // Handle the case where soffset is an immediate
+ if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) {
+ Op.addImmOperands(Inst, 1);
+ continue;
+ }
+
+ // Handle tokens like 'offen' which are sometimes hard-coded into the
// asm string. There are no MCInst operands for these.
- if (Op.isToken())
+ if (Op.isToken()) {
continue;
+ }
+ assert(Op.isImm());
// Handle optional arguments
OptionalIdx[Op.getImmTy()] = i;
-
}
- // flat atomic instructions don't have a glc argument.
- if (OptionalIdx.count(AMDGPUOperand::ImmTyGLC)) {
- unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC];
- ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1);
+ // Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns.
+ if (IsAtomicReturn) {
+ MCInst::iterator I = Inst.begin(); // $vdata_in is always at the beginning.
+ Inst.insert(I, *I);
}
- unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC];
- unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE];
-
- ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1);
- ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
+ if (!IsAtomic) { // glc is hard-coded.
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+ }
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
}
//===----------------------------------------------------------------------===//
-// mubuf
+// mimg
//===----------------------------------------------------------------------===//
-static const OptionalOperand MubufOptionalOps [] = {
- {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr},
- {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr},
- {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr},
- {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr}
-};
+void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands) {
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseMubufOptionalOps(OperandVector &Operands) {
- return parseOptionalOps(MubufOptionalOps, Operands);
-}
+ OptionalImmIndexMap OptionalIdx;
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseOffset(OperandVector &Operands) {
- return parseIntWithPrefix("offset", Operands);
-}
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseGLC(OperandVector &Operands) {
- return parseNamedBit("glc", Operands);
-}
+ // Add the register arguments
+ if (Op.isRegOrImm()) {
+ Op.addRegOrImmOperands(Inst, 1);
+ continue;
+ } else if (Op.isImmModifier()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ assert(false);
+ }
+ }
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseSLC(OperandVector &Operands) {
- return parseNamedBit("slc", Operands);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
}
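+
+// E.g. for "image_load v[0:3], v[4:7], s[8:15] dmask:0xf unorm" (syntax
+// illustrative), dmask and unorm are recorded by type in the loop above and
+// re-emitted in the fixed order required by the encoding; any optional
+// immediate not written defaults to 0.
+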
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseTFE(OperandVector &Operands) {
- return parseNamedBit("tfe", Operands);
-}
+void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) {
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
-bool AMDGPUOperand::isMubufOffset() const {
- return isImm() && isUInt<12>(getImm());
-}
+ // Add src, same as dst
+ ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1);
-void AMDGPUAsmParser::cvtMubuf(MCInst &Inst,
- const OperandVector &Operands) {
- std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
+ OptionalImmIndexMap OptionalIdx;
- for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
- if (Op.isReg()) {
- Op.addRegOperands(Inst, 1);
- continue;
- }
-
- // Handle the case where soffset is an immediate
- if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) {
- Op.addImmOperands(Inst, 1);
- continue;
- }
-
- // Handle tokens like 'offen' which are sometimes hard-coded into the
- // asm string. There are no MCInst operands for these.
- if (Op.isToken()) {
+ if (Op.isRegOrImm()) {
+ Op.addRegOrImmOperands(Inst, 1);
continue;
+ } else if (Op.isImmModifier()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ assert(false);
}
- assert(Op.isImm());
-
- // Handle optional arguments
- OptionalIdx[Op.getImmTy()] = i;
}
- assert(OptionalIdx.size() == 4);
-
- unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset];
- unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC];
- unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC];
- unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE];
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+}
- ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1);
- ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1);
- ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1);
- ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDMask);
}
-//===----------------------------------------------------------------------===//
-// mimg
-//===----------------------------------------------------------------------===//
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultUNorm() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyUNorm);
+}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseDMask(OperandVector &Operands) {
- return parseIntWithPrefix("dmask", Operands);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDA() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDA);
}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseUNorm(OperandVector &Operands) {
- return parseNamedBit("unorm", Operands);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultR128() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyR128);
}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseR128(OperandVector &Operands) {
- return parseNamedBit("r128", Operands);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultLWE() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyLWE);
}
//===----------------------------------------------------------------------===//
@@ -1766,6 +2279,14 @@ bool AMDGPUOperand::isSMRDLiteralOffset() const {
return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm());
}
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
//===----------------------------------------------------------------------===//
// vop3
//===----------------------------------------------------------------------===//
@@ -1792,91 +2313,435 @@ static bool ConvertOmodDiv(int64_t &Div) {
return false;
}
-static const OptionalOperand VOP3OptionalOps [] = {
- {"clamp", AMDGPUOperand::ImmTyClamp, true, 0, nullptr},
- {"mul", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodMul},
- {"div", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodDiv},
+static bool ConvertBoundCtrl(int64_t &BoundCtrl) {
+ if (BoundCtrl == 0) {
+ BoundCtrl = 1;
+ return true;
+ } else if (BoundCtrl == -1) {
+ BoundCtrl = 0;
+ return true;
+ }
+ return false;
+}
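+
+// I.e. the assembler spelling "bound_ctrl:0" sets the encoded bit to 1, and
+// "bound_ctrl:-1" clears it.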
+
+// Note: the order in this table matches the order of operands in AsmString.
+static const OptionalOperand AMDGPUOptionalOperandTable[] = {
+ {"offen", AMDGPUOperand::ImmTyOffen, true, nullptr},
+ {"idxen", AMDGPUOperand::ImmTyIdxen, true, nullptr},
+ {"addr64", AMDGPUOperand::ImmTyAddr64, true, nullptr},
+ {"offset0", AMDGPUOperand::ImmTyOffset0, false, nullptr},
+ {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr},
+ {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr},
+ {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
+ {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
+ {"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
+ {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
+ {"clamp", AMDGPUOperand::ImmTyClampSI, true, nullptr},
+ {"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul},
+ {"unorm", AMDGPUOperand::ImmTyUNorm, true, nullptr},
+ {"da", AMDGPUOperand::ImmTyDA, true, nullptr},
+ {"r128", AMDGPUOperand::ImmTyR128, true, nullptr},
+ {"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr},
+ {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr},
+ {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr},
+ {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr},
+ {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl},
+ {"dst_sel", AMDGPUOperand::ImmTySdwaDstSel, false, nullptr},
+ {"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr},
+ {"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
+ {"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr},
};
-static bool isVOP3(OperandVector &Operands) {
- if (operandsHaveModifiers(Operands))
- return true;
+AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
+ OperandMatchResultTy res;
+ for (const OptionalOperand &Op : AMDGPUOptionalOperandTable) {
+ // try to parse any optional operand here
+ if (Op.IsBit) {
+ res = parseNamedBit(Op.Name, Operands, Op.Type);
+ } else if (Op.Type == AMDGPUOperand::ImmTyOModSI) {
+ res = parseOModOperand(Operands);
+ } else if (Op.Type == AMDGPUOperand::ImmTySdwaDstSel ||
+ Op.Type == AMDGPUOperand::ImmTySdwaSrc0Sel ||
+ Op.Type == AMDGPUOperand::ImmTySdwaSrc1Sel) {
+ res = parseSDWASel(Operands, Op.Name, Op.Type);
+ } else if (Op.Type == AMDGPUOperand::ImmTySdwaDstUnused) {
+ res = parseSDWADstUnused(Operands);
+ } else {
+ res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
+ }
+ if (res != MatchOperand_NoMatch) {
+ return res;
+ }
+ }
+ return MatchOperand_NoMatch;
+}
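+
+// For example, "offset:4095 glc" is consumed one operand per call: the loop
+// above tries each table entry until parseNamedBit or parseIntWithPrefix
+// reports a match; unknown prefixes fall through as MatchOperand_NoMatch.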
- AMDGPUOperand &DstOp = ((AMDGPUOperand&)*Operands[1]);
+AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands)
+{
+ StringRef Name = Parser.getTok().getString();
+ if (Name == "mul") {
+ return parseIntWithPrefix("mul", Operands, AMDGPUOperand::ImmTyOModSI, ConvertOmodMul);
+ } else if (Name == "div") {
+ return parseIntWithPrefix("div", Operands, AMDGPUOperand::ImmTyOModSI, ConvertOmodDiv);
+ } else {
+ return MatchOperand_NoMatch;
+ }
+}
- if (DstOp.isReg() && DstOp.isRegClass(AMDGPU::SGPR_64RegClassID))
- return true;
+void AMDGPUAsmParser::cvtId(MCInst &Inst, const OperandVector &Operands) {
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
+ for (unsigned E = Operands.size(); I != E; ++I)
+ ((AMDGPUOperand &)*Operands[I]).addRegOrImmOperands(Inst, 1);
+}
- if (Operands.size() >= 5)
- return true;
+void AMDGPUAsmParser::cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands) {
+ uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+ if (TSFlags & SIInstrFlags::VOP3) {
+ cvtVOP3(Inst, Operands);
+ } else {
+ cvtId(Inst, Operands);
+ }
+}
- if (Operands.size() > 3) {
- AMDGPUOperand &Src1Op = ((AMDGPUOperand&)*Operands[3]);
- if (Src1Op.getReg() && (Src1Op.isRegClass(AMDGPU::SReg_32RegClassID) ||
- Src1Op.isRegClass(AMDGPU::SReg_64RegClassID)))
- return true;
+void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
+
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ if (Op.isRegOrImmWithInputMods()) {
+ // only fp modifiers allowed in VOP3
+ Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isImm()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ assert(false);
+ }
+ }
+
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
+}
+
+//===----------------------------------------------------------------------===//
+// dpp
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUOperand::isDPPCtrl() const {
+ bool result = isImm() && getImmTy() == ImmTyDppCtrl && isUInt<9>(getImm());
+ if (result) {
+ int64_t Imm = getImm();
+ return ((Imm >= 0x000) && (Imm <= 0x0ff)) ||
+ ((Imm >= 0x101) && (Imm <= 0x10f)) ||
+ ((Imm >= 0x111) && (Imm <= 0x11f)) ||
+ ((Imm >= 0x121) && (Imm <= 0x12f)) ||
+ (Imm == 0x130) ||
+ (Imm == 0x134) ||
+ (Imm == 0x138) ||
+ (Imm == 0x13c) ||
+ (Imm == 0x140) ||
+ (Imm == 0x141) ||
+ (Imm == 0x142) ||
+ (Imm == 0x143);
}
return false;
}
AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) {
-
- // The value returned by this function may change after parsing
- // an operand so store the original value here.
- bool HasModifiers = operandsHaveModifiers(Operands);
-
- bool IsVOP3 = isVOP3(Operands);
- if (HasModifiers || IsVOP3 ||
- getLexer().isNot(AsmToken::EndOfStatement) ||
- getForcedEncodingSize() == 64) {
-
- AMDGPUAsmParser::OperandMatchResultTy Res =
- parseOptionalOps(VOP3OptionalOps, Operands);
-
- if (!HasModifiers && Res == MatchOperand_Success) {
- // We have added a modifier operation, so we need to make sure all
- // previous register operands have modifiers
- for (unsigned i = 2, e = Operands.size(); i != e; ++i) {
- AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]);
- if (Op.isReg())
- Op.setModifiers(0);
+AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ StringRef Prefix;
+ int64_t Int;
+
+ if (getLexer().getKind() == AsmToken::Identifier) {
+ Prefix = Parser.getTok().getString();
+ } else {
+ return MatchOperand_NoMatch;
+ }
+
+ if (Prefix == "row_mirror") {
+ Int = 0x140;
+ } else if (Prefix == "row_half_mirror") {
+ Int = 0x141;
+ } else {
+ // Check to prevent parseDPPCtrl from eating invalid tokens
+ if (Prefix != "quad_perm"
+ && Prefix != "row_shl"
+ && Prefix != "row_shr"
+ && Prefix != "row_ror"
+ && Prefix != "wave_shl"
+ && Prefix != "wave_rol"
+ && Prefix != "wave_shr"
+ && Prefix != "wave_ror"
+ && Prefix != "row_bcast") {
+ return MatchOperand_NoMatch;
+ }
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Colon))
+ return MatchOperand_ParseFail;
+
+ if (Prefix == "quad_perm") {
+ // quad_perm:[%d,%d,%d,%d]
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::LBrac))
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Integer))
+ return MatchOperand_ParseFail;
+ Int = getLexer().getTok().getIntVal();
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Integer))
+ return MatchOperand_ParseFail;
+ Int += (getLexer().getTok().getIntVal() << 2);
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Integer))
+ return MatchOperand_ParseFail;
+ Int += (getLexer().getTok().getIntVal() << 4);
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Integer))
+ return MatchOperand_ParseFail;
+ Int += (getLexer().getTok().getIntVal() << 6);
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::RBrac))
+ return MatchOperand_ParseFail;
+
+ } else {
+ // sel:%d
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Integer))
+ return MatchOperand_ParseFail;
+ Int = getLexer().getTok().getIntVal();
+
+ if (Prefix == "row_shl") {
+ Int |= 0x100;
+ } else if (Prefix == "row_shr") {
+ Int |= 0x110;
+ } else if (Prefix == "row_ror") {
+ Int |= 0x120;
+ } else if (Prefix == "wave_shl") {
+ Int = 0x130;
+ } else if (Prefix == "wave_rol") {
+ Int = 0x134;
+ } else if (Prefix == "wave_shr") {
+ Int = 0x138;
+ } else if (Prefix == "wave_ror") {
+ Int = 0x13C;
+ } else if (Prefix == "row_bcast") {
+ if (Int == 15) {
+ Int = 0x142;
+ } else if (Int == 31) {
+ Int = 0x143;
+ } else {
+ return MatchOperand_ParseFail;
+ }
+ } else {
+ return MatchOperand_ParseFail;
}
}
- return Res;
}
- return MatchOperand_NoMatch;
+ Parser.Lex(); // eat last token
+
+ Operands.push_back(AMDGPUOperand::CreateImm(Int, S,
+ AMDGPUOperand::ImmTyDppCtrl));
+ return MatchOperand_Success;
}
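
// Worked example of the packing above: quad_perm:[0,1,2,3] encodes as
// 0 | (1 << 2) | (2 << 4) | (3 << 6) = 0xe4 (the identity permutation), and
// row_shl:1 encodes as 1 | 0x100 = 0x101.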
-void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultRowMask() const {
+ return AMDGPUOperand::CreateImm(0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBankMask() const {
+ return AMDGPUOperand::CreateImm(0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBoundCtrl() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl);
+}
+
+void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
- unsigned i = 1;
+ unsigned I = 1;
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
- if (Desc.getNumDefs() > 0) {
- ((AMDGPUOperand &)*Operands[i++]).addRegOperands(Inst, 1);
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
}
- std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ // Add the register arguments
+ if (Op.isRegOrImmWithInputMods()) {
+ // Only float modifiers supported in DPP
+ Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isDPPCtrl()) {
+ Op.addImmOperands(Inst, 1);
+ } else if (Op.isImm()) {
+ // Handle optional arguments
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ llvm_unreachable("Invalid operand type");
+ }
+ }
- if (operandsHaveModifiers(Operands)) {
- for (unsigned e = Operands.size(); i != e; ++i) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
+}
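+
+// Illustrative (assumed asm syntax): for "v_mov_b32 v0, v1 row_shl:1", cvtDPP
+// emits dst v0, (src0 modifiers, src0), dpp_ctrl = 0x101, then the defaults
+// row_mask = 0xf and bank_mask = 0xf (all rows and banks enabled) and
+// bound_ctrl = 0, since those operands were not written.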
- if (Op.isRegWithInputMods()) {
- ((AMDGPUOperand &)*Operands[i]).addRegWithInputModsOperands(Inst, 2);
- continue;
- }
- OptionalIdx[Op.getImmTy()] = i;
- }
+//===----------------------------------------------------------------------===//
+// sdwa
+//===----------------------------------------------------------------------===//
- unsigned ClampIdx = OptionalIdx[AMDGPUOperand::ImmTyClamp];
- unsigned OModIdx = OptionalIdx[AMDGPUOperand::ImmTyOMod];
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix,
+ AMDGPUOperand::ImmTy Type) {
+ SMLoc S = Parser.getTok().getLoc();
+ StringRef Value;
+ AMDGPUAsmParser::OperandMatchResultTy res;
- ((AMDGPUOperand &)*Operands[ClampIdx]).addImmOperands(Inst, 1);
- ((AMDGPUOperand &)*Operands[OModIdx]).addImmOperands(Inst, 1);
- } else {
- for (unsigned e = Operands.size(); i != e; ++i)
- ((AMDGPUOperand &)*Operands[i]).addRegOrImmOperands(Inst, 1);
+ res = parseStringWithPrefix(Prefix, Value);
+ if (res != MatchOperand_Success) {
+ return res;
+ }
+
+ int64_t Int;
+ Int = StringSwitch<int64_t>(Value)
+ .Case("BYTE_0", 0)
+ .Case("BYTE_1", 1)
+ .Case("BYTE_2", 2)
+ .Case("BYTE_3", 3)
+ .Case("WORD_0", 4)
+ .Case("WORD_1", 5)
+ .Case("DWORD", 6)
+ .Default(0xffffffff);
+ Parser.Lex(); // eat last token
+
+ if (Int == 0xffffffff) {
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(AMDGPUOperand::CreateImm(Int, S, Type));
+ return MatchOperand_Success;
+}
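+
+// Example of the mapping above (assumed asm syntax): "dst_sel:BYTE_0" yields
+// an immediate of 0, "src0_sel:WORD_1" yields 5, and "dst_sel:DWORD" yields 6,
+// which is also the default cvtSDWA supplies below.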
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ StringRef Value;
+ AMDGPUAsmParser::OperandMatchResultTy res;
+
+ res = parseStringWithPrefix("dst_unused", Value);
+ if (res != MatchOperand_Success) {
+ return res;
+ }
+
+ int64_t Int;
+ Int = StringSwitch<int64_t>(Value)
+ .Case("UNUSED_PAD", 0)
+ .Case("UNUSED_SEXT", 1)
+ .Case("UNUSED_PRESERVE", 2)
+ .Default(0xffffffff);
+ Parser.Lex(); // eat last token
+
+ if (Int == 0xffffffff) {
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(AMDGPUOperand::CreateImm(Int, S,
+ AMDGPUOperand::ImmTySdwaDstUnused));
+ return MatchOperand_Success;
+}
+
+void AMDGPUAsmParser::cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands) {
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOP1);
+}
+
+void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) {
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOP2);
+}
+
+void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOPC);
+}
+
+void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
+ uint64_t BasicInstType) {
+ OptionalImmIndexMap OptionalIdx;
+
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
+
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ // Add the register arguments
+ if (BasicInstType == SIInstrFlags::VOPC &&
+ Op.isReg() &&
+ Op.Reg.RegNo == AMDGPU::VCC) {
+ // VOPC sdwa uses the "vcc" token as dst. Skip it.
+ continue;
+ } else if (Op.isRegOrImmWithInputMods()) {
+ Op.addRegOrImmWithInputModsOperands(Inst, 2);
+ } else if (Op.isImm()) {
+ // Handle optional arguments
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ llvm_unreachable("Invalid operand type");
+ }
+ }
+
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+
+ if (Inst.getOpcode() == AMDGPU::V_NOP_sdwa) {
+ // V_NOP_sdwa has no optional sdwa arguments
+ return;
+ }
+ switch (BasicInstType) {
+ case SIInstrFlags::VOP1: {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+ break;
+ }
+ case SIInstrFlags::VOP2: {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+ break;
+ }
+ case SIInstrFlags::VOPC: {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+ break;
+ }
+ default:
+ llvm_unreachable("Invalid instruction type. Only VOP1, VOP2 and VOPC allowed");
}
}
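
// Illustrative (assumed asm syntax): for a VOP2 form such as
// "v_add_f32 v0, v1, v2 dst_sel:BYTE_0", cvtSDWA emits dst and both
// (modifiers, src) pairs, clamp = 0 by default, dst_sel = 0 from the parsed
// operand, then the defaults dst_unused = 2 (UNUSED_PRESERVE) and
// src0_sel = src1_sel = 6 (DWORD).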
@@ -1890,3 +2755,37 @@ extern "C" void LLVMInitializeAMDGPUAsmParser() {
#define GET_MATCHER_IMPLEMENTATION
#include "AMDGPUGenAsmMatcher.inc"
+
+// This function must be defined after the auto-generated include so that the
+// MatchClassKind enum is defined.
+unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) {
+ // Tokens like "glc" would be parsed as immediate operands in ParseOperand().
+ // But MatchInstructionImpl() expects to see a token and fails to validate the
+ // operand. This method checks whether we were given an immediate operand when
+ // the matcher expects the corresponding token.
+ AMDGPUOperand &Operand = (AMDGPUOperand&)Op;
+ switch (Kind) {
+ case MCK_addr64:
+ return Operand.isAddr64() ? Match_Success : Match_InvalidOperand;
+ case MCK_gds:
+ return Operand.isGDS() ? Match_Success : Match_InvalidOperand;
+ case MCK_glc:
+ return Operand.isGLC() ? Match_Success : Match_InvalidOperand;
+ case MCK_idxen:
+ return Operand.isIdxen() ? Match_Success : Match_InvalidOperand;
+ case MCK_offen:
+ return Operand.isOffen() ? Match_Success : Match_InvalidOperand;
+ case MCK_SSrc32:
+ // When operands have expression values, they will return true for isToken,
+ // because it is not possible to distinguish between a token and an
+ // expression at parse time. MatchInstructionImpl() always tries to match an
+ // operand as a token when isToken returns true, and when the name of the
+ // expression is not a valid token the match fails, so we need to handle it
+ // here.
+ return Operand.isSSrc32() ? Match_Success : Match_InvalidOperand;
+ case MCK_SoppBrTarget:
+ return Operand.isSoppBrTarget() ? Match_Success : Match_InvalidOperand;
+ default: return Match_InvalidOperand;
+ }
+}
diff --git a/lib/Target/AMDGPU/AsmParser/CMakeLists.txt b/lib/Target/AMDGPU/AsmParser/CMakeLists.txt
index 21ddc4eb83d2..70be7bb6eb36 100644
--- a/lib/Target/AMDGPU/AsmParser/CMakeLists.txt
+++ b/lib/Target/AMDGPU/AsmParser/CMakeLists.txt
@@ -1,3 +1,5 @@
add_llvm_library(LLVMAMDGPUAsmParser
AMDGPUAsmParser.cpp
)
+
+add_dependencies(LLVMAMDGPUAsmParser LLVMAMDGPUUtils)
diff --git a/lib/Target/AMDGPU/AsmParser/Makefile b/lib/Target/AMDGPU/AsmParser/Makefile
deleted file mode 100644
index 5ad219028036..000000000000
--- a/lib/Target/AMDGPU/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/AMDGPU/AsmParser/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAMDGPUAsmParser
-
-# Hack: we need to include 'main' AMDGPU target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/CIInstructions.td b/lib/Target/AMDGPU/CIInstructions.td
index c543814cae0d..f9a9f79126bd 100644
--- a/lib/Target/AMDGPU/CIInstructions.td
+++ b/lib/Target/AMDGPU/CIInstructions.td
@@ -25,14 +25,6 @@
// BUFFER_LOAD_DWORDX3
// BUFFER_STORE_DWORDX3
-
-def isCIVI : Predicate <
- "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
- "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
->, AssemblerPredicate<"FeatureCIInsts">;
-
-def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
-
//===----------------------------------------------------------------------===//
// VOP1 Instructions
//===----------------------------------------------------------------------===//
@@ -108,9 +100,11 @@ defm S_DCACHE_INV_VOL : SMRD_Inval <smrd<0x1d, 0x22>,
// MUBUF Instructions
//===----------------------------------------------------------------------===//
+let DisableSIDecoder = 1 in {
defm BUFFER_WBINVL1_VOL : MUBUF_Invalidate <mubuf<0x70, 0x3f>,
"buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol
>;
+}
//===----------------------------------------------------------------------===//
// Flat Instructions
@@ -159,129 +153,114 @@ defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96
>;
defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <
- flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32
+ flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32, i32, atomic_swap_flat
>;
defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC <
- flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, VReg_64
+ flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, i32,
+ atomic_cmp_swap_flat, v2i32, VReg_64
>;
defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <
- flat<0x32, 0x42>, "flat_atomic_add", VGPR_32
+ flat<0x32, 0x42>, "flat_atomic_add", VGPR_32, i32, atomic_add_flat
>;
defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <
- flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32
+ flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32, i32, atomic_sub_flat
>;
defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <
- flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32
+ flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32, i32, atomic_min_flat
>;
defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <
- flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32
+ flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32, i32, atomic_umin_flat
>;
defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <
- flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32
+ flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32, i32, atomic_max_flat
>;
defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <
- flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32
+ flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32, i32, atomic_umax_flat
>;
defm FLAT_ATOMIC_AND : FLAT_ATOMIC <
- flat<0x39, 0x48>, "flat_atomic_and", VGPR_32
+ flat<0x39, 0x48>, "flat_atomic_and", VGPR_32, i32, atomic_and_flat
>;
defm FLAT_ATOMIC_OR : FLAT_ATOMIC <
- flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32
+ flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32, i32, atomic_or_flat
>;
defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <
- flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32
+ flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32, i32, atomic_xor_flat
>;
defm FLAT_ATOMIC_INC : FLAT_ATOMIC <
- flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32
+ flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32, i32, atomic_inc_flat
>;
defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <
- flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32
+ flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32, i32, atomic_dec_flat
>;
defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <
- flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64
+ flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64, i64, atomic_swap_flat
>;
defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC <
- flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, VReg_128
+ flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, i64,
+ atomic_cmp_swap_flat, v2i64, VReg_128
>;
defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <
- flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64
+ flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64, i64, atomic_add_flat
>;
defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <
- flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64
+ flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64, i64, atomic_sub_flat
>;
defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <
- flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64
+ flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64, i64, atomic_min_flat
>;
defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <
- flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64
+ flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64, i64, atomic_umin_flat
>;
defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <
- flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64
+ flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64, i64, atomic_max_flat
>;
defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <
- flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64
+ flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64, i64, atomic_umax_flat
>;
defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <
- flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64
+ flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64, i64, atomic_and_flat
>;
defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <
- flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64
+ flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64, i64, atomic_or_flat
>;
defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <
- flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64
+ flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64, i64, atomic_xor_flat
>;
defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <
- flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64
+ flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64, i64, atomic_inc_flat
>;
defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <
- flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64
+ flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64, i64, atomic_dec_flat
>;
} // End SubtargetPredicate = isCIVI
// CI Only flat instructions
-let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst in {
+let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst, DisableVIDecoder = 1 in {
defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC <
- flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, VReg_64
+ flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, f32,
+ null_frag, v2f32, VReg_64
>;
defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <
- flat<0x3f>, "flat_atomic_fmin", VGPR_32
+ flat<0x3f>, "flat_atomic_fmin", VGPR_32, f32
>;
defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <
- flat<0x40>, "flat_atomic_fmax", VGPR_32
+ flat<0x40>, "flat_atomic_fmax", VGPR_32, f32
>;
defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC <
- flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128
+ flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, f64,
+ null_frag, v2f64, VReg_128
>;
defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <
- flat<0x5f>, "flat_atomic_fmin_x2", VReg_64
+ flat<0x5f>, "flat_atomic_fmin_x2", VReg_64, f64
>;
defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <
- flat<0x60>, "flat_atomic_fmax_x2", VReg_64
+ flat<0x60>, "flat_atomic_fmax_x2", VReg_64, f64
>;
-} // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst
-
-let Predicates = [isCI] in {
-
-// Convert (x - floor(x)) to fract(x)
-def : Pat <
- (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
- (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
- (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
->;
-
-// Convert (x + (-floor(x))) to fract(x)
-def : Pat <
- (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
- (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
->;
-
-} // End Predicates = [isCI]
-
+} // End SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst, DisableVIDecoder = 1
//===----------------------------------------------------------------------===//
// Flat Patterns
@@ -289,12 +268,17 @@ def : Pat <
let Predicates = [isCIVI] in {
-// Patterns for global loads with no offset
+// Patterns for global loads with no offset.
class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
(vt (node i64:$addr)),
(inst $addr, 0, 0, 0)
>;
+class FlatLoadAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ (vt (node i64:$addr)),
+ (inst $addr, 1, 0, 0)
+>;
+
def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>;
@@ -303,9 +287,20 @@ def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>;
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_flat_load, i32>;
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_flat_load, i64>;
+
+
class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
(node vt:$data, i64:$addr),
- (inst $data, $addr, 0, 0, 0)
+ (inst $addr, $data, 0, 0, 0)
+>;
+
+class FlatStoreAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ // Atomic stores follow the atomic binop convention, so the address comes
+ // first.
+ (node i64:$addr, vt:$data),
+ (inst $addr, $data, 1, 0, 0)
>;
def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>;
@@ -314,20 +309,41 @@ def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>;
def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>;
def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>;
-class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node i64:$addr, vt:$data)),
+def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_flat_store, i32>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_flat_store, i64>;
+
+class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt,
+ ValueType data_vt = vt> : Pat <
+ (vt (node i64:$addr, data_vt:$data)),
(inst $addr, $data, 0, 0)
>;
def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, atomic_cmp_swap_global, i32, v2i32>;
def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_and_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_or_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, atomic_cmp_swap_global, i64, v2i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
+
} // End Predicates = [isCIVI]
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index b9ef0e821763..45825c9cc76a 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -10,15 +10,30 @@ tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer)
tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler)
add_public_tablegen_target(AMDGPUCommonTableGen)
+# List of all GlobalISel files.
+set(GLOBAL_ISEL_FILES
+ AMDGPUCallLowering.cpp
+ )
+
+# Add GlobalISel files to the dependencies if the user wants to build it.
+if(LLVM_BUILD_GLOBAL_ISEL)
+ set(GLOBAL_ISEL_BUILD_FILES ${GLOBAL_ISEL_FILES})
+else()
+ set(GLOBAL_ISEL_BUILD_FILES "")
+ set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES})
+endif()
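+
+# Illustrative: the GlobalISel sources above are compiled only when the tree
+# is configured with e.g. "cmake -DLLVM_BUILD_GLOBAL_ISEL=ON ..."; otherwise
+# they are recorded as optional sources so the build system still tracks them.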
+
+
add_llvm_target(AMDGPUCodeGen
AMDILCFGStructurizer.cpp
AMDGPUAlwaysInlinePass.cpp
AMDGPUAnnotateKernelFeatures.cpp
AMDGPUAnnotateUniformValues.cpp
AMDGPUAsmPrinter.cpp
- AMDGPUDiagnosticInfoUnsupported.cpp
+ AMDGPUCodeGenPrepare.cpp
AMDGPUFrameLowering.cpp
AMDGPUTargetObjectFile.cpp
AMDGPUIntrinsicInfo.cpp
@@ -33,10 +48,12 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUInstrInfo.cpp
AMDGPUPromoteAlloca.cpp
AMDGPURegisterInfo.cpp
+ GCNHazardRecognizer.cpp
R600ClauseMergePass.cpp
R600ControlFlowFinalizer.cpp
R600EmitClauseMarkers.cpp
R600ExpandSpecialInstrs.cpp
+ R600FrameLowering.cpp
R600InstrInfo.cpp
R600ISelLowering.cpp
R600MachineFunctionInfo.cpp
@@ -44,11 +61,10 @@ add_llvm_target(AMDGPUCodeGen
R600OptimizeVectorRegisters.cpp
R600Packetizer.cpp
R600RegisterInfo.cpp
- R600TextureIntrinsicsReplacer.cpp
SIAnnotateControlFlow.cpp
+ SIDebuggerInsertNops.cpp
SIFixControlFlowLiveIntervals.cpp
SIFixSGPRCopies.cpp
- SIFixSGPRLiveRanges.cpp
SIFoldOperands.cpp
SIFrameLowering.cpp
SIInsertWaits.cpp
@@ -62,10 +78,13 @@ add_llvm_target(AMDGPUCodeGen
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
SITypeRewriter.cpp
+ SIWholeQuadMode.cpp
+ ${GLOBAL_ISEL_BUILD_FILES}
)
add_subdirectory(AsmParser)
add_subdirectory(InstPrinter)
+add_subdirectory(Disassembler)
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
add_subdirectory(Utils)
diff --git a/lib/Target/AMDGPU/CaymanInstructions.td b/lib/Target/AMDGPU/CaymanInstructions.td
index a6c3785c815b..98bc6e856ea2 100644
--- a/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/lib/Target/AMDGPU/CaymanInstructions.td
@@ -51,7 +51,6 @@ def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
-defm : Expand24UBitOps<MULLO_UINT_cm, ADD_INT>;
// RECIP_UINT emulation for Cayman
// The multiplication scales from [0,1] to the unsigned integer range
@@ -203,27 +202,53 @@ def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0,
//===----------------------------------------------------------------------===//
// 8-bit reads
-def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1,
- [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID1_8_cm : VTX_READ_8_cm <1,
+ [(set i32:$dst_gpr, (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr))]
>;
-def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1,
- [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))]
+// 16-bit reads
+def VTX_READ_ID1_16_cm : VTX_READ_16_cm <1,
+ [(set i32:$dst_gpr, (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr))]
>;
// 32-bit reads
-def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1,
- [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID1_32_cm : VTX_READ_32_cm <1,
+ [(set i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
>;
// 64-bit reads
-def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1,
- [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID1_64_cm : VTX_READ_64_cm <1,
+ [(set v2i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
>;
// 128-bit reads
-def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1,
- [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID1_128_cm : VTX_READ_128_cm <1,
+ [(set v4i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 8-bit reads
+def VTX_READ_ID2_8_cm : VTX_READ_8_cm <2,
+ [(set i32:$dst_gpr, (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr))]
+>;
+
+// 16-bit reads
+def VTX_READ_ID2_16_cm : VTX_READ_16_cm <2,
+ [(set i32:$dst_gpr, (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr))]
+>;
+
+// 32-bit reads
+def VTX_READ_ID2_32_cm : VTX_READ_32_cm <2,
+ [(set i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 64-bit reads
+def VTX_READ_ID2_64_cm : VTX_READ_64_cm <2,
+ [(set v2i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 128-bit reads
+def VTX_READ_ID2_128_cm : VTX_READ_128_cm <2,
+ [(set v4i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
>;
} // End isCayman
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
new file mode 100644
index 000000000000..e11de855fe5f
--- /dev/null
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -0,0 +1,437 @@
+//===-- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This file contains the definition of the AMDGPU ISA disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+// ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)?
+
+#include "AMDGPUDisassembler.h"
+#include "AMDGPU.h"
+#include "AMDGPURegisterInfo.h"
+#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-disassembler"
+
+typedef llvm::MCDisassembler::DecodeStatus DecodeStatus;
+
+
+inline static MCDisassembler::DecodeStatus
+addOperand(MCInst &Inst, const MCOperand& Opnd) {
+ Inst.addOperand(Opnd);
+ return Opnd.isValid() ?
+ MCDisassembler::Success :
+ MCDisassembler::SoftFail;
+}
+
+#define DECODE_OPERAND2(RegClass, DecName) \
+static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \
+ unsigned Imm, \
+ uint64_t /*Addr*/, \
+ const void *Decoder) { \
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); \
+ return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \
+}
+
+#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass)
+
+DECODE_OPERAND(VGPR_32)
+DECODE_OPERAND(VS_32)
+DECODE_OPERAND(VS_64)
+
+DECODE_OPERAND(VReg_64)
+DECODE_OPERAND(VReg_96)
+DECODE_OPERAND(VReg_128)
+
+DECODE_OPERAND(SReg_32)
+DECODE_OPERAND(SReg_32_XM0)
+DECODE_OPERAND(SReg_64)
+DECODE_OPERAND(SReg_128)
+DECODE_OPERAND(SReg_256)
+DECODE_OPERAND(SReg_512)
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AMDGPUGenSubtargetInfo.inc"
+#undef GET_SUBTARGETINFO_ENUM
+
+#include "AMDGPUGenDisassemblerTables.inc"
+
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
+ assert(Bytes.size() >= sizeof(T));
+ const auto Res = support::endian::read<T, support::endianness::little>(Bytes.data());
+ Bytes = Bytes.slice(sizeof(T));
+ return Res;
+}
+
+DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table,
+ MCInst &MI,
+ uint64_t Inst,
+ uint64_t Address) const {
+ assert(MI.getOpcode() == 0);
+ assert(MI.getNumOperands() == 0);
+ MCInst TmpInst;
+ const auto SavedBytes = Bytes;
+ if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) {
+ MI = TmpInst;
+ return MCDisassembler::Success;
+ }
+ Bytes = SavedBytes;
+ return MCDisassembler::Fail;
+}
+
+DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes_,
+ uint64_t Address,
+ raw_ostream &WS,
+ raw_ostream &CS) const {
+ CommentStream = &CS;
+
+ // ToDo: AMDGPUDisassembler supports only VI ISA.
+ assert(AMDGPU::isVI(STI) && "Can disassemble only VI ISA.");
+
+ const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size());
+ Bytes = Bytes_.slice(0, MaxInstBytesNum);
+
+ DecodeStatus Res = MCDisassembler::Fail;
+ do {
+ // ToDo: it would be better to switch on encoding length using some bit
+ // predicate, but it is unknown yet, so try everything we can.
+
+ // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
+ // encodings
+ if (Bytes.size() >= 8) {
+ const uint64_t QW = eatBytes<uint64_t>(Bytes);
+ Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address);
+ if (Res) break;
+
+ Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address);
+ if (Res) break;
+ }
+
+ // Reinitialize Bytes as DPP64 could have eaten too much
+ Bytes = Bytes_.slice(0, MaxInstBytesNum);
+
+ // Try decode 32-bit instruction
+ if (Bytes.size() < 4) break;
+ const uint32_t DW = eatBytes<uint32_t>(Bytes);
+ Res = tryDecodeInst(DecoderTableVI32, MI, DW, Address);
+ if (Res) break;
+
+ Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address);
+ if (Res) break;
+
+ if (Bytes.size() < 4) break;
+ const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;
+ Res = tryDecodeInst(DecoderTableVI64, MI, QW, Address);
+ if (Res) break;
+
+ Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address);
+ } while (false);
+
+ Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0;
+ return Res;
+}
+
+const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
+ return getContext().getRegisterInfo()->
+ getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
+}
+
+inline
+MCOperand AMDGPUDisassembler::errOperand(unsigned V,
+ const Twine& ErrMsg) const {
+ *CommentStream << "Error: " + ErrMsg;
+
+ // ToDo: add support for error operands to MCInst.h
+ // return MCOperand::createError(V);
+ return MCOperand();
+}
+
+inline
+MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const {
+ return MCOperand::createReg(RegId);
+}
+
+inline
+MCOperand AMDGPUDisassembler::createRegOperand(unsigned RegClassID,
+ unsigned Val) const {
+ const auto& RegCl = AMDGPUMCRegisterClasses[RegClassID];
+ if (Val >= RegCl.getNumRegs())
+ return errOperand(Val, Twine(getRegClassName(RegClassID)) +
+ ": unknown register " + Twine(Val));
+ return createRegOperand(RegCl.getRegister(Val));
+}
+
+inline
+MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
+ unsigned Val) const {
+ // ToDo: SI/CI have 104 SGPRs, VI has 102.
+ // Valery: here we accept as much as we can; let the assembler sort it out.
+ int shift = 0;
+ switch (SRegClassID) {
+ case AMDGPU::SGPR_32RegClassID:
+ case AMDGPU::TTMP_32RegClassID:
+ break;
+ case AMDGPU::SGPR_64RegClassID:
+ case AMDGPU::TTMP_64RegClassID:
+ shift = 1;
+ break;
+ case AMDGPU::SGPR_128RegClassID:
+ case AMDGPU::TTMP_128RegClassID:
+ // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
+ // this bundle?
+ case AMDGPU::SReg_256RegClassID:
+ // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
+ // this bundle?
+ case AMDGPU::SReg_512RegClassID:
+ shift = 2;
+ break;
+ // ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in
+ // this bundle?
+ default:
+ assert(false);
+ break;
+ }
+ if (Val % (1 << shift))
+ *CommentStream << "Warning: " << getRegClassName(SRegClassID)
+ << ": scalar reg isn't aligned " << Val;
+ return createRegOperand(SRegClassID, Val >> shift);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VS_32(unsigned Val) const {
+ return decodeSrcOp(OPW32, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const {
+ return decodeSrcOp(OPW64, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
+ return createRegOperand(AMDGPU::VGPR_32RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_64RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_96(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_96RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_128(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_128RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const {
+ // The table-gen'erated disassembler doesn't care about operand types, leaving
+ // only the register class, so an SSrc_32 operand turns into SReg_32 and
+ // therefore we accept immediates and literals here as well.
+ return decodeSrcOp(OPW32, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0(unsigned Val) const {
+ // SReg_32_XM0 is SReg_32 without M0
+ return decodeOperand_SReg_32(Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const {
+ // see decodeOperand_SReg_32 comment
+ return decodeSrcOp(OPW64, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_128(unsigned Val) const {
+ return decodeSrcOp(OPW128, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_256(unsigned Val) const {
+ return createSRegOperand(AMDGPU::SReg_256RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const {
+ return createSRegOperand(AMDGPU::SReg_512RegClassID, Val);
+}
+
+
+MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
+ // For now all literal constants are supposed to be unsigned integer
+ // ToDo: deal with signed/unsigned 64-bit integer constants
+ // ToDo: deal with float/double constants
+ if (Bytes.size() < 4)
+ return errOperand(0, "cannot read literal, inst bytes left " +
+ Twine(Bytes.size()));
+ return MCOperand::createImm(eatBytes<uint32_t>(Bytes));
+}
+
+MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
+ using namespace AMDGPU::EncValues;
+ assert(Imm >= INLINE_INTEGER_C_MIN && Imm <= INLINE_INTEGER_C_MAX);
+ return MCOperand::createImm((Imm <= INLINE_INTEGER_C_POSITIVE_MAX) ?
+ (static_cast<int64_t>(Imm) - INLINE_INTEGER_C_MIN) :
+ (INLINE_INTEGER_C_POSITIVE_MAX - static_cast<int64_t>(Imm)));
+ // Cast prevents negative overflow.
+}
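+
+// Worked example, assuming the usual GCN encoding values
+// (INLINE_INTEGER_C_MIN = 128, INLINE_INTEGER_C_POSITIVE_MAX = 192,
+// INLINE_INTEGER_C_MAX = 208): 128 -> 0, 129 -> 1, ..., 192 -> 64,
+// then 193 -> -1, ..., 208 -> -16.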
+
+MCOperand AMDGPUDisassembler::decodeFPImmed(bool Is32, unsigned Imm) {
+ assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN
+ && Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);
+ // ToDo: case 248: 1/(2*PI) - is allowed only on VI
+ // ToDo: AMDGPUInstPrinter does not support 1/(2*PI). It considers 1/(2*PI)
+ // a literal constant.
+ float V = 0.0f;
+ switch (Imm) {
+ case 240: V = 0.5f; break;
+ case 241: V = -0.5f; break;
+ case 242: V = 1.0f; break;
+ case 243: V = -1.0f; break;
+ case 244: V = 2.0f; break;
+ case 245: V = -2.0f; break;
+ case 246: V = 4.0f; break;
+ case 247: V = -4.0f; break;
+ case 248: return MCOperand::createImm(Is32 ? // 1/(2*PI)
+ 0x3e22f983 :
+ 0x3fc45f306dc9c882);
+ default: break;
+ }
+ return MCOperand::createImm(Is32? FloatToBits(V) : DoubleToBits(V));
+}
+
+unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
+ using namespace AMDGPU;
+ assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
+ switch (Width) {
+ default: // fallthrough
+ case OPW32: return VGPR_32RegClassID;
+ case OPW64: return VReg_64RegClassID;
+ case OPW128: return VReg_128RegClassID;
+ }
+}
+
+unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
+ using namespace AMDGPU;
+ assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
+ switch (Width) {
+ default: // fallthrough
+ case OPW32: return SGPR_32RegClassID;
+ case OPW64: return SGPR_64RegClassID;
+ case OPW128: return SGPR_128RegClassID;
+ }
+}
+
+unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
+ using namespace AMDGPU;
+ assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
+ switch (Width) {
+ default: // fallthrough
+ case OPW32: return TTMP_32RegClassID;
+ case OPW64: return TTMP_64RegClassID;
+ case OPW128: return TTMP_128RegClassID;
+ }
+}
+
+MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) const {
+ using namespace AMDGPU::EncValues;
+ assert(Val < 512); // enum9
+
+ if (VGPR_MIN <= Val && Val <= VGPR_MAX) {
+ return createRegOperand(getVgprClassId(Width), Val - VGPR_MIN);
+ }
+ if (Val <= SGPR_MAX) {
+ assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes a compilation warning.
+ return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
+ }
+ if (TTMP_MIN <= Val && Val <= TTMP_MAX) {
+ return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN);
+ }
+
+ assert(Width == OPW32 || Width == OPW64);
+ const bool Is32 = (Width == OPW32);
+
+ if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
+ return decodeIntImmed(Val);
+
+ if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
+ return decodeFPImmed(Is32, Val);
+
+ if (Val == LITERAL_CONST)
+ return decodeLiteralConstant();
+
+ return Is32 ? decodeSpecialReg32(Val) : decodeSpecialReg64(Val);
+}
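+
+// Summary of the 9-bit source-operand space handled above (boundaries follow
+// the usual GCN encoding and are assumed to match EncValues): 0..101 SGPRs,
+// 112..123 trap temporaries (TTMP), 128..208 inline integer constants,
+// 240..248 inline float constants, 255 = a 32-bit literal follows the
+// instruction word, 256..511 VGPRs; the remaining values are special
+// registers handled by decodeSpecialReg32/64 below.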
+
+MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
+ using namespace AMDGPU;
+ switch (Val) {
+ case 102: return createRegOperand(getMCReg(FLAT_SCR_LO, STI));
+ case 103: return createRegOperand(getMCReg(FLAT_SCR_HI, STI));
+ // ToDo: no support for xnack_mask_lo/_hi register
+ case 104:
+ case 105: break;
+ case 106: return createRegOperand(VCC_LO);
+ case 107: return createRegOperand(VCC_HI);
+ case 108: return createRegOperand(TBA_LO);
+ case 109: return createRegOperand(TBA_HI);
+ case 110: return createRegOperand(TMA_LO);
+ case 111: return createRegOperand(TMA_HI);
+ case 124: return createRegOperand(M0);
+ case 126: return createRegOperand(EXEC_LO);
+ case 127: return createRegOperand(EXEC_HI);
+ // ToDo: no support for vccz register
+ case 251: break;
+ // ToDo: no support for execz register
+ case 252: break;
+ case 253: return createRegOperand(SCC);
+ default: break;
+ }
+ return errOperand(Val, "unknown operand encoding " + Twine(Val));
+}
+
+MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
+ using namespace AMDGPU;
+ switch (Val) {
+ case 102: return createRegOperand(getMCReg(FLAT_SCR, STI));
+ case 106: return createRegOperand(VCC);
+ case 108: return createRegOperand(TBA);
+ case 110: return createRegOperand(TMA);
+ case 126: return createRegOperand(EXEC);
+ default: break;
+ }
+ return errOperand(Val, "unknown operand encoding " + Twine(Val));
+}
+
+static MCDisassembler *createAMDGPUDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new AMDGPUDisassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializeAMDGPUDisassembler() {
+ TargetRegistry::RegisterMCDisassembler(TheGCNTarget, createAMDGPUDisassembler);
+}
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
new file mode 100644
index 000000000000..dff26a044bf5
--- /dev/null
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -0,0 +1,93 @@
+//===-- AMDGPUDisassembler.h - Disassembler for AMDGPU ISA -----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This file contains the declaration of the AMDGPU ISA disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
+#define LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+
+namespace llvm {
+
+ class MCContext;
+ class MCInst;
+ class MCOperand;
+ class MCSubtargetInfo;
+ class Twine;
+
+ class AMDGPUDisassembler : public MCDisassembler {
+ private:
+ mutable ArrayRef<uint8_t> Bytes;
+
+ public:
+ AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+ MCDisassembler(STI, Ctx) {}
+
+ ~AMDGPUDisassembler() {}
+
+ DecodeStatus getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &WS, raw_ostream &CS) const override;
+
+ const char* getRegClassName(unsigned RegClassID) const;
+
+ MCOperand createRegOperand(unsigned int RegId) const;
+ MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const;
+ MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const;
+
+ MCOperand errOperand(unsigned V, const llvm::Twine& ErrMsg) const;
+
+ DecodeStatus tryDecodeInst(const uint8_t* Table,
+ MCInst &MI,
+ uint64_t Inst,
+ uint64_t Address) const;
+
+ MCOperand decodeOperand_VGPR_32(unsigned Val) const;
+ MCOperand decodeOperand_VS_32(unsigned Val) const;
+ MCOperand decodeOperand_VS_64(unsigned Val) const;
+
+ MCOperand decodeOperand_VReg_64(unsigned Val) const;
+ MCOperand decodeOperand_VReg_96(unsigned Val) const;
+ MCOperand decodeOperand_VReg_128(unsigned Val) const;
+
+ MCOperand decodeOperand_SReg_32(unsigned Val) const;
+ MCOperand decodeOperand_SReg_32_XM0(unsigned Val) const;
+ MCOperand decodeOperand_SReg_64(unsigned Val) const;
+ MCOperand decodeOperand_SReg_128(unsigned Val) const;
+ MCOperand decodeOperand_SReg_256(unsigned Val) const;
+ MCOperand decodeOperand_SReg_512(unsigned Val) const;
+
+ enum OpWidthTy {
+ OPW32,
+ OPW64,
+ OPW128,
+ OPW_LAST_,
+ OPW_FIRST_ = OPW32
+ };
+ unsigned getVgprClassId(const OpWidthTy Width) const;
+ unsigned getSgprClassId(const OpWidthTy Width) const;
+ unsigned getTtmpClassId(const OpWidthTy Width) const;
+
+ static MCOperand decodeIntImmed(unsigned Imm);
+ static MCOperand decodeFPImmed(bool Is32, unsigned Imm);
+ MCOperand decodeLiteralConstant() const;
+
+ MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
+ MCOperand decodeSpecialReg32(unsigned Val) const;
+ MCOperand decodeSpecialReg64(unsigned Val) const;
+ };
+} // namespace llvm
+
+#endif //LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
diff --git a/lib/Target/AMDGPU/Disassembler/CMakeLists.txt b/lib/Target/AMDGPU/Disassembler/CMakeLists.txt
new file mode 100644
index 000000000000..fb9231576919
--- /dev/null
+++ b/lib/Target/AMDGPU/Disassembler/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMAMDGPUDisassembler
+ AMDGPUDisassembler.cpp
+ )
+
+add_dependencies(LLVMAMDGPUDisassembler AMDGPUCommonTableGen LLVMAMDGPUUtils)
diff --git a/lib/Target/AMDGPU/Disassembler/LLVMBuild.txt b/lib/Target/AMDGPU/Disassembler/LLVMBuild.txt
new file mode 100644
index 000000000000..c9005f8a7884
--- /dev/null
+++ b/lib/Target/AMDGPU/Disassembler/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/AMDGPU/Disassembler/LLVMBuild.txt ------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = AMDGPUDisassembler
+parent = AMDGPU
+required_libraries = AMDGPUDesc AMDGPUInfo AMDGPUUtils MC MCDisassembler Support
+add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td
index 2245f1417e53..94f05cc41aff 100644
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -85,8 +85,6 @@ def COS_eg : COS_Common<0x8E>;
def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
-defm : Expand24IBitOps<MULLO_INT_eg, ADD_INT>;
-
//===----------------------------------------------------------------------===//
// Memory read/write instructions
//===----------------------------------------------------------------------===//
@@ -212,23 +210,23 @@ class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
// VTX Read from parameter memory space
//===----------------------------------------------------------------------===//
-def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0,
+def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <3,
[(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))]
>;
-def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0,
+def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <3,
[(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))]
>;
-def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
+def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <3,
[(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
>;
-def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0,
+def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <3,
[(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
>;
-def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
+def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <3,
[(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
>;
@@ -237,27 +235,53 @@ def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
//===----------------------------------------------------------------------===//
// 8-bit reads
-def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1,
- [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID1_8_eg : VTX_READ_8_eg <1,
+ [(set i32:$dst_gpr, (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr))]
+>;
+
+// 16-bit reads
+def VTX_READ_ID1_16_eg : VTX_READ_16_eg <1,
+ [(set i32:$dst_gpr, (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr))]
+>;
+
+// 32-bit reads
+def VTX_READ_ID1_32_eg : VTX_READ_32_eg <1,
+ [(set i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 64-bit reads
+def VTX_READ_ID1_64_eg : VTX_READ_64_eg <1,
+ [(set v2i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 128-bit reads
+def VTX_READ_ID1_128_eg : VTX_READ_128_eg <1,
+ [(set v4i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 8-bit reads
+def VTX_READ_ID2_8_eg : VTX_READ_8_eg <2,
+ [(set i32:$dst_gpr, (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr))]
>;
-def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1,
- [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))]
+// 16-bit reads
+def VTX_READ_ID2_16_eg : VTX_READ_16_eg <2,
+ [(set i32:$dst_gpr, (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr))]
>;
// 32-bit reads
-def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
- [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID2_32_eg : VTX_READ_32_eg <2,
+ [(set i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
>;
// 64-bit reads
-def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1,
- [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID2_64_eg : VTX_READ_64_eg <2,
+ [(set v2i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
>;
// 128-bit reads
-def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
- [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID2_128_eg : VTX_READ_128_eg <2,
+ [(set v4i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
>;
} // End Predicates = [isEG]
@@ -356,8 +380,6 @@ let hasSideEffects = 1 in {
def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
}
-def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>;
-
def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
let Pattern = [];
let Itinerary = AnyALU;
@@ -372,7 +394,7 @@ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
def GROUP_BARRIER : InstR600 <
- (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>,
+ (outs), (ins), " GROUP_BARRIER", [(int_r600_group_barrier)], AnyALU>,
R600ALU_Word0,
R600ALU_Word1_OP2 <0x54> {
@@ -401,11 +423,6 @@ def GROUP_BARRIER : InstR600 <
let ALUInst = 1;
}
-def : Pat <
- (int_AMDGPU_barrier_global),
- (GROUP_BARRIER)
->;
-
//===----------------------------------------------------------------------===//
// LDS Instructions
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
new file mode 100644
index 000000000000..29b1f79187d5
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -0,0 +1,264 @@
+//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on GCN processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNHazardRecognizer.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Hazard Recognizer Implementation
+//===----------------------------------------------------------------------===//
+
+GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
+ CurrCycleInstr(nullptr),
+ MF(MF),
+ ST(MF.getSubtarget<SISubtarget>()) {
+ MaxLookAhead = 5;
+}
+
+void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
+ EmitInstruction(SU->getInstr());
+}
+
+void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
+ CurrCycleInstr = MI;
+}
+
+ScheduleHazardRecognizer::HazardType
+GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+ MachineInstr *MI = SU->getInstr();
+
+ if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
+ return NoopHazard;
+
+ if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
+ return NoopHazard;
+
+ if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
+ return NoopHazard;
+
+ return NoHazard;
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
+ return PreEmitNoops(SU->getInstr());
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+ if (SIInstrInfo::isSMRD(*MI))
+ return std::max(0, checkSMRDHazards(MI));
+
+ if (SIInstrInfo::isVMEM(*MI))
+ return std::max(0, checkVMEMHazards(MI));
+
+ if (SIInstrInfo::isDPP(*MI))
+ return std::max(0, checkDPPHazards(MI));
+
+ return 0;
+}
+
+void GCNHazardRecognizer::EmitNoop() {
+ EmittedInstrs.push_front(nullptr);
+}
+
+void GCNHazardRecognizer::AdvanceCycle() {
+
+ // When the scheduler detects a stall, it will call AdvanceCycle() without
+ // emitting any instructions.
+ if (!CurrCycleInstr)
+ return;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ unsigned NumWaitStates = TII->getNumWaitStates(*CurrCycleInstr);
+
+ // Keep track of emitted instructions
+ EmittedInstrs.push_front(CurrCycleInstr);
+
+ // Add a nullptr for each additional wait state after the first. Make sure
+ // not to add more than getMaxLookAhead() items to the list, since we
+ // truncate the list to that size right after this loop.
+ for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
+ i < e; ++i) {
+ EmittedInstrs.push_front(nullptr);
+ }
+
+ // getMaxLookahead() is the largest number of wait states we will ever need
+ // to insert, so there is no point in keeping track of more than that many
+ // wait states.
+ EmittedInstrs.resize(getMaxLookAhead());
+
+ CurrCycleInstr = nullptr;
+}
+
+void GCNHazardRecognizer::RecedeCycle() {
+ llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+int GCNHazardRecognizer::getWaitStatesSinceDef(
+ unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ int WaitStates = -1;
+ for (MachineInstr *MI : EmittedInstrs) {
+ ++WaitStates;
+ if (!MI || !IsHazardDef(MI))
+ continue;
+ if (MI->modifiesRegister(Reg, TRI))
+ return WaitStates;
+ }
+ return std::numeric_limits<int>::max();
+}
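+
+// Example: with EmittedInstrs = [mi0, nullptr, mi2] (most recent first), where
+// only mi2 satisfies IsHazardDef and modifies Reg, this returns 2: one emitted
+// instruction plus one idle cycle have elapsed since the def.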
+
+//===----------------------------------------------------------------------===//
+// No-op Hazard Detection
+//===----------------------------------------------------------------------===//
+
+static void addRegsToSet(iterator_range<MachineInstr::const_mop_iterator> Ops,
+ std::set<unsigned> &Set) {
+ for (const MachineOperand &Op : Ops) {
+ if (Op.isReg())
+ Set.insert(Op.getReg());
+ }
+}
+
+int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
+ // SMEM soft clauses are only present on VI+.
+ if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ return 0;
+
+ // A soft-clause is any group of consecutive SMEM instructions. The
+ // instructions in this group may return out of order and/or may be
+ // replayed (i.e. the same instruction issued more than once).
+ //
+ // In order to handle these situations correctly we need to make sure
+ // that when a clause has more than one instruction, no instruction in the
+ // clause writes to a register that is read by another instruction in the
+ // clause (including itself). If we encounter this situation, we need to break
+ // the clause by inserting a non-SMEM instruction.
+
+ std::set<unsigned> ClauseDefs;
+ std::set<unsigned> ClauseUses;
+
+ for (MachineInstr *MI : EmittedInstrs) {
+
+ // Once we hit a non-SMEM instruction, we have passed the start of the
+ // clause and can stop.
+ if (!MI || !SIInstrInfo::isSMRD(*MI))
+ break;
+
+ addRegsToSet(MI->defs(), ClauseDefs);
+ addRegsToSet(MI->uses(), ClauseUses);
+ }
+
+ if (ClauseDefs.empty())
+ return 0;
+
+ // FIXME: When we support stores, we need to make sure not to put loads and
+ // stores in the same clause if they use the same address. For now, just
+ // start a new clause whenever we see a store.
+ if (SMEM->mayStore())
+ return 1;
+
+ addRegsToSet(SMEM->defs(), ClauseDefs);
+ addRegsToSet(SMEM->uses(), ClauseUses);
+
+ std::vector<unsigned> Result(std::max(ClauseDefs.size(), ClauseUses.size()));
+ std::vector<unsigned>::iterator End;
+
+ End = std::set_intersection(ClauseDefs.begin(), ClauseDefs.end(),
+ ClauseUses.begin(), ClauseUses.end(), Result.begin());
+
+ // If the set of defs and uses intersect then we cannot add this instruction
+ // to the clause, so we have a hazard.
+ if (End != Result.begin())
+ return 1;
+
+ return 0;
+}
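The def/use overlap test above can be isolated into a self-contained sketch (hypothetical register numbers; note that std::min is a sufficient bound for the scratch vector, since an intersection is never larger than the smaller input set):

    #include <algorithm>
    #include <set>
    #include <vector>

    // True when some register is both written and read within the clause,
    // i.e. the clause has to be broken with a non-SMEM instruction.
    static bool clauseHasOverlap(const std::set<unsigned> &Defs,
                                 const std::set<unsigned> &Uses) {
      std::vector<unsigned> Tmp(std::min(Defs.size(), Uses.size()));
      auto End = std::set_intersection(Defs.begin(), Defs.end(),
                                       Uses.begin(), Uses.end(), Tmp.begin());
      return End != Tmp.begin();
    }

    int main() {
      std::set<unsigned> Defs = {10, 11}; // hypothetical defs within the clause
      std::set<unsigned> Uses = {11, 20}; // hypothetical uses within the clause
      return clauseHasOverlap(Defs, Uses) ? 0 : 1; // overlap on register 11
    }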
+
+int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ int WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD);
+
+ // This SMRD hazard only affects SI.
+ if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS)
+ return WaitStatesNeeded;
+
+ // A read of an SGPR by an SMRD instruction requires 4 wait states when the
+ // SGPR was written by a VALU instruction.
+ int SmrdSgprWaitStates = 4;
+ auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+
+ for (const MachineOperand &Use : SMRD->uses()) {
+ if (!Use.isReg())
+ continue;
+ int WaitStatesNeededForUse =
+ SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+ return WaitStatesNeeded;
+}
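Worked example for the arithmetic above (hypothetical schedule): if a VALU instruction wrote one of the SMRD's SGPR operands one slot earlier, getWaitStatesSinceDef returns 1 and the load needs 4 - 1 = 3 more wait states. If no VALU def is found in the window, the std::numeric_limits<int>::max() sentinel drives the subtraction far negative and the std::max keeps whatever the soft-clause check already required.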
+
+int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ return 0;
+
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+ // A read of an SGPR by a VMEM instruction requires 5 wait states when the
+ // SGPR was written by a VALU instruction.
+ int VmemSgprWaitStates = 5;
+ int WaitStatesNeeded = 0;
+ auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+
+ for (const MachineOperand &Use : VMEM->uses()) {
+ if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+ continue;
+
+ int WaitStatesNeededForUse =
+ VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+ return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ // Check for DPP VGPR read after VALU VGPR write.
+ int DppVgprWaitStates = 2;
+ int WaitStatesNeeded = 0;
+
+ for (const MachineOperand &Use : DPP->uses()) {
+ if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
+ continue;
+ int WaitStatesNeededForUse =
+ DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg());
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+
+ return WaitStatesNeeded;
+}
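For context, a hedged sketch of how a driver loop might consume this interface; the real caller is LLVM's generic post-RA machinery, this loop is illustrative only, and materializing actual s_nop instructions is the caller's responsibility:

    #include "GCNHazardRecognizer.h"
    #include "llvm/ADT/ArrayRef.h"

    static void emitWithHazardChecks(llvm::GCNHazardRecognizer &HR,
                                     llvm::ArrayRef<llvm::MachineInstr *> Seq) {
      for (llvm::MachineInstr *MI : Seq) {
        for (unsigned N = HR.PreEmitNoops(MI); N > 0; --N)
          HR.EmitNoop();          // each noop fills one wait-state slot
        HR.EmitInstruction(MI);   // record MI as this cycle's instruction
        HR.AdvanceCycle();        // push it into the lookback history
      }
    }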
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h
new file mode 100644
index 000000000000..d82041c5f174
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -0,0 +1,62 @@
+//===-- GCNHazardRecognizer.h - GCN Hazard Recognizers ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on GCN processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNHAZARDRECOGNIZER_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNHAZARDRECOGNIZER_H
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include <list>
+
+namespace llvm {
+
+class MachineFunction;
+class MachineInstr;
+class ScheduleDAG;
+class SIInstrInfo;
+class SISubtarget;
+
+class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
+ // This variable stores the instruction that has been emitted this cycle. It
+ // will be added to EmittedInstrs when AdvanceCycle() or RecedeCycle() is
+ // called.
+ MachineInstr *CurrCycleInstr;
+ std::list<MachineInstr*> EmittedInstrs;
+ const MachineFunction &MF;
+ const SISubtarget &ST;
+
+ int getWaitStatesSinceDef(unsigned Reg,
+ function_ref<bool(MachineInstr *)> IsHazardDef =
+ [](MachineInstr *) { return true; });
+
+ int checkSMEMSoftClauseHazards(MachineInstr *SMEM);
+ int checkSMRDHazards(MachineInstr *SMRD);
+ int checkVMEMHazards(MachineInstr* VMEM);
+ int checkDPPHazards(MachineInstr *DPP);
+public:
+ GCNHazardRecognizer(const MachineFunction &MF);
+ // We can only issue one instruction per cycle.
+ bool atIssueLimit() const override { return true; }
+ void EmitInstruction(SUnit *SU) override;
+ void EmitInstruction(MachineInstr *MI) override;
+ HazardType getHazardType(SUnit *SU, int Stalls) override;
+ void EmitNoop() override;
+ unsigned PreEmitNoops(SUnit *SU) override;
+ unsigned PreEmitNoops(MachineInstr *) override;
+ void AdvanceCycle() override;
+ void RecedeCycle() override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNHAZARDRECOGNIZER_H
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index a187de88f639..2932d3bb1580 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
+#include "Utils/AMDGPUAsmUtils.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -18,6 +19,8 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <string>
+
using namespace llvm;
void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
@@ -28,6 +31,11 @@ void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
printAnnotation(OS, Annot);
}
+void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << formatHex(MI->getOperand(OpNo).getImm() & 0xf);
+}
+
void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
O << formatHex(MI->getOperand(OpNo).getImm() & 0xff);
@@ -43,6 +51,11 @@ void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff);
}
+void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << formatDec(MI->getOperand(OpNo).getImm() & 0xf);
+}
+
void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
O << formatDec(MI->getOperand(OpNo).getImm() & 0xff);
@@ -53,22 +66,26 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
}
+void AMDGPUInstPrinter::printNamedBit(const MCInst* MI, unsigned OpNo,
+ raw_ostream& O, StringRef BitName) {
+ if (MI->getOperand(OpNo).getImm()) {
+ O << ' ' << BitName;
+ }
+}
+
void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " offen";
+ printNamedBit(MI, OpNo, O, "offen");
}
void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " idxen";
+ printNamedBit(MI, OpNo, O, "idxen");
}
void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " addr64";
+ printNamedBit(MI, OpNo, O, "addr64");
}
void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo,
@@ -79,7 +96,7 @@ void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
uint16_t Imm = MI->getOperand(OpNo).getImm();
if (Imm != 0) {
@@ -88,7 +105,7 @@ void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
if (MI->getOperand(OpNo).getImm()) {
O << " offset0:";
@@ -96,7 +113,7 @@ void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printOffset1(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
if (MI->getOperand(OpNo).getImm()) {
O << " offset1:";
@@ -104,28 +121,62 @@ void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printSMRDOffset(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printU32ImmOperand(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printU32ImmOperand(MI, OpNo, O);
+}
+
void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " gds";
+ printNamedBit(MI, OpNo, O, "gds");
}
void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " glc";
+ printNamedBit(MI, OpNo, O, "glc");
}
void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " slc";
+ printNamedBit(MI, OpNo, O, "slc");
}
void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " tfe";
+ printNamedBit(MI, OpNo, O, "tfe");
+}
+
+void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm()) {
+ O << " dmask:";
+ printU16ImmOperand(MI, OpNo, O);
+ }
+}
+
+void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printNamedBit(MI, OpNo, O, "unorm");
+}
+
+void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printNamedBit(MI, OpNo, O, "da");
+}
+
+void AMDGPUInstPrinter::printR128(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printNamedBit(MI, OpNo, O, "r128");
+}
+
+void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printNamedBit(MI, OpNo, O, "lwe");
}
void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O,
@@ -152,6 +203,18 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O,
case AMDGPU::VCC_HI:
O << "vcc_hi";
return;
+ case AMDGPU::TBA_LO:
+ O << "tba_lo";
+ return;
+ case AMDGPU::TBA_HI:
+ O << "tba_hi";
+ return;
+ case AMDGPU::TMA_LO:
+ O << "tma_lo";
+ return;
+ case AMDGPU::TMA_HI:
+ O << "tma_hi";
+ return;
case AMDGPU::EXEC_LO:
O << "exec_lo";
return;
@@ -168,62 +231,73 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O,
break;
}
- char Type;
- unsigned NumRegs;
+ // The low 8 bits of the encoding value are the register index, for both VGPRs
+ // and SGPRs.
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1);
+ unsigned NumRegs;
if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) {
- Type = 'v';
+ O << 'v';
NumRegs = 1;
} else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) {
- Type = 's';
+ O << 's';
NumRegs = 1;
} else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) {
- Type = 'v';
+ O << 'v';
NumRegs = 2;
- } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) {
- Type = 's';
+ } else if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(reg)) {
+ O << 's';
NumRegs = 2;
} else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) {
- Type = 'v';
+ O << 'v';
NumRegs = 4;
- } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) {
- Type = 's';
+ } else if (MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(reg)) {
+ O << 's';
NumRegs = 4;
} else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) {
- Type = 'v';
+ O << 'v';
NumRegs = 3;
} else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) {
- Type = 'v';
+ O << 'v';
NumRegs = 8;
} else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) {
- Type = 's';
+ O << 's';
NumRegs = 8;
} else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) {
- Type = 'v';
+ O << 'v';
NumRegs = 16;
} else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) {
- Type = 's';
+ O << 's';
NumRegs = 16;
+ } else if (MRI.getRegClass(AMDGPU::TTMP_64RegClassID).contains(reg)) {
+ O << "ttmp";
+ NumRegs = 2;
+ RegIdx -= 112; // Trap temps start at offset 112. TODO: Get this from tablegen.
+ } else if (MRI.getRegClass(AMDGPU::TTMP_128RegClassID).contains(reg)) {
+ O << "ttmp";
+ NumRegs = 4;
+ RegIdx -= 112; // Trap temps start at offset 112. TODO: Get this from tablegen.
} else {
O << getRegisterName(reg);
return;
}
- // The low 8 bits of the encoding value is the register index, for both VGPRs
- // and SGPRs.
- unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1);
if (NumRegs == 1) {
- O << Type << RegIdx;
+ O << RegIdx;
return;
}
- O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
+ O << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
}
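For example, a 64-bit trap-temp register whose low 8 encoding bits are 114 (a hypothetical value chosen to show the offset-112 adjustment) now prints as ttmp[2:3]: RegIdx = 114 - 112 = 2 and NumRegs = 2.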
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3)
O << "_e64 ";
+ else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP)
+ O << "_dpp ";
+ else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA)
+ O << "_sdwa ";
else
O << "_e32 ";
@@ -345,12 +419,13 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCExpr *Exp = Op.getExpr();
Exp->print(O, &MAI);
} else {
- llvm_unreachable("unknown operand type in printOperand");
+ O << "/*INV_OP*/";
}
}
-void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
+ unsigned OpNo,
+ raw_ostream &O) {
unsigned InputModifiers = MI->getOperand(OpNo).getImm();
if (InputModifiers & SISrcMods::NEG)
O << '-';
@@ -361,6 +436,122 @@ void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo,
O << '|';
}
+void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
+ unsigned OpNo,
+ raw_ostream &O) {
+ unsigned InputModifiers = MI->getOperand(OpNo).getImm();
+ if (InputModifiers & SISrcMods::SEXT)
+ O << "sext(";
+ printOperand(MI, OpNo + 1, O);
+ if (InputModifiers & SISrcMods::SEXT)
+ O << ')';
+}
+
+void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ if (Imm <= 0x0ff) {
+ O << " quad_perm:[";
+ O << formatDec(Imm & 0x3) << ',';
+ O << formatDec((Imm & 0xc) >> 2) << ',';
+ O << formatDec((Imm & 0x30) >> 4) << ',';
+ O << formatDec((Imm & 0xc0) >> 6) << ']';
+ } else if ((Imm >= 0x101) && (Imm <= 0x10f)) {
+ O << " row_shl:";
+ printU4ImmDecOperand(MI, OpNo, O);
+ } else if ((Imm >= 0x111) && (Imm <= 0x11f)) {
+ O << " row_shr:";
+ printU4ImmDecOperand(MI, OpNo, O);
+ } else if ((Imm >= 0x121) && (Imm <= 0x12f)) {
+ O << " row_ror:";
+ printU4ImmDecOperand(MI, OpNo, O);
+ } else if (Imm == 0x130) {
+ O << " wave_shl:1";
+ } else if (Imm == 0x134) {
+ O << " wave_rol:1";
+ } else if (Imm == 0x138) {
+ O << " wave_shr:1";
+ } else if (Imm == 0x13c) {
+ O << " wave_ror:1";
+ } else if (Imm == 0x140) {
+ O << " row_mirror";
+ } else if (Imm == 0x141) {
+ O << " row_half_mirror";
+ } else if (Imm == 0x142) {
+ O << " row_bcast:15";
+ } else if (Imm == 0x143) {
+ O << " row_bcast:31";
+ } else {
+ llvm_unreachable("Invalid dpp_ctrl value");
+ }
+}
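Two worked decodes of the ranges above: Imm = 0x36 (binary 00110110) falls in the quad_perm range and prints quad_perm:[2,1,3,0], reading two bits per lane selector from the low byte; Imm = 0x103 falls in the row_shl range and prints row_shl:3, since printU4ImmDecOperand keeps only the low nibble.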
+
+void AMDGPUInstPrinter::printRowMask(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << " row_mask:";
+ printU4ImmOperand(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printBankMask(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << " bank_mask:";
+ printU4ImmOperand(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ if (Imm) {
+ O << " bound_ctrl:0"; // XXX - this syntax is used in sp3
+ }
+}
+
+void AMDGPUInstPrinter::printSDWASel(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ switch (Imm) {
+ case 0: O << "BYTE_0"; break;
+ case 1: O << "BYTE_1"; break;
+ case 2: O << "BYTE_2"; break;
+ case 3: O << "BYTE_3"; break;
+ case 4: O << "WORD_0"; break;
+ case 5: O << "WORD_1"; break;
+ case 6: O << "DWORD"; break;
+ default: llvm_unreachable("Invalid SDWA data select operand");
+ }
+}
+
+void AMDGPUInstPrinter::printSDWADstSel(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << "dst_sel:";
+ printSDWASel(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printSDWASrc0Sel(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << "src0_sel:";
+ printSDWASel(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printSDWASrc1Sel(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << "src1_sel:";
+ printSDWASel(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printSDWADstUnused(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << "dst_unused:";
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ switch (Imm) {
+ case 0: O << "UNUSED_PAD"; break;
+ case 1: O << "UNUSED_SEXT"; break;
+ case 2: O << "UNUSED_PRESERVE"; break;
+ default: llvm_unreachable("Invalid SDWA dest_unused operand");
+ }
+}
+
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
unsigned Imm = MI->getOperand(OpNum).getImm();
@@ -395,9 +586,17 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O, char Asm) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ assert(Op.isImm());
+ if (Op.getImm() == 1)
+ O << Asm;
+}
+
void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- printIfSet(MI, OpNo, O, "|");
+ printIfSet(MI, OpNo, O, '|');
}
void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
@@ -424,8 +623,15 @@ void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- int32_t Imm = MI->getOperand(OpNo).getImm();
- O << Imm << '(' << BitsToFloat(Imm) << ')';
+ const MCOperand &Op = MI->getOperand(OpNo);
+ assert(Op.isImm() || Op.isExpr());
+ if (Op.isImm()) {
+ int64_t Imm = Op.getImm();
+ O << Imm << '(' << BitsToFloat(Imm) << ')';
+ }
+ if (Op.isExpr()) {
+ Op.getExpr()->print(O << '@', &MAI);
+ }
}
void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
@@ -435,7 +641,7 @@ void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- printIfSet(MI, OpNo, O, "-");
+ printIfSet(MI, OpNo, O, '-');
}
void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
@@ -456,7 +662,7 @@ void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- printIfSet(MI, OpNo, O, "+");
+ printIfSet(MI, OpNo, O, '+');
}
void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
@@ -585,43 +791,49 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- unsigned SImm16 = MI->getOperand(OpNo).getImm();
- unsigned Msg = SImm16 & 0xF;
- if (Msg == 2 || Msg == 3) {
- unsigned Op = (SImm16 >> 4) & 0xF;
- if (Msg == 3)
- O << "Gs_done(";
- else
- O << "Gs(";
- if (Op == 0) {
- O << "nop";
- } else {
- unsigned Stream = (SImm16 >> 8) & 0x3;
- if (Op == 1)
- O << "cut";
- else if (Op == 2)
- O << "emit";
- else if (Op == 3)
- O << "emit-cut";
- O << " stream " << Stream;
+ using namespace llvm::AMDGPU::SendMsg;
+
+ const unsigned SImm16 = MI->getOperand(OpNo).getImm();
+ const unsigned Id = SImm16 & ID_MASK_;
+ do {
+ if (Id == ID_INTERRUPT) {
+ if ((SImm16 & ~ID_MASK_) != 0) // Unused/unknown bits must be 0.
+ break;
+ O << "sendmsg(" << IdSymbolic[Id] << ')';
+ return;
}
- O << "), [m0] ";
- } else if (Msg == 1)
- O << "interrupt ";
- else if (Msg == 15)
- O << "system ";
- else
- O << "unknown(" << Msg << ") ";
+ if (Id == ID_GS || Id == ID_GS_DONE) {
+ if ((SImm16 & ~(ID_MASK_|OP_GS_MASK_|STREAM_ID_MASK_)) != 0) // Unused/unknown bits must be 0.
+ break;
+ const unsigned OpGs = (SImm16 & OP_GS_MASK_) >> OP_SHIFT_;
+ const unsigned StreamId = (SImm16 & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_;
+ if (OpGs == OP_GS_NOP && Id != ID_GS_DONE) // NOP to be used for GS_DONE only.
+ break;
+ if (OpGs == OP_GS_NOP && StreamId != 0) // NOP does not use/define stream id bits.
+ break;
+ O << "sendmsg(" << IdSymbolic[Id] << ", " << OpGsSymbolic[OpGs];
+ if (OpGs != OP_GS_NOP) { O << ", " << StreamId; }
+ O << ')';
+ return;
+ }
+ if (Id == ID_SYSMSG) {
+ if ((SImm16 & ~(ID_MASK_|OP_SYS_MASK_)) != 0) // Unused/unknown bits must be 0.
+ break;
+ const unsigned OpSys = (SImm16 & OP_SYS_MASK_) >> OP_SHIFT_;
+ if (!(OP_SYS_FIRST_ <= OpSys && OpSys < OP_SYS_LAST_)) // Unused/unknown.
+ break;
+ O << "sendmsg(" << IdSymbolic[Id] << ", " << OpSysSymbolic[OpSys] << ')';
+ return;
+ }
+ } while (0);
+ O << SImm16; // Unknown simm16 code.
}
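A worked decode, assuming the usual SIDefines layout (message id in bits [3:0], GS op in bits [5:4], stream id in bits [9:8]) and the symbolic tables from AMDGPUAsmUtils: SImm16 = 0x122 gives Id = 2, OpGs = 2 and StreamId = 1, printing something like sendmsg(MSG_GS, GS_OP_EMIT, 1); any set bit outside the known fields now falls through to the plain-number fallback instead of printing garbage.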
void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs
- // SIInsertWaits.cpp bits usage does not match ISA docs description but it
- // works so it might be a misprint in docs.
unsigned SImm16 = MI->getOperand(OpNo).getImm();
unsigned Vmcnt = SImm16 & 0xF;
- unsigned Expcnt = (SImm16 >> 4) & 0xF;
+ unsigned Expcnt = (SImm16 >> 4) & 0x7;
unsigned Lgkmcnt = (SImm16 >> 8) & 0xF;
bool NeedSpace = false;
@@ -638,11 +850,32 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
NeedSpace = true;
}
- if (Lgkmcnt != 0x7) {
+ if (Lgkmcnt != 0xF) {
if (NeedSpace)
O << ' ';
O << "lgkmcnt(" << Lgkmcnt << ')';
}
}
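Worked decode: SImm16 = 0xF70 gives Vmcnt = 0x0, Expcnt = 0x7 and Lgkmcnt = 0xF, so only vmcnt(0) is printed; 0x7 and 0xF are the all-ones "not waiting" defaults of the 3-bit expcnt and 4-bit lgkmcnt fields, which is exactly what the corrected masks above express.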
+void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ using namespace llvm::AMDGPU::Hwreg;
+
+ unsigned SImm16 = MI->getOperand(OpNo).getImm();
+ const unsigned Id = (SImm16 & ID_MASK_) >> ID_SHIFT_;
+ const unsigned Offset = (SImm16 & OFFSET_MASK_) >> OFFSET_SHIFT_;
+ const unsigned Width = ((SImm16 & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
+
+ O << "hwreg(";
+ if (ID_SYMBOLIC_FIRST_ <= Id && Id < ID_SYMBOLIC_LAST_) {
+ O << IdSymbolic[Id];
+ } else {
+ O << Id;
+ }
+ if (Width != WIDTH_M1_DEFAULT_ + 1 || Offset != OFFSET_DEFAULT_) {
+ O << ", " << Offset << ", " << Width;
+ }
+ O << ')';
+}
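Worked decode, assuming the usual Hwreg field layout from SIDefines (id in bits [5:0], offset in bits [10:6], width-minus-one in bits [15:11]): SImm16 = 0xF803 gives Id = 3, Offset = 0 and Width = 32; offset and width match the defaults and are suppressed, so this prints hwreg(<symbolic name for id 3>), falling back to the raw id when no symbolic name exists.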
+
#include "AMDGPUGenAsmWriter.inc"
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index 90541d86132d..f5a290f16045 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -10,8 +10,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H
-#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
@@ -33,37 +33,60 @@ public:
const MCRegisterInfo &MRI);
private:
+ void printU4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printNamedBit(const MCInst* MI, unsigned OpNo, raw_ostream& O,
+ StringRef BitName);
void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSMRDOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printDMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printUNorm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printDA(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printR128(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printLWE(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printRegOperand(unsigned RegNo, raw_ostream &O);
void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printImmediate32(uint32_t I, raw_ostream &O);
void printImmediate64(uint64_t I, raw_ostream &O);
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printDPPCtrl(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printRowMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printBankMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printBoundCtrl(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSDWADstSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSDWASrc0Sel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSDWASrc1Sel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSDWADstUnused(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
StringRef Asm, StringRef Default = "");
+ static void printIfSet(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O, char Asm);
static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
@@ -79,6 +102,7 @@ private:
static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printHwreg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt b/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt
index ce63bd553b9c..7191ff2c4577 100644
--- a/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt
+++ b/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt
@@ -1,3 +1,5 @@
add_llvm_library(LLVMAMDGPUAsmPrinter
AMDGPUInstPrinter.cpp
)
+
+add_dependencies(LLVMAMDGPUAsmPrinter LLVMAMDGPUUtils)
diff --git a/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt b/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt
index fdb43844dc63..30c2670316c8 100644
--- a/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt
+++ b/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt
@@ -19,6 +19,6 @@
type = Library
name = AMDGPUAsmPrinter
parent = AMDGPU
-required_libraries = MC Support
+required_libraries = MC Support AMDGPUUtils
add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/InstPrinter/Makefile b/lib/Target/AMDGPU/InstPrinter/Makefile
deleted file mode 100644
index 4e48ac7e28a9..000000000000
--- a/lib/Target/AMDGPU/InstPrinter/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-#===- lib/Target/R600/AsmPrinter/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAMDGPUAsmPrinter
-
-# Hack: we need to include 'main' x86 target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/LLVMBuild.txt b/lib/Target/AMDGPU/LLVMBuild.txt
index 38c5489586f1..bbdd17737cf0 100644
--- a/lib/Target/AMDGPU/LLVMBuild.txt
+++ b/lib/Target/AMDGPU/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AMDIL/LLVMBuild.txt -------------------------*- Conf -*--===;
+;===- ./lib/Target/AMDGPU/LLVMBuild.txt ------------------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
@@ -16,7 +16,7 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo Utils
+subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo Utils
[component_0]
type = TargetGroup
@@ -24,10 +24,11 @@ name = AMDGPU
parent = Target
has_asmparser = 1
has_asmprinter = 1
+has_disassembler = 1
[component_1]
type = Library
name = AMDGPUCodeGen
parent = AMDGPU
-required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmParser AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils
+required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize
add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 60e8c8f3d303..1cb9d21408c6 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -53,7 +53,8 @@ public:
const MCAsmLayout &Layout) const override {
return false;
}
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {
assert(!"Not implemented");
}
bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
@@ -73,12 +74,17 @@ void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm,
static unsigned getFixupKindNumBytes(unsigned Kind) {
switch (Kind) {
+ case FK_SecRel_1:
case FK_Data_1:
return 1;
+ case FK_SecRel_2:
case FK_Data_2:
return 2;
+ case FK_SecRel_4:
case FK_Data_4:
+ case FK_PCRel_4:
return 4;
+ case FK_SecRel_8:
case FK_Data_8:
return 8;
default:
@@ -92,32 +98,15 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
switch ((unsigned)Fixup.getKind()) {
case AMDGPU::fixup_si_sopp_br: {
+ int64_t BrImm = ((int64_t)Value - 4) / 4;
+ if (!isInt<16>(BrImm))
+ report_fatal_error("branch size exceeds simm16");
+
uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset());
- *Dst = (Value - 4) / 4;
+ *Dst = BrImm;
break;
}
- case AMDGPU::fixup_si_rodata: {
- uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset());
- // We emit constant data at the end of the text section and generate its
- // address using the following code sequence:
- // s_getpc_b64 s[0:1]
- // s_add_u32 s0, s0, $symbol
- // s_addc_u32 s1, s1, 0
- //
- // s_getpc_b64 returns the address of the s_add_u32 instruction and then
- // the fixup replaces $symbol with a literal constant, which is a
- // pc-relative offset from the encoding of the $symbol operand to the
- // constant data.
- //
- // What we want here is an offset from the start of the s_add_u32
- // instruction to the constant data, but since the encoding of $symbol
- // starts 4 bytes after the start of the add instruction, we end up
- // with an offset that is 4 bytes too small. This requires us to
- // add 4 to the fixup value before applying it.
- *Dst = Value + 4;
- break;
- }
default: {
// FIXME: Copied from AArch64
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
@@ -144,7 +133,6 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = {
// name offset bits flags
{ "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_si_rodata", 0, 32, MCFixupKindInfo::FKF_IsPCRel }
};
if (Kind < FirstTargetFixupKind)
@@ -167,13 +155,15 @@ namespace {
class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
bool Is64Bit;
+ bool HasRelocationAddend;
public:
- ELFAMDGPUAsmBackend(const Target &T, bool Is64Bit) :
- AMDGPUAsmBackend(T), Is64Bit(Is64Bit) { }
+ ELFAMDGPUAsmBackend(const Target &T, const Triple &TT) :
+ AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn),
+ HasRelocationAddend(TT.getOS() == Triple::AMDHSA) { }
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
- return createAMDGPUELFObjectWriter(Is64Bit, OS);
+ return createAMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend, OS);
}
};
@@ -182,8 +172,6 @@ public:
MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const Triple &TT, StringRef CPU) {
- Triple TargetTriple(TT);
-
// Use 64-bit ELF for amdgcn
- return new ELFAMDGPUAsmBackend(T, TargetTriple.getArch() == Triple::amdgcn);
+ return new ELFAMDGPUAsmBackend(T, TT);
}
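The overflow check added above fixes a silent-truncation hazard: the SOPP branch immediate is a signed count of 4-byte words measured past the 4-byte branch instruction itself, hence (Value - 4) / 4. A minimal standalone sketch of the same computation (not the patch's code):

    #include <cstdint>
    #include <cstdlib>

    // Mirrors the fixup math above: byte distance -> simm16 word offset.
    static int16_t encodeSoppBranch(int64_t ValueBytes) {
      int64_t BrImm = (ValueBytes - 4) / 4;
      if (BrImm < INT16_MIN || BrImm > INT16_MAX)
        std::abort(); // the patch calls report_fatal_error instead
      return static_cast<int16_t>(BrImm);
    }

    int main() {
      return encodeSoppBranch(12) == 2 ? 0 : 1; // 12 bytes ahead -> 2 words
    }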
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 820f17df8960..b4e3b8e896bd 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -18,23 +18,56 @@ namespace {
class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
public:
- AMDGPUELFObjectWriter(bool Is64Bit);
+ AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend);
protected:
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const override {
- return Fixup.getKind();
- }
-
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
};
} // End anonymous namespace
-AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit)
- : MCELFObjectTargetWriter(Is64Bit, ELF::ELFOSABI_AMDGPU_HSA,
- ELF::EM_AMDGPU, false) { }
+AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit,
+ bool HasRelocationAddend)
+ : MCELFObjectTargetWriter(Is64Bit,
+ ELF::ELFOSABI_AMDGPU_HSA,
+ ELF::EM_AMDGPU,
+ HasRelocationAddend) { }
+
+unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ // SCRATCH_RSRC_DWORD[01] is a special global variable that represents
+ // the scratch buffer.
+ if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD0")
+ return ELF::R_AMDGPU_ABS32_LO;
+ if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
+ return ELF::R_AMDGPU_ABS32_HI;
+
+ switch (Target.getAccessVariant()) {
+ default:
+ break;
+ case MCSymbolRefExpr::VK_GOTPCREL:
+ return ELF::R_AMDGPU_GOTPCREL;
+ }
+
+ switch (Fixup.getKind()) {
+ default: break;
+ case FK_PCRel_4:
+ return ELF::R_AMDGPU_REL32;
+ case FK_SecRel_4:
+ return ELF::R_AMDGPU_ABS32;
+ }
+
+ llvm_unreachable("unhandled relocation type");
+}
+
-MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit, raw_pwrite_stream &OS) {
- MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter(Is64Bit);
+MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit,
+ bool HasRelocationAddend,
+ raw_pwrite_stream &OS) {
+ MCELFObjectTargetWriter *MOTW =
+ new AMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend);
return createELFObjectWriter(MOTW, OS, true);
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
index 9ff9fe794d2b..43338a5bebd2 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -12,11 +12,6 @@
using namespace llvm;
-void AMDGPUELFStreamer::InitSections(bool NoExecStack) {
- // Start with the .hsatext section by default.
- SwitchSection(AMDGPU::getHSATextSection(getContext()));
-}
-
MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context,
MCAsmBackend &MAB,
raw_pwrite_stream &OS,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
index 488d7e74d741..5319b65d65f9 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -1,4 +1,4 @@
-//===-------- AMDGPUELFStreamer.h - ELF Object Output ---------------------===//
+//===-------- AMDGPUELFStreamer.h - ELF Object Output -----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -29,7 +29,6 @@ public:
MCCodeEmitter *Emitter)
: MCELFStreamer(Context, MAB, OS, Emitter) { }
- virtual void InitSections(bool NoExecStac) override;
};
MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
index 59a9178082f6..20c1adfbc6b9 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H
-#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUFIXUPKINDS_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUFIXUPKINDS_H
#include "llvm/MC/MCFixup.h"
@@ -18,9 +18,6 @@ enum Fixups {
/// 16-bit PC relative fixup for SOPP branch instructions.
fixup_si_sopp_br = FirstTargetFixupKind,
- /// fixup for global addresses with constant initializers
- fixup_si_rodata,
-
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 4bc80a028936..1655591abf39 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -9,12 +9,15 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
using namespace llvm;
+
AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
HasSingleParameterDotFile = false;
//===------------------------------------------------------------------===//
- MaxInstLength = 16;
+ MinInstAlignment = 4;
+ MaxInstLength = (TT.getArch() == Triple::amdgcn) ? 8 : 16;
SeparatorString = "\n";
CommentString = ";";
PrivateLabelPrefix = "";
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
index a546961705d7..8cb33a3179cd 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H
-#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCASMINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCASMINFO_H
#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index c95742762233..c942ea904085 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
-#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index f70409470276..a0d9aab114fc 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -18,7 +18,6 @@
#include "AMDGPUTargetStreamer.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
#include "SIDefines.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -56,15 +55,6 @@ createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS);
}
-static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(const Triple &TT,
- Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
-}
-
static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T,
unsigned SyntaxVariant,
const MCAsmInfo &MAI,
@@ -99,7 +89,6 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() {
for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) {
RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T);
- TargetRegistry::RegisterMCCodeGenInfo(*T, createAMDGPUMCCodeGenInfo);
TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo);
TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo);
TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo);
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 5d1b86b8c0c2..9ab7940812ba 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -13,13 +13,13 @@
//===----------------------------------------------------------------------===//
//
-#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H
-#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H
#include "llvm/Support/DataTypes.h"
-#include "llvm/ADT/StringRef.h"
namespace llvm {
+class StringRef;
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
@@ -47,6 +47,7 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI,
const Triple &TT, StringRef CPU);
MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit,
+ bool HasRelocationAddend,
raw_pwrite_stream &OS);
} // End llvm namespace
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index b91134d2ee9b..83dcaacb738f 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -312,10 +312,6 @@ AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
MCStreamer &OS = getStreamer();
OS.PushSection();
- // The MCObjectFileInfo that is available to the assembler is a generic
- // implementation and not AMDGPUHSATargetObjectFile, so we can't use
- // MCObjectFileInfo::getTextSection() here for fetching the HSATextSection.
- OS.SwitchSection(AMDGPU::getHSATextSection(OS.getContext()));
OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header)));
OS.PopSection();
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 83bb728f541c..b3d59e8f396e 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -7,16 +7,16 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
-#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
#include "AMDKernelCodeT.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Debug.h"
+
namespace llvm {
class MCELFStreamer;
+class MCSymbol;
class AMDGPUTargetStreamer : public MCTargetStreamer {
public:
diff --git a/lib/Target/AMDGPU/MCTargetDesc/Makefile b/lib/Target/AMDGPU/MCTargetDesc/Makefile
deleted file mode 100644
index 5ad68662d98c..000000000000
--- a/lib/Target/AMDGPU/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/AMDGPU/TargetDesc/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAMDGPUDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index 3c1142dd664b..5e8e6ceb7ca2 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
#include "R600Defines.h"
+#include "MCTargetDesc/AMDGPUFixupKinds.h"
#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/MC/MCCodeEmitter.h"
@@ -51,12 +52,9 @@ public:
const MCSubtargetInfo &STI) const override;
private:
- void EmitByte(unsigned int byte, raw_ostream &OS) const;
-
void Emit(uint32_t value, raw_ostream &OS) const;
void Emit(uint64_t value, raw_ostream &OS) const;
- unsigned getHWRegChan(unsigned reg) const;
unsigned getHWReg(unsigned regNo) const;
};
@@ -142,10 +140,6 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
}
-void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const {
- OS.write((uint8_t) Byte & 0xff);
-}
-
void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
support::endian::Writer<support::little>(OS).write(Value);
}
@@ -154,17 +148,13 @@ void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
support::endian::Writer<support::little>(OS).write(Value);
}
-unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const {
- return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT;
-}
-
unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
return MRI.getEncodingValue(RegNo) & HW_REG_MASK;
}
uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixup,
+ SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
if (MO.isReg()) {
if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags))
@@ -172,6 +162,18 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
return getHWReg(MO.getReg());
}
+ if (MO.isExpr()) {
+ // We put rodata at the end of the code section, then map the entire
+ // code section as a vtx buffer. Thus the section-relative address is the
+ // correct one.
+ // Each R600 literal instruction has two operands. We can't easily get
+ // the order of the current one, so compare against the first one and
+ // adjust the offset.
+ const unsigned offset = (&MO == &MI.getOperand(0)) ? 0 : 4;
+ Fixups.push_back(MCFixup::create(offset, MO.getExpr(), FK_SecRel_4, MI.getLoc()));
+ return 0;
+ }
+
assert(MO.isImm());
return MO.getImm();
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 9eb3dadbc5e2..71b585c25ac5 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -162,20 +162,30 @@ static uint32_t getLit64Encoding(uint64_t Val) {
uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
unsigned OpSize) const {
- if (MO.isExpr())
- return 255;
- assert(!MO.isFPImm());
+ int64_t Imm;
+ if (MO.isExpr()) {
+ const MCConstantExpr *C = dyn_cast<MCConstantExpr>(MO.getExpr());
+ if (!C)
+ return 255;
+
+ Imm = C->getValue();
+ } else {
- if (!MO.isImm())
- return ~0;
+ assert(!MO.isFPImm());
+
+ if (!MO.isImm())
+ return ~0;
+
+ Imm = MO.getImm();
+ }
if (OpSize == 4)
- return getLit32Encoding(static_cast<uint32_t>(MO.getImm()));
+ return getLit32Encoding(static_cast<uint32_t>(Imm));
assert(OpSize == 8);
- return getLit64Encoding(static_cast<uint64_t>(MO.getImm()));
+ return getLit64Encoding(static_cast<uint64_t>(Imm));
}
void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -213,7 +223,11 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (Op.isImm())
Imm = Op.getImm();
- else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
+ else if (Op.isExpr()) {
+ if (const MCConstantExpr *C = dyn_cast<MCConstantExpr>(Op.getExpr()))
+ Imm = C->getValue();
+
+ } else // Neither an immediate nor an expression.
llvm_unreachable("Must be immediate or expr");
for (unsigned j = 0; j < 4; j++) {
@@ -247,10 +261,14 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
if (MO.isReg())
return MRI.getEncodingValue(MO.getReg());
- if (MO.isExpr()) {
- const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr());
- MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_rodata;
- Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc()));
+ if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) {
+ const MCSymbolRefExpr *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr());
+ MCFixupKind Kind;
+ if (Expr && Expr->getSymbol().isExternal())
+ Kind = FK_Data_4;
+ else
+ Kind = FK_PCRel_4;
+ Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc()));
}
// Figure out the operand number, needed for isSrcOperand check
diff --git a/lib/Target/AMDGPU/Makefile b/lib/Target/AMDGPU/Makefile
deleted file mode 100644
index 219f34daa24f..000000000000
--- a/lib/Target/AMDGPU/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-##===- lib/Target/R600/Makefile ---------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMAMDGPUCodeGen
-TARGET = AMDGPU
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \
- AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \
- AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \
- AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \
- AMDGPUGenAsmWriter.inc AMDGPUGenAsmMatcher.inc
-
-DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc Utils
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td
index 4300d972d46b..f5f1eb14e993 100644
--- a/lib/Target/AMDGPU/Processors.td
+++ b/lib/Target/AMDGPU/Processors.td
@@ -13,11 +13,8 @@ class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Featur
//===----------------------------------------------------------------------===//
// R600
//===----------------------------------------------------------------------===//
-def : Proc<"", R600_VLIW5_Itin,
- [FeatureR600, FeatureVertexCache]>;
-
def : Proc<"r600", R600_VLIW5_Itin,
- [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>;
+ [FeatureR600, FeatureVertexCache, FeatureWavefrontSize64]>;
def : Proc<"r630", R600_VLIW5_Itin,
[FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>;
@@ -84,11 +81,11 @@ def : Proc<"cayman", R600_VLIW4_Itin,
//===----------------------------------------------------------------------===//
def : ProcessorModel<"SI", SIFullSpeedModel,
- [FeatureSouthernIslands, FeatureFastFMAF32]
+ [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops]
>;
-def : ProcessorModel<"tahiti", SIFullSpeedModel,
- [FeatureSouthernIslands, FeatureFastFMAF32]
+def : ProcessorModel<"tahiti", SIFullSpeedModel,
+ [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops]
>;
def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
@@ -116,8 +113,8 @@ def : ProcessorModel<"kaveri", SIQuarterSpeedModel,
>;
def : ProcessorModel<"hawaii", SIFullSpeedModel,
- [FeatureSeaIslands, FeatureFastFMAF32, FeatureLDSBankCount32,
- FeatureISAVersion7_0_1]
+ [FeatureSeaIslands, FeatureFastFMAF32, HalfRate64Ops,
+ FeatureLDSBankCount32, FeatureISAVersion7_0_1]
>;
def : ProcessorModel<"mullins", SIQuarterSpeedModel,
@@ -148,3 +145,11 @@ def : ProcessorModel<"fiji", SIQuarterSpeedModel,
def : ProcessorModel<"stoney", SIQuarterSpeedModel,
[FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount16]
>;
+
+def : ProcessorModel<"polaris10", SIQuarterSpeedModel,
+ [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32]
+>;
+
+def : ProcessorModel<"polaris11", SIQuarterSpeedModel,
+ [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32]
+>;
diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index 3cb90218a7d5..3ccde79e2df4 100644
--- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -31,8 +31,8 @@ using namespace llvm;
namespace {
-static bool isCFAlu(const MachineInstr *MI) {
- switch (MI->getOpcode()) {
+static bool isCFAlu(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
case AMDGPU::CF_ALU:
case AMDGPU::CF_ALU_PUSH_BEFORE:
return true;
@@ -47,19 +47,19 @@ private:
static char ID;
const R600InstrInfo *TII;
- unsigned getCFAluSize(const MachineInstr *MI) const;
- bool isCFAluEnabled(const MachineInstr *MI) const;
+ unsigned getCFAluSize(const MachineInstr &MI) const;
+ bool isCFAluEnabled(const MachineInstr &MI) const;
/// The IfCvt pass can generate "disabled" ALU clause markers that need to be
/// removed, with their contents folded into the previous ALU clause.
/// This function parses the instructions following CFAlu until it finds a
/// disabled CFAlu (merging its contents) or an enabled CFAlu.
- void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const;
+ void cleanPotentialDisabledCFAlu(MachineInstr &CFAlu) const;
/// Check whether LatrCFAlu can be merged into RootCFAlu and do so if
/// possible.
- bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu)
- const;
+ bool mergeIfPossible(MachineInstr &RootCFAlu,
+ const MachineInstr &LatrCFAlu) const;
public:
R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { }
@@ -71,38 +71,40 @@ public:
char R600ClauseMergePass::ID = 0;
-unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const {
+unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr &MI) const {
assert(isCFAlu(MI));
- return MI->getOperand(
- TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm();
+ return MI
+ .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::COUNT))
+ .getImm();
}
-bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const {
+bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr &MI) const {
assert(isCFAlu(MI));
- return MI->getOperand(
- TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm();
+ return MI
+ .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::Enabled))
+ .getImm();
}
-void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu)
- const {
+void R600ClauseMergePass::cleanPotentialDisabledCFAlu(
+ MachineInstr &CFAlu) const {
int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
- MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end();
+ MachineBasicBlock::iterator I = CFAlu, E = CFAlu.getParent()->end();
I++;
do {
- while (I!= E && !isCFAlu(I))
+ while (I != E && !isCFAlu(*I))
I++;
if (I == E)
return;
- MachineInstr *MI = I++;
+ MachineInstr &MI = *I++;
if (isCFAluEnabled(MI))
break;
- CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI));
- MI->eraseFromParent();
+ CFAlu.getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI));
+ MI.eraseFromParent();
} while (I != E);
}
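
For reference, a minimal standalone model of the count-folding logic above, with a hypothetical Clause struct standing in for the MachineInstr API (the real pass also skips non-CFAlu instructions while scanning):

    #include <cassert>
    #include <list>

    struct Clause { bool Enabled; unsigned Count; };

    // Fold the ALU counts of trailing *disabled* clauses into Root and delete
    // them, stopping at the first enabled clause.
    static void foldDisabled(std::list<Clause> &Clauses,
                             std::list<Clause>::iterator Root) {
      for (auto I = std::next(Root); I != Clauses.end();) {
        if (I->Enabled)
          break;                  // an enabled CF_ALU ends the scan
        Root->Count += I->Count;  // absorb the disabled clause's COUNT
        I = Clauses.erase(I);     // mirrors MI.eraseFromParent()
      }
    }

    int main() {
      std::list<Clause> C{{true, 3}, {false, 2}, {false, 1}, {true, 4}};
      foldDisabled(C, C.begin());
      assert(C.size() == 2 && C.front().Count == 6);
    }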
-bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu,
- const MachineInstr *LatrCFAlu) const {
+bool R600ClauseMergePass::mergeIfPossible(MachineInstr &RootCFAlu,
+ const MachineInstr &LatrCFAlu) const {
assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu));
int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
unsigned RootInstCount = getCFAluSize(RootCFAlu),
@@ -112,7 +114,7 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu,
DEBUG(dbgs() << "Excess inst counts\n");
return false;
}
- if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+ if (RootCFAlu.getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
return false;
   // Is KCache Bank 0 compatible?
int Mode0Idx =
@@ -121,12 +123,12 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu,
TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0);
int KBank0LineIdx =
TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0);
- if (LatrCFAlu->getOperand(Mode0Idx).getImm() &&
- RootCFAlu->getOperand(Mode0Idx).getImm() &&
- (LatrCFAlu->getOperand(KBank0Idx).getImm() !=
- RootCFAlu->getOperand(KBank0Idx).getImm() ||
- LatrCFAlu->getOperand(KBank0LineIdx).getImm() !=
- RootCFAlu->getOperand(KBank0LineIdx).getImm())) {
+ if (LatrCFAlu.getOperand(Mode0Idx).getImm() &&
+ RootCFAlu.getOperand(Mode0Idx).getImm() &&
+ (LatrCFAlu.getOperand(KBank0Idx).getImm() !=
+ RootCFAlu.getOperand(KBank0Idx).getImm() ||
+ LatrCFAlu.getOperand(KBank0LineIdx).getImm() !=
+ RootCFAlu.getOperand(KBank0LineIdx).getImm())) {
DEBUG(dbgs() << "Wrong KC0\n");
return false;
}
@@ -137,56 +139,61 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu,
TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1);
int KBank1LineIdx =
TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1);
- if (LatrCFAlu->getOperand(Mode1Idx).getImm() &&
- RootCFAlu->getOperand(Mode1Idx).getImm() &&
- (LatrCFAlu->getOperand(KBank1Idx).getImm() !=
- RootCFAlu->getOperand(KBank1Idx).getImm() ||
- LatrCFAlu->getOperand(KBank1LineIdx).getImm() !=
- RootCFAlu->getOperand(KBank1LineIdx).getImm())) {
+ if (LatrCFAlu.getOperand(Mode1Idx).getImm() &&
+ RootCFAlu.getOperand(Mode1Idx).getImm() &&
+ (LatrCFAlu.getOperand(KBank1Idx).getImm() !=
+ RootCFAlu.getOperand(KBank1Idx).getImm() ||
+ LatrCFAlu.getOperand(KBank1LineIdx).getImm() !=
+ RootCFAlu.getOperand(KBank1LineIdx).getImm())) {
     DEBUG(dbgs() << "Wrong KC1\n");
return false;
}
- if (LatrCFAlu->getOperand(Mode0Idx).getImm()) {
- RootCFAlu->getOperand(Mode0Idx).setImm(
- LatrCFAlu->getOperand(Mode0Idx).getImm());
- RootCFAlu->getOperand(KBank0Idx).setImm(
- LatrCFAlu->getOperand(KBank0Idx).getImm());
- RootCFAlu->getOperand(KBank0LineIdx).setImm(
- LatrCFAlu->getOperand(KBank0LineIdx).getImm());
+ if (LatrCFAlu.getOperand(Mode0Idx).getImm()) {
+ RootCFAlu.getOperand(Mode0Idx).setImm(
+ LatrCFAlu.getOperand(Mode0Idx).getImm());
+ RootCFAlu.getOperand(KBank0Idx).setImm(
+ LatrCFAlu.getOperand(KBank0Idx).getImm());
+ RootCFAlu.getOperand(KBank0LineIdx)
+ .setImm(LatrCFAlu.getOperand(KBank0LineIdx).getImm());
}
- if (LatrCFAlu->getOperand(Mode1Idx).getImm()) {
- RootCFAlu->getOperand(Mode1Idx).setImm(
- LatrCFAlu->getOperand(Mode1Idx).getImm());
- RootCFAlu->getOperand(KBank1Idx).setImm(
- LatrCFAlu->getOperand(KBank1Idx).getImm());
- RootCFAlu->getOperand(KBank1LineIdx).setImm(
- LatrCFAlu->getOperand(KBank1LineIdx).getImm());
+ if (LatrCFAlu.getOperand(Mode1Idx).getImm()) {
+ RootCFAlu.getOperand(Mode1Idx).setImm(
+ LatrCFAlu.getOperand(Mode1Idx).getImm());
+ RootCFAlu.getOperand(KBank1Idx).setImm(
+ LatrCFAlu.getOperand(KBank1Idx).getImm());
+ RootCFAlu.getOperand(KBank1LineIdx)
+ .setImm(LatrCFAlu.getOperand(KBank1LineIdx).getImm());
}
- RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts);
- RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode()));
+ RootCFAlu.getOperand(CntIdx).setImm(CumuledInsts);
+ RootCFAlu.setDesc(TII->get(LatrCFAlu.getOpcode()));
return true;
}
bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+ TII = ST.getInstrInfo();
+
for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
BB != BB_E; ++BB) {
MachineBasicBlock &MBB = *BB;
MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
MachineBasicBlock::iterator LatestCFAlu = E;
while (I != E) {
- MachineInstr *MI = I++;
+ MachineInstr &MI = *I++;
if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) ||
- TII->mustBeLastInClause(MI->getOpcode()))
+ TII->mustBeLastInClause(MI.getOpcode()))
LatestCFAlu = E;
if (!isCFAlu(MI))
continue;
cleanPotentialDisabledCFAlu(MI);
- if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) {
- MI->eraseFromParent();
+ if (LatestCFAlu != E && mergeIfPossible(*LatestCFAlu, MI)) {
+ MI.eraseFromParent();
} else {
- assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled");
+ assert(MI.getOperand(8).getImm() && "CF ALU instruction disabled");
LatestCFAlu = MI;
}
}
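
Several hunks in this file follow the same mechanical change: fetch the concrete R600Subtarget via the templated getSubtarget<T>() and rely on covariant return types instead of a static_cast. A standalone sketch of that idiom, with hypothetical *Model types standing in for the LLVM classes:

    #include <cassert>

    struct TargetInstrInfoModel { virtual ~TargetInstrInfoModel() = default; };
    struct R600InstrInfoModel : TargetInstrInfoModel {};

    struct SubtargetModel {
      virtual ~SubtargetModel() = default;
      virtual const TargetInstrInfoModel *getInstrInfo() const = 0;
    };

    struct R600SubtargetModel : SubtargetModel {
      R600InstrInfoModel TII;
      // Covariant override: returns the derived type directly.
      const R600InstrInfoModel *getInstrInfo() const override { return &TII; }
    };

    int main() {
      R600SubtargetModel ST;
      const R600InstrInfoModel *TII = ST.getInstrInfo(); // no cast needed
      assert(TII != nullptr);
    }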
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index bd80bb211b4f..d5bda4a8303e 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -39,16 +39,16 @@ struct CFStack {
FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
};
- const AMDGPUSubtarget *ST;
+ const R600Subtarget *ST;
std::vector<StackItem> BranchStack;
std::vector<StackItem> LoopStack;
unsigned MaxStackSize;
unsigned CurrentEntries;
unsigned CurrentSubEntries;
- CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st),
+ CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st),
// We need to reserve a stack entry for CALL_FS in vertex shaders.
- MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
+ MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0),
CurrentEntries(0), CurrentSubEntries(0) { }
unsigned getLoopDepth();
@@ -119,7 +119,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
return 0;
case CFStack::FIRST_NON_WQM_PUSH:
assert(!ST->hasCaymanISA());
- if (ST->getGeneration() <= AMDGPUSubtarget::R700) {
+ if (ST->getGeneration() <= R600Subtarget::R700) {
// +1 For the push operation.
// +2 Extra space required.
return 3;
@@ -132,7 +132,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
return 2;
}
case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
- assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+ assert(ST->getGeneration() >= R600Subtarget::EVERGREEN);
// +1 For the push operation.
// +1 Extra space required.
return 2;
@@ -142,8 +142,8 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
}
void CFStack::updateMaxStackSize() {
- unsigned CurrentStackSize = CurrentEntries +
- (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
+ unsigned CurrentStackSize =
+ CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4);
MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}
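
RoundUpToAlignment was renamed alignTo; a small self-contained check of the stack-size arithmetic above, with a local alignTo assumed to match the llvm::alignTo contract (round up to a multiple of the alignment):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    static uint64_t alignTo(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) / Align * Align;
    }

    int main() {
      // 5 sub-entries round up to 8, i.e. two full stack entries.
      unsigned CurrentEntries = 2, CurrentSubEntries = 5, MaxStackSize = 0;
      unsigned CurrentStackSize =
          CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4);
      MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
      assert(MaxStackSize == 4);
    }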
@@ -159,7 +159,7 @@ void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
// See comment in
// CFStack::getSubEntrySize()
else if (CurrentEntries > 0 &&
- ST->getGeneration() > AMDGPUSubtarget::EVERGREEN &&
+ ST->getGeneration() > R600Subtarget::EVERGREEN &&
!ST->hasCaymanISA() &&
!branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
@@ -220,10 +220,10 @@ private:
const R600InstrInfo *TII;
const R600RegisterInfo *TRI;
unsigned MaxFetchInst;
- const AMDGPUSubtarget *ST;
+ const R600Subtarget *ST;
- bool IsTrivialInst(MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+ bool IsTrivialInst(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
case AMDGPU::KILL:
case AMDGPU::RETURN:
return true;
@@ -234,7 +234,7 @@ private:
const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
unsigned Opcode = 0;
- bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+ bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN);
switch (CFI) {
case CF_TC:
Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
@@ -278,11 +278,12 @@ private:
return TII->get(Opcode);
}
- bool isCompatibleWithClause(const MachineInstr *MI,
- std::set<unsigned> &DstRegs) const {
+ bool isCompatibleWithClause(const MachineInstr &MI,
+ std::set<unsigned> &DstRegs) const {
unsigned DstMI, SrcMI;
- for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
- E = MI->operands_end(); I != E; ++I) {
+ for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
+ E = MI.operands_end();
+ I != E; ++I) {
const MachineOperand &MO = *I;
if (!MO.isReg())
continue;
@@ -318,20 +319,20 @@ private:
MachineBasicBlock::iterator ClauseHead = I;
std::vector<MachineInstr *> ClauseContent;
unsigned AluInstCount = 0;
- bool IsTex = TII->usesTextureCache(ClauseHead);
+ bool IsTex = TII->usesTextureCache(*ClauseHead);
std::set<unsigned> DstRegs;
for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
- if (IsTrivialInst(I))
+ if (IsTrivialInst(*I))
continue;
if (AluInstCount >= MaxFetchInst)
break;
- if ((IsTex && !TII->usesTextureCache(I)) ||
- (!IsTex && !TII->usesVertexCache(I)))
+ if ((IsTex && !TII->usesTextureCache(*I)) ||
+ (!IsTex && !TII->usesVertexCache(*I)))
break;
- if (!isCompatibleWithClause(I, DstRegs))
+ if (!isCompatibleWithClause(*I, DstRegs))
break;
AluInstCount ++;
- ClauseContent.push_back(I);
+ ClauseContent.push_back(&*I);
}
MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
getHWInstrDesc(IsTex?CF_TC:CF_VC))
@@ -340,28 +341,37 @@ private:
return ClauseFile(MIb, std::move(ClauseContent));
}
- void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
+ void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
static const unsigned LiteralRegs[] = {
AMDGPU::ALU_LITERAL_X,
AMDGPU::ALU_LITERAL_Y,
AMDGPU::ALU_LITERAL_Z,
AMDGPU::ALU_LITERAL_W
};
- const SmallVector<std::pair<MachineOperand *, int64_t>, 3 > Srcs =
+ const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
TII->getSrcs(MI);
- for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
- if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
+    for (const auto &Src : Srcs) {
+ if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
continue;
- int64_t Imm = Srcs[i].second;
- std::vector<int64_t>::iterator It =
- std::find(Lits.begin(), Lits.end(), Imm);
+ int64_t Imm = Src.second;
+      std::vector<MachineOperand *>::iterator It =
+          std::find_if(Lits.begin(), Lits.end(), [&](MachineOperand *val) {
+            return val->isImm() && (val->getImm() == Imm);
+          });
+
+ // Get corresponding Operand
+ MachineOperand &Operand = MI.getOperand(
+ TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));
+
if (It != Lits.end()) {
+ // Reuse existing literal reg
unsigned Index = It - Lits.begin();
- Srcs[i].first->setReg(LiteralRegs[Index]);
+ Src.first->setReg(LiteralRegs[Index]);
} else {
+ // Allocate new literal reg
assert(Lits.size() < 4 && "Too many literals in Instruction Group");
- Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
- Lits.push_back(Imm);
+ Src.first->setReg(LiteralRegs[Lits.size()]);
+ Lits.push_back(&Operand);
}
}
}
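
getLiteral() now deduplicates by immediate value before handing out the four per-group literal channels; a simplified standalone model of the assignment policy, using plain integers instead of MachineOperand:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    enum Chan { LIT_X, LIT_Y, LIT_Z, LIT_W };

    // Return the literal channel for Imm, reusing an existing slot when the
    // same value already occupies one (at most four per instruction group).
    static Chan assignLiteral(std::vector<int64_t> &Lits, int64_t Imm) {
      auto It = std::find(Lits.begin(), Lits.end(), Imm);
      if (It != Lits.end())
        return static_cast<Chan>(It - Lits.begin()); // reuse existing literal
      assert(Lits.size() < 4 && "Too many literals in Instruction Group");
      Lits.push_back(Imm);
      return static_cast<Chan>(Lits.size() - 1);     // allocate new channel
    }

    int main() {
      std::vector<int64_t> Lits;
      assert(assignLiteral(Lits, 42) == LIT_X);
      assert(assignLiteral(Lits, 7) == LIT_Y);
      assert(assignLiteral(Lits, 42) == LIT_X); // deduplicated
    }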
@@ -384,56 +394,66 @@ private:
ClauseFile
MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
const {
- MachineBasicBlock::iterator ClauseHead = I;
+ MachineInstr &ClauseHead = *I;
std::vector<MachineInstr *> ClauseContent;
I++;
for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
- if (IsTrivialInst(I)) {
+ if (IsTrivialInst(*I)) {
++I;
continue;
}
if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
break;
- std::vector<int64_t> Literals;
+      std::vector<MachineOperand *> Literals;
if (I->isBundle()) {
- MachineInstr *DeleteMI = I;
+ MachineInstr &DeleteMI = *I;
MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
while (++BI != E && BI->isBundledWithPred()) {
BI->unbundleFromPred();
- for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = BI->getOperand(i);
+ for (MachineOperand &MO : BI->operands()) {
if (MO.isReg() && MO.isInternalRead())
MO.setIsInternalRead(false);
}
- getLiteral(&*BI, Literals);
+ getLiteral(*BI, Literals);
ClauseContent.push_back(&*BI);
}
I = BI;
- DeleteMI->eraseFromParent();
+ DeleteMI.eraseFromParent();
} else {
- getLiteral(I, Literals);
- ClauseContent.push_back(I);
+ getLiteral(*I, Literals);
+ ClauseContent.push_back(&*I);
I++;
}
- for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
- unsigned literal0 = Literals[i];
- unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0;
- MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
- TII->get(AMDGPU::LITERALS))
- .addImm(literal0)
- .addImm(literal2);
+ for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
+ MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
+ TII->get(AMDGPU::LITERALS));
+ if (Literals[i]->isImm()) {
+ MILit.addImm(Literals[i]->getImm());
+ } else {
+ MILit.addGlobalAddress(Literals[i]->getGlobal(),
+ Literals[i]->getOffset());
+ }
+ if (i + 1 < e) {
+ if (Literals[i + 1]->isImm()) {
+ MILit.addImm(Literals[i + 1]->getImm());
+ } else {
+ MILit.addGlobalAddress(Literals[i + 1]->getGlobal(),
+ Literals[i + 1]->getOffset());
+ }
+ } else
+ MILit.addImm(0);
ClauseContent.push_back(MILit);
}
}
assert(ClauseContent.size() < 128 && "ALU clause is too big");
- ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
- return ClauseFile(ClauseHead, std::move(ClauseContent));
+ ClauseHead.getOperand(7).setImm(ClauseContent.size() - 1);
+ return ClauseFile(&ClauseHead, std::move(ClauseContent));
}
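
The loop above packs literals two per LITERALS pseudo-instruction, padding an odd tail with 0; a simplified sketch of the pairing, with a hypothetical Operand type standing in for MachineOperand (either an immediate or a global-address reference):

    #include <cstdint>
    #include <vector>

    struct Operand { bool IsImm; int64_t Imm; /* or a global-address ref */ };
    struct LiteralsInst { Operand Lo, Hi; };

    // Emit literals two at a time; the second slot of an odd tail is 0.
    static std::vector<LiteralsInst>
    packLiterals(const std::vector<Operand> &Lits) {
      std::vector<LiteralsInst> Out;
      for (size_t i = 0; i < Lits.size(); i += 2) {
        Operand Hi = (i + 1 < Lits.size()) ? Lits[i + 1] : Operand{true, 0};
        Out.push_back({Lits[i], Hi});
      }
      return Out;
    }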
void
EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
unsigned &CfCount) {
- CounterPropagateAddr(Clause.first, CfCount);
+ CounterPropagateAddr(*Clause.first, CfCount);
MachineBasicBlock *BB = Clause.first->getParent();
BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
.addImm(CfCount);
@@ -447,7 +467,7 @@ private:
EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
unsigned &CfCount) {
Clause.first->getOperand(0).setImm(0);
- CounterPropagateAddr(Clause.first, CfCount);
+ CounterPropagateAddr(*Clause.first, CfCount);
MachineBasicBlock *BB = Clause.first->getParent();
BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
.addImm(CfCount);
@@ -457,13 +477,13 @@ private:
CfCount += Clause.second.size();
}
- void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
- MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
+ void CounterPropagateAddr(MachineInstr &MI, unsigned Addr) const {
+ MI.getOperand(0).setImm(Addr + MI.getOperand(0).getImm());
}
void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
unsigned Addr) const {
for (MachineInstr *MI : MIs) {
- CounterPropagateAddr(MI, Addr);
+ CounterPropagateAddr(*MI, Addr);
}
}
@@ -472,20 +492,21 @@ public:
: MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {}
bool runOnMachineFunction(MachineFunction &MF) override {
- ST = &MF.getSubtarget<AMDGPUSubtarget>();
+ ST = &MF.getSubtarget<R600Subtarget>();
MaxFetchInst = ST->getTexVTXClauseSize();
- TII = static_cast<const R600InstrInfo *>(ST->getInstrInfo());
- TRI = static_cast<const R600RegisterInfo *>(ST->getRegisterInfo());
+ TII = ST->getInstrInfo();
+ TRI = ST->getRegisterInfo();
+
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
- CFStack CFStack(ST, MFI->getShaderType());
+ CFStack CFStack(ST, MF.getFunction()->getCallingConv());
for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
++MB) {
MachineBasicBlock &MBB = *MB;
unsigned CfCount = 0;
std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
std::vector<MachineInstr * > IfThenElseStack;
- if (MFI->getShaderType() == ShaderType::VERTEX) {
+ if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
getHWInstrDesc(CF_CALL_FS));
CfCount++;
@@ -493,10 +514,10 @@ public:
std::vector<ClauseFile> FetchClauses, AluClauses;
std::vector<MachineInstr *> LastAlu(1);
std::vector<MachineInstr *> ToPopAfter;
-
+
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E;) {
- if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
+ if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
DEBUG(dbgs() << CfCount << ":"; I->dump(););
FetchClauses.push_back(MakeFetchClause(MBB, I));
CfCount++;
@@ -508,7 +529,7 @@ public:
if (MI->getOpcode() != AMDGPU::ENDIF)
LastAlu.back() = nullptr;
if (MI->getOpcode() == AMDGPU::CF_ALU)
- LastAlu.back() = MI;
+ LastAlu.back() = &*MI;
I++;
bool RequiresWorkAround =
CFStack.requiresWorkAroundForInst(MI->getOpcode());
@@ -571,7 +592,7 @@ public:
case AMDGPU::ELSE: {
MachineInstr * JumpInst = IfThenElseStack.back();
IfThenElseStack.pop_back();
- CounterPropagateAddr(JumpInst, CfCount);
+ CounterPropagateAddr(*JumpInst, CfCount);
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_ELSE))
.addImm(0)
@@ -595,10 +616,10 @@ public:
DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
CfCount++;
}
-
+
MachineInstr *IfOrElseInst = IfThenElseStack.back();
IfThenElseStack.pop_back();
- CounterPropagateAddr(IfOrElseInst, CfCount);
+ CounterPropagateAddr(*IfOrElseInst, CfCount);
IfOrElseInst->getOperand(1).setImm(1);
LastAlu.pop_back();
MI->eraseFromParent();
@@ -625,15 +646,16 @@ public:
case AMDGPU::RETURN: {
BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
CfCount++;
- MI->eraseFromParent();
if (CfCount % 2) {
BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
CfCount++;
}
+ MI->eraseFromParent();
for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
EmitFetchClause(I, FetchClauses[i], CfCount);
for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
EmitALUClause(I, AluClauses[i], CfCount);
+ break;
}
default:
if (TII->isExport(MI->getOpcode())) {
diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h
index 51d87eda31d1..534461adc59f 100644
--- a/lib/Target/AMDGPU/R600Defines.h
+++ b/lib/Target/AMDGPU/R600Defines.h
@@ -8,8 +8,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H
-#define LLVM_LIB_TARGET_R600_R600DEFINES_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600DEFINES_H
+#define LLVM_LIB_TARGET_AMDGPU_R600DEFINES_H
#include "llvm/MC/MCRegisterInfo.h"
diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index fdc20302f4a3..93ed5be94a54 100644
--- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -38,8 +38,8 @@ private:
const R600InstrInfo *TII;
int Address;
- unsigned OccupiedDwords(MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+ unsigned OccupiedDwords(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
case AMDGPU::INTERP_PAIR_XY:
case AMDGPU::INTERP_PAIR_ZW:
case AMDGPU::INTERP_VEC_LOAD:
@@ -53,17 +53,17 @@ private:
// These will be expanded to two ALU instructions in the
// ExpandSpecialInstructions pass.
- if (TII->isLDSRetInstr(MI->getOpcode()))
+ if (TII->isLDSRetInstr(MI.getOpcode()))
return 2;
- if(TII->isVector(*MI) ||
- TII->isCubeOp(MI->getOpcode()) ||
- TII->isReductionOp(MI->getOpcode()))
+ if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode()) ||
+ TII->isReductionOp(MI.getOpcode()))
return 4;
unsigned NumLiteral = 0;
- for (MachineInstr::mop_iterator It = MI->operands_begin(),
- E = MI->operands_end(); It != E; ++It) {
+ for (MachineInstr::mop_iterator It = MI.operands_begin(),
+ E = MI.operands_end();
+ It != E; ++It) {
MachineOperand &MO = *It;
if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
++NumLiteral;
@@ -71,12 +71,12 @@ private:
return 1 + NumLiteral;
}
- bool isALU(const MachineInstr *MI) const {
- if (TII->isALUInstr(MI->getOpcode()))
+ bool isALU(const MachineInstr &MI) const {
+ if (TII->isALUInstr(MI.getOpcode()))
return true;
- if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()))
+ if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode()))
return true;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case AMDGPU::PRED_X:
case AMDGPU::INTERP_PAIR_XY:
case AMDGPU::INTERP_PAIR_ZW:
@@ -89,8 +89,8 @@ private:
}
}
- bool IsTrivialInst(MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+ bool IsTrivialInst(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
case AMDGPU::KILL:
case AMDGPU::RETURN:
case AMDGPU::IMPLICIT_DEF:
@@ -114,18 +114,20 @@ private:
((((Sel >> 2) - 512) & 4095) >> 5) << 1);
}
- bool SubstituteKCacheBank(MachineInstr *MI,
- std::vector<std::pair<unsigned, unsigned> > &CachedConsts,
- bool UpdateInstr = true) const {
+ bool
+ SubstituteKCacheBank(MachineInstr &MI,
+ std::vector<std::pair<unsigned, unsigned>> &CachedConsts,
+ bool UpdateInstr = true) const {
std::vector<std::pair<unsigned, unsigned> > UsedKCache;
- if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4)
+ if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4)
return true;
- const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts =
+ const SmallVectorImpl<std::pair<MachineOperand *, int64_t>> &Consts =
TII->getSrcs(MI);
- assert((TII->isALUInstr(MI->getOpcode()) ||
- MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const");
+ assert(
+ (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == AMDGPU::DOT_4) &&
+ "Can't assign Const");
for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
continue;
@@ -194,9 +196,9 @@ private:
// in the clause.
unsigned LastUseCount = 0;
for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) {
- AluInstCount += OccupiedDwords(UseI);
+ AluInstCount += OccupiedDwords(*UseI);
// Make sure we won't need to end the clause due to KCache limitations.
- if (!SubstituteKCacheBank(UseI, KCacheBanks, false))
+ if (!SubstituteKCacheBank(*UseI, KCacheBanks, false))
return false;
// We have reached the maximum instruction limit before finding the
@@ -230,9 +232,9 @@ private:
bool PushBeforeModifier = false;
unsigned AluInstCount = 0;
for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
- if (IsTrivialInst(I))
+ if (IsTrivialInst(*I))
continue;
- if (!isALU(I))
+ if (!isALU(*I))
break;
if (AluInstCount > TII->getMaxAlusPerClause())
break;
@@ -245,7 +247,7 @@ private:
// clause as predicated alus).
if (AluInstCount > 0)
break;
- if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH)
+ if (TII->getFlagOp(*I).getImm() & MO_FLAG_PUSH)
PushBeforeModifier = true;
AluInstCount ++;
continue;
@@ -267,16 +269,16 @@ private:
if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E))
break;
- if (!SubstituteKCacheBank(I, KCacheBanks))
+ if (!SubstituteKCacheBank(*I, KCacheBanks))
break;
- AluInstCount += OccupiedDwords(I);
+ AluInstCount += OccupiedDwords(*I);
}
unsigned Opcode = PushBeforeModifier ?
AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
// We don't use the ADDR field until R600ControlFlowFinalizer pass, where
// it is safe to assume it is 0. However if we always put 0 here, the ifcvt
- // pass may assume that identical ALU clause starter at the beginning of a
+    // pass may assume that identical ALU clause starters at the beginning of a
     // true and false branch can be factorized, which is not the case.
.addImm(Address++) // ADDR
.addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0
@@ -298,7 +300,8 @@ public:
}
bool runOnMachineFunction(MachineFunction &MF) override {
- TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+ TII = ST.getInstrInfo();
for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
BB != BB_E; ++BB) {
@@ -307,7 +310,7 @@ public:
if (I->getOpcode() == AMDGPU::CF_ALU)
continue; // BB was already parsed
for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
- if (isALU(I))
+ if (isALU(*I))
I = MakeALUClause(MBB, I);
else
++I;
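
The selector arithmetic shown near the top of the SubstituteKCacheBank hunk recovers a (bank, even line) pair from an encoded constant selector; a standalone check of that arithmetic, assuming the encoding Sel = (512 + (bank << 12) + index) << 2 (an assumption, not shown in this hunk) and that banks lock lines in pairs, hence the even line number:

    #include <cassert>
    #include <utility>

    static std::pair<unsigned, unsigned> bankAndLine(unsigned Sel) {
      unsigned Bank = ((Sel >> 2) - 512) >> 12;
      unsigned Line = ((((Sel >> 2) - 512) & 4095) >> 5) << 1;
      return {Bank, Line};
    }

    int main() {
      unsigned Sel = (512 + (1u << 12) + 100) << 2;
      assert(bankAndLine(Sel) == std::make_pair(1u, 6u)); // index 100 -> line 6
    }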
diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 211d392e8fcc..0385b6283f37 100644
--- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -29,7 +29,6 @@ using namespace llvm;
namespace {
class R600ExpandSpecialInstrsPass : public MachineFunctionPass {
-
private:
static char ID;
const R600InstrInfo *TII;
@@ -61,12 +60,13 @@ void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
int OpIdx = TII->getOperandIdx(*OldMI, Op);
if (OpIdx > -1) {
uint64_t Val = OldMI->getOperand(OpIdx).getImm();
- TII->setImmOperand(NewMI, Op, Val);
+ TII->setImmOperand(*NewMI, Op, Val);
}
}
bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+ TII = ST.getInstrInfo();
const R600RegisterInfo &TRI = TII->getRegisterInfo();
@@ -107,11 +107,11 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MI.getOperand(0).getReg(), // dst
MI.getOperand(1).getReg(), // src0
AMDGPU::ZERO); // src1
- TII->addFlag(PredSet, 0, MO_FLAG_MASK);
+ TII->addFlag(*PredSet, 0, MO_FLAG_MASK);
if (Flags & MO_FLAG_PUSH) {
- TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1);
+ TII->setImmOperand(*PredSet, AMDGPU::OpName::update_exec_mask, 1);
} else {
- TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1);
+ TII->setImmOperand(*PredSet, AMDGPU::OpName::update_pred, 1);
}
MI.eraseFromParent();
continue;
@@ -137,9 +137,9 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
BMI->bundleWithPred();
}
if (Chan >= 2)
- TII->addFlag(BMI, 0, MO_FLAG_MASK);
+ TII->addFlag(*BMI, 0, MO_FLAG_MASK);
if (Chan != 3)
- TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+ TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
}
MI.eraseFromParent();
@@ -166,9 +166,9 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
BMI->bundleWithPred();
}
if (Chan < 2)
- TII->addFlag(BMI, 0, MO_FLAG_MASK);
+ TII->addFlag(*BMI, 0, MO_FLAG_MASK);
if (Chan != 3)
- TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+ TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
}
MI.eraseFromParent();
@@ -189,7 +189,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
BMI->bundleWithPred();
}
if (Chan != 3)
- TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+ TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
}
MI.eraseFromParent();
@@ -212,10 +212,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
BMI->bundleWithPred();
}
if (Mask) {
- TII->addFlag(BMI, 0, MO_FLAG_MASK);
+ TII->addFlag(*BMI, 0, MO_FLAG_MASK);
}
if (Chan != 3)
- TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+ TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
unsigned Opcode = BMI->getOpcode();
// While not strictly necessary from hw point of view, we force
// all src operands of a dot4 inst to belong to the same slot.
@@ -330,10 +330,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
if (Chan != 0)
NewMI->bundleWithPred();
if (Mask) {
- TII->addFlag(NewMI, 0, MO_FLAG_MASK);
+ TII->addFlag(*NewMI, 0, MO_FLAG_MASK);
}
if (NotLast) {
- TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
+ TII->addFlag(*NewMI, 0, MO_FLAG_NOT_LAST);
}
SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp);
SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal);
diff --git a/lib/Target/AMDGPU/R600FrameLowering.cpp b/lib/Target/AMDGPU/R600FrameLowering.cpp
new file mode 100644
index 000000000000..dd5681ff5e8b
--- /dev/null
+++ b/lib/Target/AMDGPU/R600FrameLowering.cpp
@@ -0,0 +1,15 @@
+//===----------------------- R600FrameLowering.cpp ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600FrameLowering.h"
+
+using namespace llvm;
+
+R600FrameLowering::~R600FrameLowering() {
+}
diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h
new file mode 100644
index 000000000000..5fe4e0d201ac
--- /dev/null
+++ b/lib/Target/AMDGPU/R600FrameLowering.h
@@ -0,0 +1,30 @@
+//===--------------------- R600FrameLowering.h ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H
+
+#include "AMDGPUFrameLowering.h"
+
+namespace llvm {
+
+class R600FrameLowering : public AMDGPUFrameLowering {
+public:
+ R600FrameLowering(StackDirection D, unsigned StackAl, int LAO,
+ unsigned TransAl = 1) :
+ AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
+ virtual ~R600FrameLowering();
+
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const {}
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {}
+};
+
+}
+
+#endif
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 124a9c6e0f56..8f78edd76a51 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -30,18 +30,61 @@
using namespace llvm;
-R600TargetLowering::R600TargetLowering(TargetMachine &TM,
- const AMDGPUSubtarget &STI)
+R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
+ const R600Subtarget &STI)
: AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
- addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
- addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
+ addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
+ addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
computeRegisterProperties(STI.getRegisterInfo());
+ // Legalize loads and stores to the private address space.
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+
+ // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
+ // spaces, so it is custom lowered to handle those where it isn't.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
+
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
+ }
+
+ // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
+
+
+ setOperationAction(ISD::STORE, MVT::i8, Custom);
+ setOperationAction(ISD::STORE, MVT::i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+
+ setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+ setTruncStoreAction(MVT::i32, MVT::i16, Custom);
+
+ // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
+ setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
+
// Set condition code actions
setCondCodeAction(ISD::SETO, MVT::f32, Expand);
setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
@@ -73,10 +116,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM,
setOperationAction(ISD::FSUB, MVT::f32, Expand);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
-
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
@@ -122,37 +161,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
-
- // Legalize loads and stores to the private address space.
- setOperationAction(ISD::LOAD, MVT::i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
-
- // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
- // spaces, so it is custom lowered to handle those where it isn't.
- for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
-
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
-
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
- }
-
- setOperationAction(ISD::STORE, MVT::i8, Custom);
- setOperationAction(ISD::STORE, MVT::i32, Custom);
- setOperationAction(ISD::STORE, MVT::v2i32, Custom);
- setOperationAction(ISD::STORE, MVT::v4i32, Custom);
- setTruncStoreAction(MVT::i32, MVT::i8, Custom);
- setTruncStoreAction(MVT::i32, MVT::i16, Custom);
-
- setOperationAction(ISD::LOAD, MVT::i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
@@ -165,12 +173,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
- setTargetDAGCombine(ISD::FP_ROUND);
- setTargetDAGCombine(ISD::FP_TO_SINT);
- setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
- setTargetDAGCombine(ISD::SELECT_CC);
- setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
-
// We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
// to be Legal/Custom in order to avoid library calls.
setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
@@ -188,119 +190,138 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM,
}
setSchedulingPreference(Sched::Source);
+
+
+ setTargetDAGCombine(ISD::FP_ROUND);
+ setTargetDAGCombine(ISD::FP_TO_SINT);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::SELECT_CC);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+}
+
+const R600Subtarget *R600TargetLowering::getSubtarget() const {
+ return static_cast<const R600Subtarget *>(Subtarget);
}
static inline bool isEOP(MachineBasicBlock::iterator I) {
return std::next(I)->getOpcode() == AMDGPU::RETURN;
}
-MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
- MachineInstr * MI, MachineBasicBlock * BB) const {
+MachineBasicBlock *
+R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
MachineFunction * MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- MachineBasicBlock::iterator I = *MI;
- const R600InstrInfo *TII =
- static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
+ MachineBasicBlock::iterator I = MI;
+ const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
    // Replace LDS_*_RET instructions that don't have any uses with the
// equivalent LDS_*_NORET instruction.
- if (TII->isLDSRetInstr(MI->getOpcode())) {
- int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+ if (TII->isLDSRetInstr(MI.getOpcode())) {
+ int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
assert(DstIdx != -1);
MachineInstrBuilder NewMI;
// FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
// LDS_1A2D support and remove this special case.
- if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
- MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
+ if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
+ MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
return BB;
NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
- TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
- for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
- NewMI.addOperand(MI->getOperand(i));
+ TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
+ for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
+ NewMI.addOperand(MI.getOperand(i));
}
} else {
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
break;
case AMDGPU::CLAMP_R600: {
- MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
- AMDGPU::MOV,
- MI->getOperand(0).getReg(),
- MI->getOperand(1).getReg());
- TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
+ MachineInstr *NewMI = TII->buildDefaultInstruction(
+ *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ MI.getOperand(1).getReg());
+ TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
break;
}
case AMDGPU::FABS_R600: {
- MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
- AMDGPU::MOV,
- MI->getOperand(0).getReg(),
- MI->getOperand(1).getReg());
- TII->addFlag(NewMI, 0, MO_FLAG_ABS);
+ MachineInstr *NewMI = TII->buildDefaultInstruction(
+ *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ MI.getOperand(1).getReg());
+ TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
break;
}
case AMDGPU::FNEG_R600: {
- MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
- AMDGPU::MOV,
- MI->getOperand(0).getReg(),
- MI->getOperand(1).getReg());
- TII->addFlag(NewMI, 0, MO_FLAG_NEG);
+ MachineInstr *NewMI = TII->buildDefaultInstruction(
+ *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ MI.getOperand(1).getReg());
+ TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
break;
}
case AMDGPU::MASK_WRITE: {
- unsigned maskedRegister = MI->getOperand(0).getReg();
+ unsigned maskedRegister = MI.getOperand(0).getReg();
assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
- TII->addFlag(defInstr, 0, MO_FLAG_MASK);
+ TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
break;
}
case AMDGPU::MOV_IMM_F32:
- TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
- MI->getOperand(1).getFPImm()->getValueAPF()
- .bitcastToAPInt().getZExtValue());
+ TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
+ .getFPImm()
+ ->getValueAPF()
+ .bitcastToAPInt()
+ .getZExtValue());
break;
case AMDGPU::MOV_IMM_I32:
- TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
- MI->getOperand(1).getImm());
+ TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
+ MI.getOperand(1).getImm());
break;
+ case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
+    // TODO: Perhaps combine this instruction with the next if possible.
+ auto MIB = TII->buildDefaultInstruction(
+ *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
+ int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
+    // TODO: This is rather ugly.
+ MIB->getOperand(Idx) = MI.getOperand(1);
+ break;
+ }
case AMDGPU::CONST_COPY: {
- MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
- MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
- TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
- MI->getOperand(1).getImm());
+ MachineInstr *NewMI = TII->buildDefaultInstruction(
+ *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
+ TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
+ MI.getOperand(1).getImm());
break;
}
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addImm(isEOP(I)); // Set End of program bit
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addImm(isEOP(I)); // Set End of program bit
break;
}
case AMDGPU::RAT_STORE_TYPED_eg: {
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addOperand(MI->getOperand(2))
- .addImm(isEOP(I)); // Set End of program bit
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addOperand(MI.getOperand(2))
+ .addImm(isEOP(I)); // Set End of program bit
break;
}
case AMDGPU::TXD: {
unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
- MachineOperand &RID = MI->getOperand(4);
- MachineOperand &SID = MI->getOperand(5);
- unsigned TextureId = MI->getOperand(6).getImm();
+ MachineOperand &RID = MI.getOperand(4);
+ MachineOperand &SID = MI.getOperand(5);
+ unsigned TextureId = MI.getOperand(6).getImm();
unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
@@ -333,75 +354,77 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
CTZ = 0;
break;
}
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
- .addOperand(MI->getOperand(3))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
- .addOperand(MI->getOperand(2))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW);
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
+ T0)
+ .addOperand(MI.getOperand(3))
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW);
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
+ T1)
+ .addOperand(MI.getOperand(2))
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW)
- .addReg(T0, RegState::Implicit)
- .addReg(T1, RegState::Implicit);
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW)
+ .addReg(T0, RegState::Implicit)
+ .addReg(T1, RegState::Implicit);
break;
}
case AMDGPU::TXD_SHADOW: {
unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
- MachineOperand &RID = MI->getOperand(4);
- MachineOperand &SID = MI->getOperand(5);
- unsigned TextureId = MI->getOperand(6).getImm();
+ MachineOperand &RID = MI.getOperand(4);
+ MachineOperand &SID = MI.getOperand(5);
+ unsigned TextureId = MI.getOperand(6).getImm();
unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
@@ -435,99 +458,101 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
break;
}
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
- .addOperand(MI->getOperand(3))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
- .addOperand(MI->getOperand(2))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW);
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
+ T0)
+ .addOperand(MI.getOperand(3))
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW);
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
+ T1)
+ .addOperand(MI.getOperand(2))
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW)
- .addReg(T0, RegState::Implicit)
- .addReg(T1, RegState::Implicit);
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW)
+ .addReg(T0, RegState::Implicit)
+ .addReg(T1, RegState::Implicit);
break;
}
case AMDGPU::BRANCH:
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
- .addOperand(MI->getOperand(0));
- break;
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+ .addOperand(MI.getOperand(0));
+ break;
case AMDGPU::BRANCH_COND_f32: {
MachineInstr *NewMI =
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
- AMDGPU::PREDICATE_BIT)
- .addOperand(MI->getOperand(1))
- .addImm(OPCODE_IS_NOT_ZERO)
- .addImm(0); // Flags
- TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
+ AMDGPU::PREDICATE_BIT)
+ .addOperand(MI.getOperand(1))
+ .addImm(OPCODE_IS_NOT_ZERO)
+ .addImm(0); // Flags
+ TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
- .addOperand(MI->getOperand(0))
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ .addOperand(MI.getOperand(0))
+ .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
break;
}
case AMDGPU::BRANCH_COND_i32: {
MachineInstr *NewMI =
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
- AMDGPU::PREDICATE_BIT)
- .addOperand(MI->getOperand(1))
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
+ AMDGPU::PREDICATE_BIT)
+ .addOperand(MI.getOperand(1))
.addImm(OPCODE_IS_NOT_ZERO_INT)
.addImm(0); // Flags
- TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
+ TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
- .addOperand(MI->getOperand(0))
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ .addOperand(MI.getOperand(0))
+ .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
break;
}
@@ -535,7 +560,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::R600_ExportSwz: {
     // Instruction is left unmodified if it's not the last one of its type.
bool isLastInstructionOfItsType = true;
- unsigned InstExportType = MI->getOperand(1).getImm();
+ unsigned InstExportType = MI.getOperand(1).getImm();
for (MachineBasicBlock::iterator NextExportInst = std::next(I),
EndBlock = BB->end(); NextExportInst != EndBlock;
NextExportInst = std::next(NextExportInst)) {
@@ -552,17 +577,17 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
bool EOP = isEOP(I);
if (!EOP && !isLastInstructionOfItsType)
return BB;
- unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addOperand(MI->getOperand(2))
- .addOperand(MI->getOperand(3))
- .addOperand(MI->getOperand(4))
- .addOperand(MI->getOperand(5))
- .addOperand(MI->getOperand(6))
- .addImm(CfInst)
- .addImm(EOP);
+ unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addOperand(MI.getOperand(2))
+ .addOperand(MI.getOperand(3))
+ .addOperand(MI.getOperand(4))
+ .addOperand(MI.getOperand(5))
+ .addOperand(MI.getOperand(6))
+ .addImm(CfInst)
+ .addImm(EOP);
break;
}
case AMDGPU::RETURN: {
@@ -576,7 +601,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
}
}
- MI->eraseFromParent();
+ MI.eraseFromParent();
return BB;
}
@@ -610,18 +635,13 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
+ case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
case ISD::INTRINSIC_VOID: {
SDValue Chain = Op.getOperand(0);
unsigned IntrinsicID =
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntrinsicID) {
- case AMDGPUIntrinsic::AMDGPU_store_output: {
- int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
- unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
- MFI->LiveOuts.push_back(Reg);
- return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
- }
- case AMDGPUIntrinsic::R600_store_swizzle: {
+ case AMDGPUIntrinsic::r600_store_swizzle: {
SDLoc DL(Op);
const SDValue Args[8] = {
Chain,
@@ -649,114 +669,48 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
SDLoc DL(Op);
switch(IntrinsicID) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
- case AMDGPUIntrinsic::R600_load_input: {
- int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
- MachineFunction &MF = DAG.getMachineFunction();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- MRI.addLiveIn(Reg);
- return DAG.getCopyFromReg(DAG.getEntryNode(),
- SDLoc(DAG.getEntryNode()), Reg, VT);
- }
-
- case AMDGPUIntrinsic::R600_interp_input: {
- int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
- MachineSDNode *interp;
- if (ijb < 0) {
- const R600InstrInfo *TII =
- static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
- interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
- MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32));
- return DAG.getTargetExtractSubreg(
- TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
- DL, MVT::f32, SDValue(interp, 0));
- }
- MachineFunction &MF = DAG.getMachineFunction();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
- unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
- MRI.addLiveIn(RegisterI);
- MRI.addLiveIn(RegisterJ);
- SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
- SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
- SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
- SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
-
- if (slot % 4 < 2)
- interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
- MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
- RegisterJNode, RegisterINode);
- else
- interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
- MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
- RegisterJNode, RegisterINode);
- return SDValue(interp, slot % 2);
- }
- case AMDGPUIntrinsic::R600_interp_xy:
- case AMDGPUIntrinsic::R600_interp_zw: {
- int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- MachineSDNode *interp;
- SDValue RegisterINode = Op.getOperand(2);
- SDValue RegisterJNode = Op.getOperand(3);
-
- if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
- interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
- MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
- RegisterJNode, RegisterINode);
- else
- interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
- MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
- RegisterJNode, RegisterINode);
- return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
- SDValue(interp, 0), SDValue(interp, 1));
- }
- case AMDGPUIntrinsic::R600_tex:
- case AMDGPUIntrinsic::R600_texc:
- case AMDGPUIntrinsic::R600_txl:
- case AMDGPUIntrinsic::R600_txlc:
- case AMDGPUIntrinsic::R600_txb:
- case AMDGPUIntrinsic::R600_txbc:
- case AMDGPUIntrinsic::R600_txf:
- case AMDGPUIntrinsic::R600_txq:
- case AMDGPUIntrinsic::R600_ddx:
- case AMDGPUIntrinsic::R600_ddy:
- case AMDGPUIntrinsic::R600_ldptr: {
+ case AMDGPUIntrinsic::r600_tex:
+ case AMDGPUIntrinsic::r600_texc:
+ case AMDGPUIntrinsic::r600_txl:
+ case AMDGPUIntrinsic::r600_txlc:
+ case AMDGPUIntrinsic::r600_txb:
+ case AMDGPUIntrinsic::r600_txbc:
+ case AMDGPUIntrinsic::r600_txf:
+ case AMDGPUIntrinsic::r600_txq:
+ case AMDGPUIntrinsic::r600_ddx:
+ case AMDGPUIntrinsic::r600_ddy: {
unsigned TextureOp;
switch (IntrinsicID) {
- case AMDGPUIntrinsic::R600_tex:
+ case AMDGPUIntrinsic::r600_tex:
TextureOp = 0;
break;
- case AMDGPUIntrinsic::R600_texc:
+ case AMDGPUIntrinsic::r600_texc:
TextureOp = 1;
break;
- case AMDGPUIntrinsic::R600_txl:
+ case AMDGPUIntrinsic::r600_txl:
TextureOp = 2;
break;
- case AMDGPUIntrinsic::R600_txlc:
+ case AMDGPUIntrinsic::r600_txlc:
TextureOp = 3;
break;
- case AMDGPUIntrinsic::R600_txb:
+ case AMDGPUIntrinsic::r600_txb:
TextureOp = 4;
break;
- case AMDGPUIntrinsic::R600_txbc:
+ case AMDGPUIntrinsic::r600_txbc:
TextureOp = 5;
break;
- case AMDGPUIntrinsic::R600_txf:
+ case AMDGPUIntrinsic::r600_txf:
TextureOp = 6;
break;
- case AMDGPUIntrinsic::R600_txq:
+ case AMDGPUIntrinsic::r600_txq:
TextureOp = 7;
break;
- case AMDGPUIntrinsic::R600_ddx:
+ case AMDGPUIntrinsic::r600_ddx:
TextureOp = 8;
break;
- case AMDGPUIntrinsic::R600_ddy:
+ case AMDGPUIntrinsic::r600_ddy:
TextureOp = 9;
break;
- case AMDGPUIntrinsic::R600_ldptr:
- TextureOp = 10;
- break;
default:
llvm_unreachable("Unknow Texture Operation");
}
@@ -784,7 +738,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
};
return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
}
- case AMDGPUIntrinsic::AMDGPU_dp4: {
+ case AMDGPUIntrinsic::r600_dot4: {
SDValue Args[8] = {
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
DAG.getConstant(0, DL, MVT::i32)),
@@ -806,6 +760,11 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
}
+ case Intrinsic::r600_implicitarg_ptr: {
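+    // The implicit kernel arguments start at a fixed byte offset in the
+    // PARAM_I constant space, so the pointer itself is just that constant.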
+ MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
+ uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ return DAG.getConstant(ByteOffset, DL, PtrVT);
+ }
case Intrinsic::r600_read_ngroups_x:
return LowerImplicitParameter(DAG, VT, DL, 0);
case Intrinsic::r600_read_ngroups_y:
@@ -825,7 +784,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_read_local_size_z:
return LowerImplicitParameter(DAG, VT, DL, 8);
- case Intrinsic::AMDGPU_read_workdim: {
+ case Intrinsic::r600_read_workdim:
+ case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name.
uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
}
@@ -848,14 +808,14 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_read_tidig_z:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
AMDGPU::T0_Z, VT);
- case Intrinsic::AMDGPU_rsq:
- // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
- return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
- case AMDGPUIntrinsic::AMDGPU_fract:
- case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
- return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
+ case Intrinsic::r600_recipsqrt_ieee:
+ return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::r600_recipsqrt_clamped:
+ return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
}
+
// break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
break;
}
@@ -950,6 +910,22 @@ SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return vectorToVerticalVector(DAG, Insert);
}
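+/// Globals in the constant address space are lowered to a CONST_DATA_PTR
+/// node wrapping the target global address; all other address spaces defer
+/// to the generic AMDGPU lowering.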
+SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
+ SDValue Op,
+ SelectionDAG &DAG) const {
+
+ GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+ if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+ return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
+
+ const DataLayout &DL = DAG.getDataLayout();
+ const GlobalValue *GV = GSD->getGlobal();
+ MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+
+ SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
+ return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
+}
+
SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1.0 and 1.0.
  // Thus we lower them to TRIG(FRACT(x / 2Pi + 0.5) - 0.5).
@@ -977,7 +953,7 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
DAG.getNode(ISD::FADD, DL, VT, FractPart,
DAG.getConstantFP(-0.5, DL, MVT::f32)));
- if (Gen >= AMDGPUSubtarget::R700)
+ if (Gen >= R600Subtarget::R700)
return TrigVal;
// On R600 hw, COS/SIN input must be between -Pi and Pi.
return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
@@ -1088,7 +1064,7 @@ SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
}
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
- SDLoc DL,
+ const SDLoc &DL,
unsigned DwordOffset) const {
unsigned ByteOffset = DwordOffset * 4;
PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
@@ -1099,8 +1075,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
return DAG.getLoad(VT, DL, DAG.getEntryNode(),
DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
- MachinePointerInfo(ConstantPointerNull::get(PtrType)),
- false, false, false, 0);
+ MachinePointerInfo(ConstantPointerNull::get(PtrType)));
}
bool R600TargetLowering::isZero(SDValue Op) const {
@@ -1113,6 +1088,20 @@ bool R600TargetLowering::isZero(SDValue Op) const {
}
}
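+// R600 hardware encodes boolean results as 1.0f (or all-ones for integers)
+// for "true" and 0.0f (or zero) for "false"; these helpers recognize
+// operands already in that canonical form.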
+bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
+ if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+ return CFP->isExactlyValue(1.0);
+ }
+ return isAllOnesConstant(Op);
+}
+
+bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
+ if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+ return CFP->getValueAPF().isZero();
+ }
+ return isNullConstant(Op);
+}
+
SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
@@ -1311,19 +1300,73 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth,
}
}
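+// Private memory is addressed as dword-wide registers, so a sub-dword store
+// becomes a read-modify-write: load the containing dword, clear the target
+// byte/short lane, OR in the shifted value, and store the dword back.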
+SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Store);
+
+ unsigned Mask = 0;
+ if (Store->getMemoryVT() == MVT::i8) {
+ Mask = 0xff;
+ } else if (Store->getMemoryVT() == MVT::i16) {
+ Mask = 0xffff;
+ }
+
+ SDValue Chain = Store->getChain();
+ SDValue BasePtr = Store->getBasePtr();
+ EVT MemVT = Store->getMemoryVT();
+
+ SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
+ DAG.getConstant(2, DL, MVT::i32));
+ SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Chain, Ptr,
+ DAG.getTargetConstant(0, DL, MVT::i32));
+
+ SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+ DAG.getConstant(0x3, DL, MVT::i32));
+
+ SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+ DAG.getConstant(3, DL, MVT::i32));
+
+ SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
+ Store->getValue());
+
+ SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
+
+ SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ MaskedValue, ShiftAmt);
+
+ SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ DAG.getConstant(Mask, DL, MVT::i32),
+ ShiftAmt);
+ DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
+ DAG.getConstant(0xffffffff, DL, MVT::i32));
+ Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+
+ SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+ return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+ Chain, Value, Ptr,
+ DAG.getTargetConstant(0, DL, MVT::i32));
+}
+
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
+ if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG))
+ return Result;
+
StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
- SDValue Chain = Op.getOperand(0);
- SDValue Value = Op.getOperand(1);
- SDValue Ptr = Op.getOperand(2);
+ unsigned AS = StoreNode->getAddressSpace();
+ SDValue Value = StoreNode->getValue();
+ EVT ValueVT = Value.getValueType();
- SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
- if (Result.getNode()) {
- return Result;
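+  // Vector stores to local or private memory are split before any further
+  // lowering.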
+ if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
+ ValueVT.isVector()) {
+ return SplitVectorStore(Op, DAG);
}
- if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
+ SDLoc DL(Op);
+ SDValue Chain = StoreNode->getChain();
+ SDValue Ptr = StoreNode->getBasePtr();
+
+ if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
if (StoreNode->isTruncatingStore()) {
EVT VT = Value.getValueType();
assert(VT.bitsLE(MVT::i32));
@@ -1352,13 +1395,13 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(0, DL, MVT::i32),
Mask
};
- SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
+ SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
SDValue Args[3] = { Chain, Input, DWordAddr };
return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
Op->getVTList(), Args, MemVT,
StoreNode->getMemOperand());
} else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
- Value.getValueType().bitsGE(MVT::i32)) {
+ ValueVT.bitsGE(MVT::i32)) {
// Convert pointer from byte address to dword address.
Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
@@ -1373,21 +1416,16 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
}
}
- EVT ValueVT = Value.getValueType();
-
- if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+ if (AS != AMDGPUAS::PRIVATE_ADDRESS)
return SDValue();
- }
- SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
- if (Ret.getNode()) {
- return Ret;
- }
- // Lowering for indirect addressing
+ EVT MemVT = StoreNode->getMemoryVT();
+ if (MemVT.bitsLT(MVT::i32))
+ return lowerPrivateTruncStore(StoreNode, DAG);
+ // Lowering for indirect addressing
const MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL =
- static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
+ const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
unsigned StackWidth = TFL->getStackWidth(MF);
Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
@@ -1465,37 +1503,81 @@ ConstantAddressBlock(unsigned AddressSpace) {
}
}
-SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
-{
- EVT VT = Op.getValueType();
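+// Counterpart of lowerPrivateTruncStore: lower a sub-dword extending load
+// from private memory to a dword REGISTER_LOAD followed by a shift and a
+// sign- or zero-extension of the selected byte/short lane.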
+SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc DL(Op);
- LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
- SDValue Chain = Op.getOperand(0);
- SDValue Ptr = Op.getOperand(1);
- SDValue LoweredLoad;
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
+ ISD::LoadExtType ExtType = Load->getExtensionType();
+ EVT MemVT = Load->getMemoryVT();
+
+ // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
+ // register (2-)byte extract.
+
+ // Get Register holding the target.
+ SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
+ DAG.getConstant(2, DL, MVT::i32));
+  // Load the register.
+ SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+ Load->getChain(),
+ Ptr,
+ DAG.getTargetConstant(0, DL, MVT::i32),
+ Op.getOperand(2));
+
+ // Get offset within the register.
+ SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
+ Load->getBasePtr(),
+ DAG.getConstant(0x3, DL, MVT::i32));
+
+ // Bit offset of target byte (byteIdx * 8).
+ SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+ DAG.getConstant(3, DL, MVT::i32));
+
+ // Shift to the right.
+ Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+
+ // Eliminate the upper bits by setting them to ...
+ EVT MemEltVT = MemVT.getScalarType();
+
+ // ... ones.
+ if (ExtType == ISD::SEXTLOAD) {
+ SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
+ Load->getChain()
+ };
- if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG))
- return Ret;
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ // ... or zeros.
+ SDValue Ops[] = {
+ DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
+ Load->getChain()
+ };
- // Lower loads constant address space global variable loads
- if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
- isa<GlobalVariable>(GetUnderlyingObject(
- LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) {
+ return DAG.getMergeValues(Ops, DL);
+}
- SDValue Ptr = DAG.getZExtOrTrunc(
- LoadNode->getBasePtr(), DL,
- getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS));
- Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
- DAG.getConstant(2, DL, MVT::i32));
- return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
- LoadNode->getChain(), Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32),
- Op.getOperand(2));
+SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+ unsigned AS = LoadNode->getAddressSpace();
+ EVT MemVT = LoadNode->getMemoryVT();
+ ISD::LoadExtType ExtType = LoadNode->getExtensionType();
+
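+  // Sub-dword extending loads from private memory go through the
+  // byte-extract path implemented above.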
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
+ ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
+ return lowerPrivateExtLoad(Op, DAG);
}
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue Chain = LoadNode->getChain();
+ SDValue Ptr = LoadNode->getBasePtr();
+
if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
SDValue MergedValues[2] = {
- ScalarizeVectorLoad(Op, DAG),
+ scalarizeVectorLoad(LoadNode, DAG),
Chain
};
return DAG.getMergeValues(MergedValues, DL);
@@ -1526,8 +1608,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
NewVT = VT;
NumElements = VT.getVectorNumElements();
}
- Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
- makeArrayRef(Slots, NumElements));
+ Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
} else {
     // A non-constant ptr can't be folded; keep it as a v4f32 load.
Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
@@ -1550,6 +1631,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
return DAG.getMergeValues(MergedValues, DL);
}
+ SDValue LoweredLoad;
+
// For most operations returning SDValue() will result in the node being
// expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
// need to manually expand loads that may be legal in some address spaces and
@@ -1560,12 +1643,9 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
EVT MemVT = LoadNode->getMemoryVT();
assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
- SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
- LoadNode->getPointerInfo(), MemVT,
- LoadNode->isVolatile(),
- LoadNode->isNonTemporal(),
- LoadNode->isInvariant(),
- LoadNode->getAlignment());
+ SDValue NewLoad = DAG.getExtLoad(
+ ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
+ LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
DAG.getValueType(MemVT));
@@ -1579,8 +1659,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
// Lowering for indirect addressing
const MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL =
- static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
+ const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
unsigned StackWidth = TFL->getStackWidth(MF);
Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
@@ -1590,6 +1669,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
EVT ElemVT = VT.getVectorElementType();
SDValue Loads[4];
+ assert(NumElemVT <= 4);
assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
"vector width in load");
@@ -1603,11 +1683,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
DAG.getTargetConstant(Channel, DL, MVT::i32),
Op.getOperand(2));
}
- for (unsigned i = NumElemVT; i < 4; ++i) {
- Loads[i] = DAG.getUNDEF(ElemVT);
- }
- EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
- LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
+ EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
+    LoweredLoad =
+        DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
} else {
LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
Chain, Ptr,
@@ -1632,16 +1709,28 @@ SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
Chain, Jump, Cond);
}
+SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
+
+ FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
+
+ unsigned FrameIndex = FIN->getIndex();
+ unsigned IgnoredFrameReg;
+ unsigned Offset =
+ TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
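+  // getFrameIndexReference returns an offset in stack slots; each slot is
+  // StackWidth dwords of 4 bytes each, so scale to a byte offset.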
+ return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
+ Op.getValueType());
+}
+
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
- SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
@@ -1664,7 +1753,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
MemVT = MemVT.getVectorElementType();
}
- if (MFI->getShaderType() != ShaderType::COMPUTE) {
+ if (AMDGPU::isShader(CallConv)) {
unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
InVals.push_back(Register);
@@ -1699,11 +1788,11 @@ SDValue R600TargetLowering::LowerFormalArguments(
unsigned Offset = 36 + VA.getLocMemOffset();
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
- SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
- DAG.getConstant(Offset, DL, MVT::i32),
- DAG.getUNDEF(MVT::i32),
- PtrInfo,
- MemVT, false, true, true, 4);
+ SDValue Arg = DAG.getLoad(
+ ISD::UNINDEXED, Ext, VT, DL, Chain,
+ DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
+ MemVT, /* Alignment = */ 4,
+ MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant);
// 4 is the preferred alignment for the CONSTANT memory space.
InVals.push_back(Arg);
@@ -1719,6 +1808,26 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
return VT.changeVectorElementTypeToInteger();
}
+bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+ unsigned AddrSpace,
+ unsigned Align,
+ bool *IsFast) const {
+ if (IsFast)
+ *IsFast = false;
+
+ if (!VT.isSimple() || VT == MVT::Other)
+ return false;
+
+ if (VT.bitsLT(MVT::i32))
+ return false;
+
+ // TODO: This is a rough estimate.
+ if (IsFast)
+ *IsFast = true;
+
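+  // Only types wider than a dword are accepted, and only when the access is
+  // still dword-aligned.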
+ return VT.bitsGT(MVT::i32) && Align % 4 == 0;
+}
+
static SDValue CompactSwizzlableVector(
SelectionDAG &DAG, SDValue VectorEntry,
DenseMap<unsigned, unsigned> &RemapSwizzle) {
@@ -1732,7 +1841,7 @@ static SDValue CompactSwizzlableVector(
};
for (unsigned i = 0; i < 4; i++) {
- if (NewBldVec[i].getOpcode() == ISD::UNDEF)
+ if (NewBldVec[i].isUndef())
       // We mask the write here to teach later passes that the ith element
       // of this vector is undef. Thus we can use it to reduce 128-bit
       // register usage, break false dependencies, and additionally make
       // assembly easier to read.
@@ -1747,7 +1856,7 @@ static SDValue CompactSwizzlableVector(
}
}
- if (NewBldVec[i].getOpcode() == ISD::UNDEF)
+ if (NewBldVec[i].isUndef())
continue;
for (unsigned j = 0; j < i; j++) {
if (NewBldVec[i] == NewBldVec[j]) {
@@ -1758,8 +1867,8 @@ static SDValue CompactSwizzlableVector(
}
}
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
- VectorEntry.getValueType(), NewBldVec);
+ return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
+ NewBldVec);
}
static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
@@ -1796,14 +1905,13 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
}
}
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
- VectorEntry.getValueType(), NewBldVec);
+ return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
+ NewBldVec);
}
-
-SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
- SDValue Swz[4], SelectionDAG &DAG,
- SDLoc DL) const {
+SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
+ SelectionDAG &DAG,
+ const SDLoc &DL) const {
assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
// Old -> New swizzle values
DenseMap<unsigned, unsigned> SwizzleRemap;
@@ -1886,7 +1994,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
SDLoc dl(N);
// If the inserted element is an UNDEF, just use the input vector.
- if (InVal.getOpcode() == ISD::UNDEF)
+ if (InVal.isUndef())
return InVec;
EVT VT = InVec.getValueType();
@@ -1907,7 +2015,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
Ops.append(InVec.getNode()->op_begin(),
InVec.getNode()->op_end());
- } else if (InVec.getOpcode() == ISD::UNDEF) {
+ } else if (InVec.isUndef()) {
unsigned NElts = VT.getVectorNumElements();
Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
} else {
@@ -1927,7 +2035,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
}
// Return the new vector
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ return DAG.getBuildVector(VT, dl, Ops);
}
// Extract_vec (Build_vector) generated by custom lowering
@@ -1953,8 +2061,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SELECT_CC: {
// Try common optimizations
- SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
- if (Ret.getNode())
+ if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
return Ret;
// fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
@@ -2053,13 +2160,14 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
-static bool
-FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
- SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
- const R600InstrInfo *TII =
- static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
+bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
+ SDValue &Src, SDValue &Neg, SDValue &Abs,
+ SDValue &Sel, SDValue &Imm,
+ SelectionDAG &DAG) const {
+ const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
if (!Src.isMachineOpcode())
return false;
+
switch (Src.getMachineOpcode()) {
case AMDGPU::FNEG_R600:
if (!Neg.getNode())
@@ -2127,6 +2235,13 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
return true;
}
+ case AMDGPU::MOV_IMM_GLOBAL_ADDR:
+    // Check if the Imm slot is already in use (same check as the MOV_IMM
+    // cases below).
+ if (cast<ConstantSDNode>(Imm)->getZExtValue())
+ return false;
+ Imm = Src.getOperand(0);
+ Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
+ return true;
case AMDGPU::MOV_IMM_I32:
case AMDGPU::MOV_IMM_F32: {
unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
@@ -2177,14 +2292,13 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
}
}
-
/// \brief Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
- const R600InstrInfo *TII =
- static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
+ const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
if (!Node->isMachineOpcode())
return Node;
+
unsigned Opcode = Node->getMachineOpcode();
SDValue FakeOp;
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index 4dbac97af2a1..2fb6ee25caa9 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -12,55 +12,69 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H
-#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H
#include "AMDGPUISelLowering.h"
namespace llvm {
class R600InstrInfo;
+class R600Subtarget;
-class R600TargetLowering : public AMDGPUTargetLowering {
+class R600TargetLowering final : public AMDGPUTargetLowering {
public:
- R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI);
- MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock * BB) const override;
+ R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI);
+
+ const R600Subtarget *getSubtarget() const;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
void ReplaceNodeResults(SDNode * N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
- SDValue LowerFormalArguments(
- SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
EVT VT) const override;
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
+ unsigned Align,
+ bool *IsFast) const override;
+
private:
unsigned Gen;
/// Each OpenCL kernel has nine implicit parameters that are stored in the
/// first nine dwords of a Vertex Buffer. These implicit parameters are
/// lowered to load instructions which retrieve the values from the Vertex
/// Buffer.
- SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
- SDLoc DL, unsigned DwordOffset) const;
+ SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, const SDLoc &DL,
+ unsigned DwordOffset) const;
void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
MachineRegisterInfo & MRI, unsigned dword_offset) const;
SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG,
- SDLoc DL) const;
+ const SDLoc &DL) const;
SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const;
+ SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+ SelectionDAG &DAG) const override;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue lowerPrivateExtLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
@@ -74,6 +88,13 @@ private:
void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
unsigned &Channel, unsigned &PtrIncr) const;
bool isZero(SDValue Op) const;
+ bool isHWTrueValue(SDValue Op) const;
+ bool isHWFalseValue(SDValue Op) const;
+
+ bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src,
+ SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm,
+ SelectionDAG &DAG) const;
+
SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
};
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index 8b6eea17130b..1c5f7ec1b6ef 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -28,26 +28,17 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenDFAPacketizer.inc"
-R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st)
- : AMDGPUInstrInfo(st), RI() {}
-
-const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const {
- return RI;
-}
-
-bool R600InstrInfo::isTrig(const MachineInstr &MI) const {
- return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG;
-}
+R600InstrInfo::R600InstrInfo(const R600Subtarget &ST)
+ : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
bool R600InstrInfo::isVector(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
}
-void
-R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
unsigned VectorComponents = 0;
if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) ||
AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) &&
@@ -91,10 +82,9 @@ bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
}
bool R600InstrInfo::isMov(unsigned Opcode) const {
-
-
switch(Opcode) {
- default: return false;
+ default:
+ return false;
case AMDGPU::MOV:
case AMDGPU::MOV_IMM_F32:
case AMDGPU::MOV_IMM_I32:
@@ -102,17 +92,6 @@ bool R600InstrInfo::isMov(unsigned Opcode) const {
}
}
-// Some instructions act as place holders to emulate operations that the GPU
-// hardware does automatically. This function can be used to check if
-// an opcode falls into this category.
-bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const {
- switch (Opcode) {
- default: return false;
- case AMDGPU::RETURN:
- return true;
- }
-}
-
bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
return false;
}
@@ -150,20 +129,16 @@ bool R600InstrInfo::isLDSInstr(unsigned Opcode) const {
(TargetFlags & R600_InstFlag::LDS_1A2D));
}
-bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const {
- return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1;
-}
-
bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const {
return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1;
}
-bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const {
- if (isALUInstr(MI->getOpcode()))
+bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const {
+ if (isALUInstr(MI.getOpcode()))
return true;
- if (isVector(*MI) || isCubeOp(MI->getOpcode()))
+ if (isVector(MI) || isCubeOp(MI.getOpcode()))
return true;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case AMDGPU::PRED_X:
case AMDGPU::INTERP_PAIR_XY:
case AMDGPU::INTERP_PAIR_ZW:
@@ -182,16 +157,16 @@ bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU);
}
-bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const {
- return isTransOnly(MI->getOpcode());
+bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const {
+ return isTransOnly(MI.getOpcode());
}
bool R600InstrInfo::isVectorOnly(unsigned Opcode) const {
return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU);
}
-bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const {
- return isVectorOnly(MI->getOpcode());
+bool R600InstrInfo::isVectorOnly(const MachineInstr &MI) const {
+ return isVectorOnly(MI.getOpcode());
}
bool R600InstrInfo::isExport(unsigned Opcode) const {
@@ -202,23 +177,21 @@ bool R600InstrInfo::usesVertexCache(unsigned Opcode) const {
return ST.hasVertexCache() && IS_VTX(get(Opcode));
}
-bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const {
- const MachineFunction *MF = MI->getParent()->getParent();
- const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
- return MFI->getShaderType() != ShaderType::COMPUTE &&
- usesVertexCache(MI->getOpcode());
+bool R600InstrInfo::usesVertexCache(const MachineInstr &MI) const {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ return !AMDGPU::isCompute(MF->getFunction()->getCallingConv()) &&
+ usesVertexCache(MI.getOpcode());
}
bool R600InstrInfo::usesTextureCache(unsigned Opcode) const {
return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode));
}
-bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const {
- const MachineFunction *MF = MI->getParent()->getParent();
- const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
- return (MFI->getShaderType() == ShaderType::COMPUTE &&
- usesVertexCache(MI->getOpcode())) ||
- usesTextureCache(MI->getOpcode());
+bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ return (AMDGPU::isCompute(MF->getFunction()->getCallingConv()) &&
+ usesVertexCache(MI.getOpcode())) ||
+ usesTextureCache(MI.getOpcode());
}
bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
@@ -231,20 +204,21 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
}
}
-bool R600InstrInfo::usesAddressRegister(MachineInstr *MI) const {
- return MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1;
+bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const {
+ return MI.findRegisterUseOperandIdx(AMDGPU::AR_X) != -1;
}
-bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const {
- return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1;
+bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const {
+ return MI.findRegisterDefOperandIdx(AMDGPU::AR_X) != -1;
}
-bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const {
- if (!isALUInstr(MI->getOpcode())) {
+bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
+ if (!isALUInstr(MI.getOpcode())) {
return false;
}
- for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
- E = MI->operands_end(); I != E; ++I) {
+ for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
+ E = MI.operands_end();
+ I != E; ++I) {
if (!I->isReg() || !I->isUse() ||
TargetRegisterInfo::isVirtualRegister(I->getReg()))
continue;
@@ -255,17 +229,6 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const {
return false;
}
-int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const {
- static const unsigned OpTable[] = {
- AMDGPU::OpName::src0,
- AMDGPU::OpName::src1,
- AMDGPU::OpName::src2
- };
-
- assert (SrcNum < 3);
- return getOperandIdx(Opcode, OpTable[SrcNum]);
-}
-
int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const {
static const unsigned SrcSelTable[][2] = {
{AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
@@ -290,10 +253,10 @@ int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const {
}
SmallVector<std::pair<MachineOperand *, int64_t>, 3>
-R600InstrInfo::getSrcs(MachineInstr *MI) const {
+R600InstrInfo::getSrcs(MachineInstr &MI) const {
SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result;
- if (MI->getOpcode() == AMDGPU::DOT_4) {
+ if (MI.getOpcode() == AMDGPU::DOT_4) {
static const unsigned OpTable[8][2] = {
{AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
{AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
@@ -306,13 +269,13 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const {
};
for (unsigned j = 0; j < 8; j++) {
- MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(),
- OpTable[j][0]));
+ MachineOperand &MO =
+ MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0]));
unsigned Reg = MO.getReg();
if (Reg == AMDGPU::ALU_CONST) {
- unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(),
- OpTable[j][1])).getImm();
- Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel));
+ MachineOperand &Sel =
+ MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
+ Result.push_back(std::make_pair(&MO, Sel.getImm()));
continue;
}
@@ -327,30 +290,33 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const {
};
for (unsigned j = 0; j < 3; j++) {
- int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
+ int SrcIdx = getOperandIdx(MI.getOpcode(), OpTable[j][0]);
if (SrcIdx < 0)
break;
- MachineOperand &MO = MI->getOperand(SrcIdx);
- unsigned Reg = MI->getOperand(SrcIdx).getReg();
+ MachineOperand &MO = MI.getOperand(SrcIdx);
+ unsigned Reg = MO.getReg();
if (Reg == AMDGPU::ALU_CONST) {
- unsigned Sel = MI->getOperand(
- getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
- Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel));
+ MachineOperand &Sel =
+ MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
+ Result.push_back(std::make_pair(&MO, Sel.getImm()));
continue;
}
if (Reg == AMDGPU::ALU_LITERAL_X) {
- unsigned Imm = MI->getOperand(
- getOperandIdx(MI->getOpcode(), AMDGPU::OpName::literal)).getImm();
- Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Imm));
- continue;
+ MachineOperand &Operand =
+ MI.getOperand(getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));
+ if (Operand.isImm()) {
+ Result.push_back(std::make_pair(&MO, Operand.getImm()));
+ continue;
+ }
+ assert(Operand.isGlobal());
}
- Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, 0));
+ Result.push_back(std::make_pair(&MO, 0));
}
return Result;
}
-std::vector<std::pair<int, unsigned> >
-R600InstrInfo::ExtractSrcs(MachineInstr *MI,
+std::vector<std::pair<int, unsigned>>
+R600InstrInfo::ExtractSrcs(MachineInstr &MI,
const DenseMap<unsigned, unsigned> &PV,
unsigned &ConstCount) const {
ConstCount = 0;
@@ -360,13 +326,13 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI,
unsigned i = 0;
for (unsigned n = Srcs.size(); i < n; ++i) {
unsigned Reg = Srcs[i].first->getReg();
- unsigned Index = RI.getEncodingValue(Reg) & 0xff;
+ int Index = RI.getEncodingValue(Reg) & 0xff;
if (Reg == AMDGPU::OQAP) {
- Result.push_back(std::pair<int, unsigned>(Index, 0));
+ Result.push_back(std::make_pair(Index, 0U));
}
if (PV.find(Reg) != PV.end()) {
      // 255 is used to tell it's a PS/PV reg
- Result.push_back(std::pair<int, unsigned>(255, 0));
+ Result.push_back(std::make_pair(255, 0U));
continue;
}
if (Index > 127) {
@@ -375,7 +341,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI,
continue;
}
unsigned Chan = RI.getHWRegChan(Reg);
- Result.push_back(std::pair<int, unsigned>(Index, Chan));
+ Result.push_back(std::make_pair(Index, Chan));
}
for (; i < 3; ++i)
Result.push_back(DummyPair);
@@ -411,8 +377,7 @@ Swizzle(std::vector<std::pair<int, unsigned> > Src,
return Src;
}
-static unsigned
-getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
+static unsigned getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
switch (Swz) {
case R600InstrInfo::ALU_VEC_012_SCL_210: {
unsigned Cycles[3] = { 2, 1, 0};
@@ -432,7 +397,6 @@ getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
}
default:
llvm_unreachable("Wrong Swizzle for Trans Slot");
- return 0;
}
}
@@ -557,7 +521,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
unsigned ConstCount;
BankSwizzle TransBS = ALU_VEC_012_SCL_210;
for (unsigned i = 0, e = IG.size(); i < e; ++i) {
- IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount));
+ IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount));
unsigned Op = getOperandIdx(IG[i]->getOpcode(),
AMDGPU::OpName::bank_swizzle);
ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
@@ -624,14 +588,13 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
std::vector<unsigned> Consts;
SmallSet<int64_t, 4> Literals;
for (unsigned i = 0, n = MIs.size(); i < n; i++) {
- MachineInstr *MI = MIs[i];
- if (!isALUInstr(MI->getOpcode()))
+ MachineInstr &MI = *MIs[i];
+ if (!isALUInstr(MI.getOpcode()))
continue;
ArrayRef<std::pair<MachineOperand *, int64_t>> Srcs = getSrcs(MI);
- for (unsigned j = 0, e = Srcs.size(); j < e; j++) {
- std::pair<MachineOperand *, unsigned> Src = Srcs[j];
+ for (const auto &Src:Srcs) {
if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X)
Literals.insert(Src.second);
if (Literals.size() > 4)
@@ -652,7 +615,7 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
DFAPacketizer *
R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const {
const InstrItineraryData *II = STI.getInstrItineraryData();
- return static_cast<const AMDGPUSubtarget &>(STI).createDFAPacketizer(II);
+ return static_cast<const R600Subtarget &>(STI).createDFAPacketizer(II);
}
static bool
@@ -670,9 +633,9 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) {
while (I != MBB.begin()) {
--I;
- MachineInstr *MI = I;
- if (isPredicateSetter(MI->getOpcode()))
- return MI;
+ MachineInstr &MI = *I;
+ if (isPredicateSetter(MI.getOpcode()))
+ return &MI;
}
return nullptr;
@@ -688,12 +651,11 @@ static bool isBranch(unsigned Opcode) {
Opcode == AMDGPU::BRANCH_COND_f32;
}
-bool
-R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
- MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
+bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
// Most of the following comes from the ARM implementation of AnalyzeBranch
// If the block has no terminators, it just falls into the block after it.
@@ -716,21 +678,21 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
I->removeFromParent();
I = PriorI;
}
- MachineInstr *LastInst = I;
+ MachineInstr &LastInst = *I;
// If there is only one terminator instruction, process it.
- unsigned LastOpc = LastInst->getOpcode();
+ unsigned LastOpc = LastInst.getOpcode();
if (I == MBB.begin() ||
!isJump(static_cast<MachineInstr *>(--I)->getOpcode())) {
if (LastOpc == AMDGPU::JUMP) {
- TBB = LastInst->getOperand(0).getMBB();
+ TBB = LastInst.getOperand(0).getMBB();
return false;
} else if (LastOpc == AMDGPU::JUMP_COND) {
- MachineInstr *predSet = I;
+ auto predSet = I;
while (!isPredicateSetter(predSet->getOpcode())) {
predSet = --I;
}
- TBB = LastInst->getOperand(0).getMBB();
+ TBB = LastInst.getOperand(0).getMBB();
Cond.push_back(predSet->getOperand(1));
Cond.push_back(predSet->getOperand(2));
Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
@@ -740,17 +702,17 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
}
// Get the instruction before it if it is a terminator.
- MachineInstr *SecondLastInst = I;
- unsigned SecondLastOpc = SecondLastInst->getOpcode();
+ MachineInstr &SecondLastInst = *I;
+ unsigned SecondLastOpc = SecondLastInst.getOpcode();
// If the block ends with a B and a Bcc, handle it.
if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) {
- MachineInstr *predSet = --I;
+ auto predSet = --I;
while (!isPredicateSetter(predSet->getOpcode())) {
predSet = --I;
}
- TBB = SecondLastInst->getOperand(0).getMBB();
- FBB = LastInst->getOperand(0).getMBB();
+ TBB = SecondLastInst.getOperand(0).getMBB();
+ FBB = LastInst.getOperand(0).getMBB();
Cond.push_back(predSet->getOperand(1));
Cond.push_back(predSet->getOperand(2));
Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
@@ -772,12 +734,11 @@ MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
return MBB.end();
}
-unsigned
-R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
- MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const {
+unsigned R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
if (!FBB) {
@@ -787,7 +748,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
} else {
MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
assert(PredSet && "No previous predicate !");
- addFlag(PredSet, 0, MO_FLAG_PUSH);
+ addFlag(*PredSet, 0, MO_FLAG_PUSH);
PredSet->getOperand(2).setImm(Cond[1].getImm());
BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
@@ -803,7 +764,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
} else {
MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
assert(PredSet && "No previous predicate !");
- addFlag(PredSet, 0, MO_FLAG_PUSH);
+ addFlag(*PredSet, 0, MO_FLAG_PUSH);
PredSet->getOperand(2).setImm(Cond[1].getImm());
BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
.addMBB(TBB)
@@ -835,7 +796,7 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 0;
case AMDGPU::JUMP_COND: {
MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
- clearFlag(predSet, 0, MO_FLAG_PUSH);
+ clearFlag(*predSet, 0, MO_FLAG_PUSH);
I->eraseFromParent();
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
@@ -860,7 +821,7 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 1;
case AMDGPU::JUMP_COND: {
MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
- clearFlag(predSet, 0, MO_FLAG_PUSH);
+ clearFlag(*predSet, 0, MO_FLAG_PUSH);
I->eraseFromParent();
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
@@ -876,13 +837,12 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 2;
}
-bool
-R600InstrInfo::isPredicated(const MachineInstr *MI) const {
- int idx = MI->findFirstPredOperandIdx();
+bool R600InstrInfo::isPredicated(const MachineInstr &MI) const {
+ int idx = MI.findFirstPredOperandIdx();
if (idx < 0)
return false;
- unsigned Reg = MI->getOperand(idx).getReg();
+ unsigned Reg = MI.getOperand(idx).getReg();
switch (Reg) {
default: return false;
case AMDGPU::PRED_SEL_ONE:
@@ -892,25 +852,22 @@ R600InstrInfo::isPredicated(const MachineInstr *MI) const {
}
}
-bool
-R600InstrInfo::isPredicable(MachineInstr *MI) const {
+bool R600InstrInfo::isPredicable(MachineInstr &MI) const {
// XXX: KILL* instructions can be predicated, but they must be the last
// instruction in a clause, so this means any instructions after them cannot
// be predicated. Until we have proper support for instruction clauses in the
// backend, we will mark KILL* instructions as unpredicable.
- if (MI->getOpcode() == AMDGPU::KILLGT) {
+ if (MI.getOpcode() == AMDGPU::KILLGT) {
return false;
- } else if (MI->getOpcode() == AMDGPU::CF_ALU) {
+ } else if (MI.getOpcode() == AMDGPU::CF_ALU) {
     // If the clause starts in the middle of the MBB then the MBB has more
     // than a single clause; we are unable to predicate several clauses.
- if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI))
+ if (MI.getParent()->begin() != MachineBasicBlock::iterator(MI))
return false;
     // TODO: We don't support KC merging at the moment.
- if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0)
- return false;
- return true;
- } else if (isVector(*MI)) {
+ return MI.getOperand(3).getImm() == 0 && MI.getOperand(4).getImm() == 0;
+ } else if (isVector(MI)) {
return false;
} else {
return AMDGPUInstrInfo::isPredicable(MI);
@@ -986,48 +943,39 @@ R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) con
return false;
}
-bool
-R600InstrInfo::DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const {
- return isPredicateSetter(MI->getOpcode());
+bool R600InstrInfo::DefinesPredicate(MachineInstr &MI,
+ std::vector<MachineOperand> &Pred) const {
+ return isPredicateSetter(MI.getOpcode());
}
-bool
-R600InstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
- ArrayRef<MachineOperand> Pred2) const {
- return false;
-}
-
-
-bool
-R600InstrInfo::PredicateInstruction(MachineInstr *MI,
- ArrayRef<MachineOperand> Pred) const {
- int PIdx = MI->findFirstPredOperandIdx();
+bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
+ ArrayRef<MachineOperand> Pred) const {
+ int PIdx = MI.findFirstPredOperandIdx();
- if (MI->getOpcode() == AMDGPU::CF_ALU) {
- MI->getOperand(8).setImm(0);
+ if (MI.getOpcode() == AMDGPU::CF_ALU) {
+ MI.getOperand(8).setImm(0);
return true;
}
- if (MI->getOpcode() == AMDGPU::DOT_4) {
- MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X))
+ if (MI.getOpcode() == AMDGPU::DOT_4) {
+ MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_X))
.setReg(Pred[2].getReg());
- MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y))
+ MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Y))
.setReg(Pred[2].getReg());
- MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z))
+ MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Z))
.setReg(Pred[2].getReg());
- MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W))
+ MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_W))
.setReg(Pred[2].getReg());
- MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
return true;
}
if (PIdx != -1) {
- MachineOperand &PMO = MI->getOperand(PIdx);
+ MachineOperand &PMO = MI.getOperand(PIdx);
PMO.setReg(Pred[2].getReg());
- MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
return true;
}
@@ -1035,45 +983,94 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI,
return false;
}
-unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const {
+unsigned int R600InstrInfo::getPredicationCost(const MachineInstr &) const {
return 2;
}
unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
+ const MachineInstr &,
unsigned *PredCost) const {
if (PredCost)
*PredCost = 2;
return 2;
}
-bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
+ unsigned Channel) const {
+ assert(Channel == 0);
+ return RegIndex;
+}
- switch(MI->getOpcode()) {
- default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default: {
+ MachineBasicBlock *MBB = MI.getParent();
+ int OffsetOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::addr);
+ // addr is a custom operand with multiple MI operands, and only the
+ // first MI operand is given a name.
+ int RegOpIdx = OffsetOpIdx + 1;
+ int ChanOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::chan);
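+    // RegisterLoad/RegisterStore pseudos collapse to a plain MOV when they
+    // index off INDIRECT_BASE_ADDR, and otherwise expand to a real indirect
+    // access through the AR_X address register.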
+ if (isRegisterLoad(MI)) {
+ int DstOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
+ unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
+ unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+ unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
+ if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+ buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(),
+ getIndirectAddrRegClass()->getRegister(Address));
+ } else {
+ buildIndirectRead(MBB, MI, MI.getOperand(DstOpIdx).getReg(), Address,
+ OffsetReg);
+ }
+ } else if (isRegisterStore(MI)) {
+ int ValOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::val);
+ unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
+ unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
+ unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+ unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
+ if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+ buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
+ MI.getOperand(ValOpIdx).getReg());
+ } else {
+ buildIndirectWrite(MBB, MI, MI.getOperand(ValOpIdx).getReg(),
+ calculateIndirectAddress(RegIndex, Channel),
+ OffsetReg);
+ }
+ } else {
+ return false;
+ }
+
+ MBB->erase(MI);
+ return true;
+ }
case AMDGPU::R600_EXTRACT_ELT_V2:
case AMDGPU::R600_EXTRACT_ELT_V4:
- buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(),
- RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address
- MI->getOperand(2).getReg(),
- RI.getHWRegChan(MI->getOperand(1).getReg()));
+ buildIndirectRead(MI.getParent(), MI, MI.getOperand(0).getReg(),
+ RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address
+ MI.getOperand(2).getReg(),
+ RI.getHWRegChan(MI.getOperand(1).getReg()));
break;
case AMDGPU::R600_INSERT_ELT_V2:
case AMDGPU::R600_INSERT_ELT_V4:
- buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value
- RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address
- MI->getOperand(3).getReg(), // Offset
- RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel
+ buildIndirectWrite(MI.getParent(), MI, MI.getOperand(2).getReg(), // Value
+ RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address
+ MI.getOperand(3).getReg(), // Offset
+ RI.getHWRegChan(MI.getOperand(1).getReg())); // Channel
break;
}
- MI->eraseFromParent();
+ MI.eraseFromParent();
return true;
}
void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const {
- const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
- MF.getSubtarget().getFrameLowering());
+ const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+ const R600FrameLowering *TFL = ST.getFrameLowering();
unsigned StackWidth = TFL->getStackWidth(MF);
int End = getIndirectIndexEnd(MF);
@@ -1091,13 +1088,6 @@ void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
}
}
-unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
- unsigned Channel) const {
- // XXX: Remove when we support a stack width > 2
- assert(Channel == 0);
- return RegIndex;
-}
-
const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const {
return &AMDGPU::R600_TReg32_XRegClass;
}
@@ -1124,13 +1114,13 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
}
MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
AMDGPU::AR_X, OffsetReg);
- setImmOperand(MOVA, AMDGPU::OpName::write, 0);
+ setImmOperand(*MOVA, AMDGPU::OpName::write, 0);
MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
AddrReg, ValueReg)
.addReg(AMDGPU::AR_X,
RegState::Implicit | RegState::Kill);
- setImmOperand(Mov, AMDGPU::OpName::dst_rel, 1);
+ setImmOperand(*Mov, AMDGPU::OpName::dst_rel, 1);
return Mov;
}
@@ -1157,17 +1147,74 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
AMDGPU::AR_X,
OffsetReg);
- setImmOperand(MOVA, AMDGPU::OpName::write, 0);
+ setImmOperand(*MOVA, AMDGPU::OpName::write, 0);
MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
ValueReg,
AddrReg)
.addReg(AMDGPU::AR_X,
RegState::Implicit | RegState::Kill);
- setImmOperand(Mov, AMDGPU::OpName::src0_rel, 1);
+ setImmOperand(*Mov, AMDGPU::OpName::src0_rel, 1);
return Mov;
}
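+// First register index available for indirect addressing: one past the
+// highest live-in register of the indirect address register class, 0 when
+// there are no live-ins, or -1 when the function has no stack objects.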
+int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ int Offset = -1;
+
+ if (MFI->getNumObjects() == 0) {
+ return -1;
+ }
+
+ if (MRI.livein_empty()) {
+ return 0;
+ }
+
+ const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
+ for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
+ LE = MRI.livein_end();
+ LI != LE; ++LI) {
+ unsigned Reg = LI->first;
+ if (TargetRegisterInfo::isVirtualRegister(Reg) ||
+ !IndirectRC->contains(Reg))
+ continue;
+
+ unsigned RegIndex;
+ unsigned RegEnd;
+ for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd;
+ ++RegIndex) {
+ if (IndirectRC->getRegister(RegIndex) == Reg)
+ break;
+ }
+ Offset = std::max(Offset, (int)RegIndex);
+ }
+
+ return Offset + 1;
+}
+
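+// One past the last register index usable for indirect addressing:
+// effectively the begin index plus the frame size (in indirect-address
+// slots) reported by the frame lowering; -1 when the frame is empty or
+// contains variable-sized objects.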
+int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
+ int Offset = 0;
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Variable-sized objects are not supported.
+ if (MFI->hasVarSizedObjects()) {
+ return -1;
+ }
+
+ if (MFI->getNumObjects() == 0) {
+ return -1;
+ }
+
+ const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+ const R600FrameLowering *TFL = ST.getFrameLowering();
+
+ unsigned IgnoredFrameReg;
+ Offset = TFL->getFrameIndexReference(MF, -1, IgnoredFrameReg);
+
+ return getIndirectIndexBegin(MF) + Offset;
+}
+
unsigned R600InstrInfo::getMaxAlusPerClause() const {
return 115;
}
@@ -1256,7 +1303,7 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
const {
assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
unsigned Opcode;
- if (ST.getGeneration() <= AMDGPUSubtarget::R700)
+ if (ST.getGeneration() <= R600Subtarget::R700)
Opcode = AMDGPU::DOT4_r600;
else
Opcode = AMDGPU::DOT4_eg;
@@ -1293,7 +1340,7 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
MachineOperand &MO = MI->getOperand(
getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot)));
assert (MO.isImm());
- setImmOperand(MIB, Operands[i], MO.getImm());
+ setImmOperand(*MIB, Operands[i], MO.getImm());
}
MIB->getOperand(20).setImm(0);
return MIB;
@@ -1305,7 +1352,7 @@ MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
uint64_t Imm) const {
MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg,
AMDGPU::ALU_LITERAL_X);
- setImmOperand(MovImm, AMDGPU::OpName::literal, Imm);
+ setImmOperand(*MovImm, AMDGPU::OpName::literal, Imm);
return MovImm;
}
@@ -1323,25 +1370,21 @@ int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const {
return AMDGPU::getNamedOperandIdx(Opcode, Op);
}
-void R600InstrInfo::setImmOperand(MachineInstr *MI, unsigned Op,
+void R600InstrInfo::setImmOperand(MachineInstr &MI, unsigned Op,
int64_t Imm) const {
- int Idx = getOperandIdx(*MI, Op);
+ int Idx = getOperandIdx(MI, Op);
assert(Idx != -1 && "Operand not supported for this instruction.");
- assert(MI->getOperand(Idx).isImm());
- MI->getOperand(Idx).setImm(Imm);
+ assert(MI.getOperand(Idx).isImm());
+ MI.getOperand(Idx).setImm(Imm);
}
//===----------------------------------------------------------------------===//
// Instruction flag getters/setters
//===----------------------------------------------------------------------===//
-bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const {
- return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0;
-}
-
-MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
+MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx,
unsigned Flag) const {
- unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
+ unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
int FlagIndex = 0;
if (Flag != 0) {
// If we pass something other than the default value of Flag to this
@@ -1351,20 +1394,26 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3;
switch (Flag) {
case MO_FLAG_CLAMP:
- FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::clamp);
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::clamp);
break;
case MO_FLAG_MASK:
- FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::write);
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::write);
break;
case MO_FLAG_NOT_LAST:
case MO_FLAG_LAST:
- FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::last);
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::last);
break;
case MO_FLAG_NEG:
switch (SrcIdx) {
- case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_neg); break;
- case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_neg); break;
- case 2: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src2_neg); break;
+ case 0:
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_neg);
+ break;
+ case 1:
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_neg);
+ break;
+ case 2:
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src2_neg);
+ break;
}
break;
@@ -1373,8 +1422,12 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
"instructions.");
(void)IsOP3;
switch (SrcIdx) {
- case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_abs); break;
- case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_abs); break;
+ case 0:
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_abs);
+ break;
+ case 1:
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_abs);
+ break;
}
break;
@@ -1389,14 +1442,14 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
"Instruction flags not supported for this instruction");
}
- MachineOperand &FlagOp = MI->getOperand(FlagIndex);
+ MachineOperand &FlagOp = MI.getOperand(FlagIndex);
assert(FlagOp.isImm());
return FlagOp;
}
-void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand,
+void R600InstrInfo::addFlag(MachineInstr &MI, unsigned Operand,
unsigned Flag) const {
- unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
+ unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
if (Flag == 0) {
return;
}
@@ -1415,9 +1468,9 @@ void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand,
}
}
-void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand,
+void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand,
unsigned Flag) const {
- unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
+ unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
if (HAS_NATIVE_OPERANDS(TargetFlags)) {
MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
FlagOp.setImm(0);
@@ -1428,3 +1481,11 @@ void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand,
FlagOp.setImm(InstFlags);
}
}
+
+bool R600InstrInfo::isRegisterStore(const MachineInstr &MI) const {
+ return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE;
+}
+
+bool R600InstrInfo::isRegisterLoad(const MachineInstr &MI) const {
+ return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD;
+}
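
All of the flag helpers now take the MachineInstr by reference, so call sites dereference their iterator or pointer once at the boundary. A small sketch of a hypothetical caller clamping an ALU result (not code from this patch):

    MachineInstr &MI = *MII;             // MII: a MachineBasicBlock::iterator
    TII->addFlag(MI, 0, MO_FLAG_CLAMP);  // flips the clamp operand via getFlagOp()
    assert(TII->isFlagSet(MI, 0, MO_FLAG_CLAMP));
    TII->clearFlag(MI, 0, MO_FLAG_CLAMP);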
diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
index e7251c31107b..feaca98def44 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/lib/Target/AMDGPU/R600InstrInfo.h
@@ -12,30 +12,28 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H
-#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H
#include "AMDGPUInstrInfo.h"
-#include "R600Defines.h"
#include "R600RegisterInfo.h"
-#include <map>
namespace llvm {
-
- class AMDGPUTargetMachine;
- class DFAPacketizer;
- class ScheduleDAG;
- class MachineFunction;
- class MachineInstr;
- class MachineInstrBuilder;
-
- class R600InstrInfo : public AMDGPUInstrInfo {
- private:
+class AMDGPUTargetMachine;
+class DFAPacketizer;
+class MachineFunction;
+class MachineInstr;
+class MachineInstrBuilder;
+class R600Subtarget;
+
+class R600InstrInfo final : public AMDGPUInstrInfo {
+private:
const R600RegisterInfo RI;
+ const R600Subtarget &ST;
- std::vector<std::pair<int, unsigned> >
- ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const;
-
+ std::vector<std::pair<int, unsigned>>
+ ExtractSrcs(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PV,
+ unsigned &ConstCount) const;
MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
@@ -44,11 +42,11 @@ namespace llvm {
unsigned AddrChan) const;
MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg, unsigned Address,
- unsigned OffsetReg,
- unsigned AddrChan) const;
- public:
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg,
+ unsigned AddrChan) const;
+public:
enum BankSwizzle {
ALU_VEC_012_SCL_210 = 0,
ALU_VEC_021_SCL_122,
@@ -58,18 +56,18 @@ namespace llvm {
ALU_VEC_210
};
- explicit R600InstrInfo(const AMDGPUSubtarget &st);
+ explicit R600InstrInfo(const R600Subtarget &);
- const R600RegisterInfo &getRegisterInfo() const override;
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ const R600RegisterInfo &getRegisterInfo() const {
+ return RI;
+ }
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const override;
- bool isTrig(const MachineInstr &MI) const;
- bool isPlaceHolderOpcode(unsigned opcode) const;
bool isReductionOp(unsigned opcode) const;
bool isCubeOp(unsigned opcode) const;
@@ -77,32 +75,28 @@ namespace llvm {
bool isALUInstr(unsigned Opcode) const;
bool hasInstrModifiers(unsigned Opcode) const;
bool isLDSInstr(unsigned Opcode) const;
- bool isLDSNoRetInstr(unsigned Opcode) const;
bool isLDSRetInstr(unsigned Opcode) const;
/// \returns true if this \p Opcode represents an ALU instruction or an
/// instruction that will be lowered in ExpandSpecialInstrs Pass.
- bool canBeConsideredALU(const MachineInstr *MI) const;
+ bool canBeConsideredALU(const MachineInstr &MI) const;
bool isTransOnly(unsigned Opcode) const;
- bool isTransOnly(const MachineInstr *MI) const;
+ bool isTransOnly(const MachineInstr &MI) const;
bool isVectorOnly(unsigned Opcode) const;
- bool isVectorOnly(const MachineInstr *MI) const;
+ bool isVectorOnly(const MachineInstr &MI) const;
bool isExport(unsigned Opcode) const;
bool usesVertexCache(unsigned Opcode) const;
- bool usesVertexCache(const MachineInstr *MI) const;
+ bool usesVertexCache(const MachineInstr &MI) const;
bool usesTextureCache(unsigned Opcode) const;
- bool usesTextureCache(const MachineInstr *MI) const;
+ bool usesTextureCache(const MachineInstr &MI) const;
bool mustBeLastInClause(unsigned Opcode) const;
- bool usesAddressRegister(MachineInstr *MI) const;
- bool definesAddressRegister(MachineInstr *MI) const;
- bool readsLDSSrcReg(const MachineInstr *MI) const;
+ bool usesAddressRegister(MachineInstr &MI) const;
+ bool definesAddressRegister(MachineInstr &MI) const;
+ bool readsLDSSrcReg(const MachineInstr &MI) const;
- /// \returns The operand index for the given source number. Legal values
- /// for SrcNum are 0, 1, and 2.
- int getSrcIdx(unsigned Opcode, unsigned SrcNum) const;
/// \returns The operand Index for the Sel operand given an index to one
/// of the instruction's src operands.
int getSelIdx(unsigned Opcode, unsigned SrcIdx) const;
@@ -113,7 +107,7 @@ namespace llvm {
/// If register is ALU_LITERAL, second member is IMM.
/// Otherwise, second member value is undefined.
SmallVector<std::pair<MachineOperand *, int64_t>, 3>
- getSrcs(MachineInstr *MI) const;
+ getSrcs(MachineInstr &MI) const;
unsigned isLegalUpTo(
const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
@@ -152,89 +146,107 @@ namespace llvm {
/// instruction slots within an instruction group.
bool isVector(const MachineInstr &MI) const;
- bool isMov(unsigned Opcode) const override;
+ bool isMov(unsigned Opcode) const;
DFAPacketizer *
CreateTargetScheduleState(const TargetSubtargetInfo &) const override;
- bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+ bool ReverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const override;
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override;
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
- bool isPredicated(const MachineInstr *MI) const override;
+ bool isPredicated(const MachineInstr &MI) const override;
- bool isPredicable(MachineInstr *MI) const override;
+ bool isPredicable(MachineInstr &MI) const override;
- bool
- isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
- BranchProbability Probability) const override;
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+ BranchProbability Probability) const override;
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
unsigned ExtraPredCycles,
BranchProbability Probability) const override;
- bool
- isProfitableToIfCvt(MachineBasicBlock &TMBB,
- unsigned NumTCycles, unsigned ExtraTCycles,
- MachineBasicBlock &FMBB,
- unsigned NumFCycles, unsigned ExtraFCycles,
- BranchProbability Probability) const override;
-
- bool DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const override;
+ bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+ unsigned NumTCycles, unsigned ExtraTCycles,
+ MachineBasicBlock &FMBB,
+ unsigned NumFCycles, unsigned ExtraFCycles,
+ BranchProbability Probability) const override;
- bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
- ArrayRef<MachineOperand> Pred2) const override;
+ bool DefinesPredicate(MachineInstr &MI,
+ std::vector<MachineOperand> &Pred) const override;
bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
- MachineBasicBlock &FMBB) const override;
+ MachineBasicBlock &FMBB) const override;
- bool PredicateInstruction(MachineInstr *MI,
+ bool PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const override;
- unsigned int getPredicationCost(const MachineInstr *) const override;
+ unsigned int getPredicationCost(const MachineInstr &) const override;
unsigned int getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
+ const MachineInstr &MI,
unsigned *PredCost = nullptr) const override;
- int getInstrLatency(const InstrItineraryData *ItinData,
- SDNode *Node) const override { return 1;}
-
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
/// \brief Reserve the registers that may be accessed using indirect addressing.
void reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const;
- unsigned calculateIndirectAddress(unsigned RegIndex,
- unsigned Channel) const override;
+ /// Calculate the "Indirect Address" for the given \p RegIndex and
+ /// \p Channel
+ ///
+ /// We model indirect addressing using a virtual address space that can be
+ /// accessed with loads and stores. The "Indirect Address" is the memory
+ /// address in this virtual address space that maps to the given \p RegIndex
+ /// and \p Channel.
+ unsigned calculateIndirectAddress(unsigned RegIndex, unsigned Channel) const;
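
For intuition: the body deleted from R600InstrInfo.cpp earlier in this patch asserts Channel == 0 and simply returns RegIndex, i.e. with today's stack width of 1 the mapping is the identity. A plausible generalization once wider stacks are supported (an assumption, not implemented here):

    // With StackWidth channels per register index:
    //   Address = RegIndex * StackWidth + Channel;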
+
+ /// \returns The register class to be used for loading and storing values
+ /// from an "Indirect Address".
+ const TargetRegisterClass *getIndirectAddrRegClass() const;
+
+ /// \returns the smallest register index that will be accessed by an indirect
+ /// read or write, or -1 if indirect addressing is not used by this program.
+ int getIndirectIndexBegin(const MachineFunction &MF) const;
- const TargetRegisterClass *getIndirectAddrRegClass() const override;
+ /// \returns the largest register index that will be accessed by an indirect
+ /// read or write, or -1 if indirect addressing is not used by this program.
+ int getIndirectIndexEnd(const MachineFunction &MF) const;
+ /// \brief Build instruction(s) for an indirect register write.
+ ///
+ /// \returns The instruction that performs the indirect register write
MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg, unsigned Address,
- unsigned OffsetReg) const override;
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg) const;
+ /// \brief Build instruction(s) for an indirect register read.
+ ///
+ /// \returns The instruction that performs the indirect register read
MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
unsigned ValueReg, unsigned Address,
- unsigned OffsetReg) const override;
+ unsigned OffsetReg) const;
unsigned getMaxAlusPerClause() const;
- ///buildDefaultInstruction - This function returns a MachineInstr with
- /// all the instruction modifiers initialized to their default values.
- /// You can use this function to avoid manually specifying each instruction
- /// modifier operand when building a new instruction.
+ /// buildDefaultInstruction - This function returns a MachineInstr with all
+ /// the instruction modifiers initialized to their default values. You can
+ /// use this function to avoid manually specifying each instruction modifier
+ /// operand when building a new instruction.
///
/// \returns a MachineInstr with all the instruction modifiers initialized
/// to their default values.
@@ -251,13 +263,13 @@ namespace llvm {
unsigned DstReg) const;
MachineInstr *buildMovImm(MachineBasicBlock &BB,
- MachineBasicBlock::iterator I,
- unsigned DstReg,
- uint64_t Imm) const;
+ MachineBasicBlock::iterator I,
+ unsigned DstReg,
+ uint64_t Imm) const;
MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
- unsigned DstReg, unsigned SrcReg) const override;
+ unsigned DstReg, unsigned SrcReg) const;
/// \brief Get the index of Op in the MachineInstr.
///
@@ -270,13 +282,10 @@ namespace llvm {
int getOperandIdx(unsigned Opcode, unsigned Op) const;
/// \brief Helper function for setting instruction flag values.
- void setImmOperand(MachineInstr *MI, unsigned Op, int64_t Imm) const;
-
- /// \returns true if this instruction has an operand for storing target flags.
- bool hasFlagOperand(const MachineInstr &MI) const;
+ void setImmOperand(MachineInstr &MI, unsigned Op, int64_t Imm) const;
///\brief Add one of the MO_FLAG* flags to the specified \p Operand.
- void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
+ void addFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
///\brief Determine if the specified \p Flag is set on this \p Operand.
bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const;
@@ -285,11 +294,15 @@ namespace llvm {
/// \param Flag The flag being set.
///
/// \returns the operand containing the flags for this instruction.
- MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0,
+ MachineOperand &getFlagOp(MachineInstr &MI, unsigned SrcIdx = 0,
unsigned Flag = 0) const;
/// \brief Clear the specified flag on the instruction.
- void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
+ void clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
+
+ // Helper functions that check the opcode for status information
+ bool isRegisterStore(const MachineInstr &MI) const;
+ bool isRegisterLoad(const MachineInstr &MI) const;
};
namespace AMDGPU {
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index 33ef6a4e19ea..b6b576d95278 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -15,7 +15,7 @@
include "R600Intrinsics.td"
include "R600InstrFormats.td"
-class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
+class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern = []> :
InstR600 <outs, ins, asm, pattern, NullALU> {
let Namespace = "AMDGPU";
@@ -160,7 +160,8 @@ class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
let Inst{63-32} = Word1;
}
-class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
+class R600_2OP_Helper <bits<11> inst, string opName,
+ SDPatternOperator node = null_frag,
InstrItinClass itin = AnyALU> :
R600_2OP <inst, opName,
[(set R600_Reg32:$dst, (node R600_Reg32:$src0,
@@ -283,7 +284,7 @@ class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask,
}
class VTX_READ <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
- : InstR600ISA <outs, (ins MEMxi:$src_gpr), name, pattern>,
+ : InstR600ISA <outs, (ins MEMxi:$src_gpr), !strconcat(" ", name), pattern>,
VTX_WORD1_GPR {
// Static fields
@@ -328,18 +329,44 @@ class VTX_READ <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
class LoadParamFrag <PatFrag load_type> : PatFrag <
(ops node:$ptr), (load_type node:$ptr),
- [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), 0); }]
+ [{ return isConstantLoad(cast<LoadSDNode>(N), 0) ||
+ (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }]
>;
def load_param : LoadParamFrag<load>;
def load_param_exti8 : LoadParamFrag<az_extloadi8>;
def load_param_exti16 : LoadParamFrag<az_extloadi16>;
-def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">;
+class LoadVtxId1 <PatFrag load> : PatFrag <
+ (ops node:$ptr), (load node:$ptr), [{
+ const MemSDNode *LD = cast<MemSDNode>(N);
+ return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ !isa<GlobalValue>(GetUnderlyingObject(
+ LD->getMemOperand()->getValue(), CurDAG->getDataLayout())));
+}]>;
+
+def vtx_id1_az_extloadi8 : LoadVtxId1 <az_extloadi8>;
+def vtx_id1_az_extloadi16 : LoadVtxId1 <az_extloadi16>;
+def vtx_id1_load : LoadVtxId1 <load>;
+
+class LoadVtxId2 <PatFrag load> : PatFrag <
+ (ops node:$ptr), (load node:$ptr), [{
+ const MemSDNode *LD = cast<MemSDNode>(N);
+ return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ isa<GlobalValue>(GetUnderlyingObject(
+ LD->getMemOperand()->getValue(), CurDAG->getDataLayout()));
+}]>;
+
+def vtx_id2_az_extloadi8 : LoadVtxId2 <az_extloadi8>;
+def vtx_id2_az_extloadi16 : LoadVtxId2 <az_extloadi16>;
+def vtx_id2_load : LoadVtxId2 <load>;
+
+def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">;
def isR600toCayman
: Predicate<
- "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">;
+ "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">;
//===----------------------------------------------------------------------===//
// R600 SDNodes
@@ -407,8 +434,7 @@ def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR,
def INTERP_VEC_LOAD : AMDGPUShaderInst <
(outs R600_Reg128:$dst),
(ins i32imm:$src0),
- "INTERP_LOAD $src0 : $dst",
- [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>;
+ "INTERP_LOAD $src0 : $dst">;
def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
let bank_swizzle = 5;
@@ -474,28 +500,6 @@ class ExportBufWord1 {
}
multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
- def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg),
- (ExportInst
- (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0),
- 0, 61, 0, 7, 7, 7, cf_inst, 0)
- >;
-
- def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg),
- (ExportInst
- (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0),
- 0, 61, 7, 0, 7, 7, cf_inst, 0)
- >;
-
- def : Pat<(int_R600_store_dummy (i32 imm:$type)),
- (ExportInst
- (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0)
- >;
-
- def : Pat<(int_R600_store_dummy 1),
- (ExportInst
- (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0)
- >;
-
def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
(i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)),
(ExportInst R600_Reg128:$src, imm:$type, imm:$base,
@@ -507,22 +511,22 @@ multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
multiclass SteamOutputExportPattern<Instruction ExportInst,
bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> {
// Stream0
- def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+ def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 0), (i32 imm:$mask)),
(ExportInst R600_Reg128:$src, 0, imm:$arraybase,
4095, imm:$mask, buf0inst, 0)>;
// Stream1
- def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+ def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 1), (i32 imm:$mask)),
(ExportInst $src, 0, imm:$arraybase,
4095, imm:$mask, buf1inst, 0)>;
// Stream2
- def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+ def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 2), (i32 imm:$mask)),
(ExportInst $src, 0, imm:$arraybase,
4095, imm:$mask, buf2inst, 0)>;
// Stream3
- def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+ def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 3), (i32 imm:$mask)),
(ExportInst $src, 0, imm:$arraybase,
4095, imm:$mask, buf3inst, 0)>;
@@ -678,7 +682,7 @@ let Predicates = [isR600toCayman] in {
def ADD : R600_2OP_Helper <0x0, "ADD", fadd>;
// Non-IEEE MUL: 0 * anything = 0
-def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>;
+def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE">;
def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>;
// TODO: Do these actually match the regular fmin/fmax behavior?
def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>;
@@ -733,6 +737,7 @@ def SETNE_DX10 : R600_2OP <
[(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))]
>;
+// FIXME: Need combine for AMDGPUfract
def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>;
def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
@@ -758,6 +763,13 @@ def : Pat <
(MOV_IMM_I32 imm:$val)
>;
+def MOV_IMM_GLOBAL_ADDR : MOV_IMM<iPTR, i32imm>;
+def : Pat <
+ (AMDGPUconstdata_ptr tglobaladdr:$addr),
+ (MOV_IMM_GLOBAL_ADDR tglobaladdr:$addr)
+>;
+
def MOV_IMM_F32 : MOV_IMM<f32, f32imm>;
def : Pat <
(fpimm:$val),
@@ -851,7 +863,7 @@ class R600_TEX <bits<11> inst, string opName> :
i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID,
CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z,
CT:$COORD_TYPE_W),
- !strconcat(opName,
+ !strconcat(" ", opName,
" $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, "
"$SRC_GPR.$srcx$srcy$srcz$srcw "
"RID:$RESOURCE_ID SID:$SAMPLER_ID "
@@ -1099,14 +1111,13 @@ class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
// Clamped to maximum.
class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
- inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped
+ inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamp
> {
let Itinerary = TransALU;
}
class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
- inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy
-> {
+ inst, "RECIPSQRT_IEEE", AMDGPUrsq> {
let Itinerary = TransALU;
}
@@ -1135,11 +1146,6 @@ def FNEG_R600 : FNEG<R600_Reg32>;
// FIXME: Should be predicated on unsafe fp math.
multiclass DIV_Common <InstR600 recip_ieee> {
def : Pat<
- (int_AMDGPU_div f32:$src0, f32:$src1),
- (MUL_IEEE $src0, (recip_ieee $src1))
->;
-
-def : Pat<
(fdiv f32:$src0, f32:$src1),
(MUL_IEEE $src0, (recip_ieee $src1))
>;
@@ -1147,12 +1153,6 @@ def : Pat<
def : RcpPat<recip_ieee, f32>;
}
-class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee>
- : Pat <
- (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w),
- (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x))
->;
-
//===----------------------------------------------------------------------===//
// R600 / R700 Instructions
//===----------------------------------------------------------------------===//
@@ -1191,7 +1191,6 @@ let Predicates = [isR600] in {
defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
- def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
@@ -1332,9 +1331,7 @@ def TXD: InstR600 <
(outs R600_Reg128:$dst),
(ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
- "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
- [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2,
- imm:$resourceId, imm:$samplerId, imm:$textureTarget))],
+ "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", [],
NullALU > {
let TEXInst = 1;
}
@@ -1344,10 +1341,7 @@ def TXD_SHADOW: InstR600 <
(ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
"TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
- [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2,
- imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))],
- NullALU
-> {
+ [], NullALU> {
let TEXInst = 1;
}
} // End isPseudo = 1
@@ -1426,8 +1420,7 @@ def TEX_VTX_CONSTBUF :
}
def TEX_VTX_TEXBUF:
- InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr",
- [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>,
+ InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr">,
VTX_WORD1_GPR, VTX_WORD0_eg {
let VC_INST = 0;
@@ -1542,8 +1535,9 @@ let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
//===---------------------------------------------------------------------===//
let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
usesCustomInserter = 1 in {
- def RETURN : ILFormat<(outs), (ins variable_ops),
- "RETURN", [(IL_retflag)]>;
+ def RETURN : ILFormat<(outs), (ins variable_ops),
+ "RETURN", [(AMDGPUendpgm)]
+ >;
}
//===----------------------------------------------------------------------===//
@@ -1729,12 +1723,6 @@ def : DwordAddrPat <i32, R600_Reg32>;
} // End isR600toCayman Predicate
-let Predicates = [isR600] in {
-// Intrinsic patterns
-defm : Expand24IBitOps<MULLO_INT_r600, ADD_INT>;
-defm : Expand24UBitOps<MULLO_UINT_r600, ADD_INT>;
-} // End isR600
-
def getLDSNoRetOp : InstrMapping {
let FilterClass = "R600_LDS_1A1D";
let RowFields = ["BaseOp"];
diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td
index 9681747006d9..a5310e9fd6d0 100644
--- a/lib/Target/AMDGPU/R600Intrinsics.td
+++ b/lib/Target/AMDGPU/R600Intrinsics.td
@@ -11,65 +11,57 @@
//
//===----------------------------------------------------------------------===//
-let TargetPrefix = "R600", isTarget = 1 in {
- class TextureIntrinsicFloatInput :
- Intrinsic<[llvm_v4f32_ty], [
- llvm_v4f32_ty, // Coord
- llvm_i32_ty, // offset_x
- llvm_i32_ty, // offset_y,
- llvm_i32_ty, // offset_z,
- llvm_i32_ty, // resource_id
- llvm_i32_ty, // samplerid
- llvm_i32_ty, // coord_type_x
- llvm_i32_ty, // coord_type_y
- llvm_i32_ty, // coord_type_z
- llvm_i32_ty // coord_type_w
- ], [IntrNoMem]>;
- class TextureIntrinsicInt32Input :
- Intrinsic<[llvm_v4i32_ty], [
- llvm_v4i32_ty, // Coord
- llvm_i32_ty, // offset_x
- llvm_i32_ty, // offset_y,
- llvm_i32_ty, // offset_z,
- llvm_i32_ty, // resource_id
- llvm_i32_ty, // samplerid
- llvm_i32_ty, // coord_type_x
- llvm_i32_ty, // coord_type_y
- llvm_i32_ty, // coord_type_z
- llvm_i32_ty // coord_type_w
- ], [IntrNoMem]>;
+class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [
+ llvm_v4f32_ty, // Coord
+ llvm_i32_ty, // offset_x
+ llvm_i32_ty, // offset_y
+ llvm_i32_ty, // offset_z
+ llvm_i32_ty, // resource_id
+ llvm_i32_ty, // samplerid
+ llvm_i32_ty, // coord_type_x
+ llvm_i32_ty, // coord_type_y
+ llvm_i32_ty, // coord_type_z
+ llvm_i32_ty], // coord_type_w
+ [IntrNoMem]
+>;
- def int_R600_load_input :
- Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_R600_interp_input :
- Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_R600_interp_const :
- Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_R600_interp_xy :
- Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
-def int_R600_interp_zw :
- Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_R600_load_texbuf :
- Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_R600_tex : TextureIntrinsicFloatInput;
- def int_R600_texc : TextureIntrinsicFloatInput;
- def int_R600_txl : TextureIntrinsicFloatInput;
- def int_R600_txlc : TextureIntrinsicFloatInput;
- def int_R600_txb : TextureIntrinsicFloatInput;
- def int_R600_txbc : TextureIntrinsicFloatInput;
- def int_R600_txf : TextureIntrinsicInt32Input;
- def int_R600_ldptr : TextureIntrinsicInt32Input;
- def int_R600_txq : TextureIntrinsicInt32Input;
- def int_R600_ddx : TextureIntrinsicFloatInput;
- def int_R600_ddy : TextureIntrinsicFloatInput;
- def int_R600_store_swizzle :
- Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
- def int_R600_store_stream_output :
- Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
- def int_R600_store_pixel_depth :
- Intrinsic<[], [llvm_float_ty], []>;
- def int_R600_store_pixel_stencil :
- Intrinsic<[], [llvm_float_ty], []>;
- def int_R600_store_dummy :
- Intrinsic<[], [llvm_i32_ty], []>;
-}
+class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [
+ llvm_v4i32_ty, // Coord
+ llvm_i32_ty, // offset_x
+ llvm_i32_ty, // offset_y
+ llvm_i32_ty, // offset_z
+ llvm_i32_ty, // resource_id
+ llvm_i32_ty, // samplerid
+ llvm_i32_ty, // coord_type_x
+ llvm_i32_ty, // coord_type_y
+ llvm_i32_ty, // coord_type_z
+ llvm_i32_ty], // coord_type_w
+ [IntrNoMem]
+>;
+
+let TargetPrefix = "r600", isTarget = 1 in {
+
+def int_r600_store_swizzle :
+ Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []
+>;
+
+def int_r600_store_stream_output : Intrinsic<
+ [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
+>;
+
+def int_r600_tex : TextureIntrinsicFloatInput;
+def int_r600_texc : TextureIntrinsicFloatInput;
+def int_r600_txl : TextureIntrinsicFloatInput;
+def int_r600_txlc : TextureIntrinsicFloatInput;
+def int_r600_txb : TextureIntrinsicFloatInput;
+def int_r600_txbc : TextureIntrinsicFloatInput;
+def int_r600_txf : TextureIntrinsicInt32Input;
+def int_r600_txq : TextureIntrinsicInt32Input;
+def int_r600_ddx : TextureIntrinsicFloatInput;
+def int_r600_ddy : TextureIntrinsicFloatInput;
+
+def int_r600_dot4 : Intrinsic<[llvm_float_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]
+>;
+
+} // End TargetPrefix = "r600", isTarget = 1
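
Since the prefix is now lowercase, each def above surfaces in IR under the llvm.r600.* namespace (TableGen turns int_r600_dot4 into @llvm.r600.dot4, with underscores becoming dots). A one-line sketch of a call to the new dot4 intrinsic, with invented values %a and %b:

    %d = call float @llvm.r600.dot4(<4 x float> %a, <4 x float> %b)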
diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/lib/Target/AMDGPU/R600MachineFunctionInfo.h
index 263561edd30d..04a4436ebe03 100644
--- a/lib/Target/AMDGPU/R600MachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.h
@@ -10,17 +10,16 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H
-#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_R600MACHINEFUNCTIONINFO_H
#include "AMDGPUMachineFunction.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include <vector>
namespace llvm {
-class R600MachineFunctionInfo : public AMDGPUMachineFunction {
+class R600MachineFunctionInfo final : public AMDGPUMachineFunction {
void anchor() override;
public:
R600MachineFunctionInfo(const MachineFunction &MF);
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp
index bcde5fb50dac..db18e5bd1afa 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "R600MachineScheduler.h"
+#include "R600InstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
@@ -26,7 +27,7 @@ using namespace llvm;
void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
DAG = static_cast<ScheduleDAGMILive*>(dag);
- const AMDGPUSubtarget &ST = DAG->MF.getSubtarget<AMDGPUSubtarget>();
+ const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
TII = static_cast<const R600InstrInfo*>(DAG->TII);
TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
VLIW5 = !ST.hasCaymanISA();
@@ -48,8 +49,7 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
QSrc.clear();
}
-static
-unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
+static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
assert (GPRCount && "GPRCount cannot be 0");
return 248 / GPRCount;
}
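
A quick worked example for the constant above (treating 248 as the per-SIMD GPR budget to divide among wavefronts is an assumption; integer division rounds down):

    getWFCountLimitedByGPR(62);  // 248 / 62  == 4 concurrent wavefronts
    getWFCountLimitedByGPR(124); // 248 / 124 == 2
    getWFCountLimitedByGPR(248); // 248 / 248 == 1, a GPR-bound kernel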
@@ -222,75 +222,74 @@ bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
MachineInstr *MI = SU->getInstr();
- if (TII->isTransOnly(MI))
+ if (TII->isTransOnly(*MI))
return AluTrans;
- switch (MI->getOpcode()) {
- case AMDGPU::PRED_X:
- return AluPredX;
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
- return AluT_XYZW;
- case AMDGPU::COPY:
- if (MI->getOperand(1).isUndef()) {
- // MI will become a KILL, don't considers it in scheduling
- return AluDiscarded;
- }
- default:
- break;
- }
-
- // Does the instruction take a whole IG ?
- // XXX: Is it possible to add a helper function in R600InstrInfo that can
- // be used here and in R600PacketizerList::isSoloInstruction() ?
- if(TII->isVector(*MI) ||
- TII->isCubeOp(MI->getOpcode()) ||
- TII->isReductionOp(MI->getOpcode()) ||
- MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
- return AluT_XYZW;
+ switch (MI->getOpcode()) {
+ case AMDGPU::PRED_X:
+ return AluPredX;
+ case AMDGPU::INTERP_PAIR_XY:
+ case AMDGPU::INTERP_PAIR_ZW:
+ case AMDGPU::INTERP_VEC_LOAD:
+ case AMDGPU::DOT_4:
+ return AluT_XYZW;
+ case AMDGPU::COPY:
+ if (MI->getOperand(1).isUndef()) {
+ // MI will become a KILL, don't consider it in scheduling
+ return AluDiscarded;
}
+ default:
+ break;
+ }
- if (TII->isLDSInstr(MI->getOpcode())) {
- return AluT_X;
- }
+ // Does the instruction take a whole IG?
+ // XXX: Is it possible to add a helper function in R600InstrInfo that can
+ // be used here and in R600PacketizerList::isSoloInstruction()?
+ if (TII->isVector(*MI) ||
+ TII->isCubeOp(MI->getOpcode()) ||
+ TII->isReductionOp(MI->getOpcode()) ||
+ MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
+ return AluT_XYZW;
+ }
- // Is the result already assigned to a channel ?
- unsigned DestSubReg = MI->getOperand(0).getSubReg();
- switch (DestSubReg) {
- case AMDGPU::sub0:
- return AluT_X;
- case AMDGPU::sub1:
- return AluT_Y;
- case AMDGPU::sub2:
- return AluT_Z;
- case AMDGPU::sub3:
- return AluT_W;
- default:
- break;
- }
+ if (TII->isLDSInstr(MI->getOpcode())) {
+ return AluT_X;
+ }
- // Is the result already member of a X/Y/Z/W class ?
- unsigned DestReg = MI->getOperand(0).getReg();
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
- regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
- return AluT_X;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
- return AluT_Y;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
- return AluT_Z;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
- return AluT_W;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
- return AluT_XYZW;
-
- // LDS src registers cannot be used in the Trans slot.
- if (TII->readsLDSSrcReg(MI))
- return AluT_XYZW;
-
- return AluAny;
+ // Is the result already assigned to a channel?
+ unsigned DestSubReg = MI->getOperand(0).getSubReg();
+ switch (DestSubReg) {
+ case AMDGPU::sub0:
+ return AluT_X;
+ case AMDGPU::sub1:
+ return AluT_Y;
+ case AMDGPU::sub2:
+ return AluT_Z;
+ case AMDGPU::sub3:
+ return AluT_W;
+ default:
+ break;
+ }
+ // Is the result already a member of an X/Y/Z/W class?
+ unsigned DestReg = MI->getOperand(0).getReg();
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
+ regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
+ return AluT_X;
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
+ return AluT_Y;
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
+ return AluT_Z;
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
+ return AluT_W;
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
+ return AluT_XYZW;
+
+ // LDS src registers cannot be used in the Trans slot.
+ if (TII->readsLDSSrcReg(*MI))
+ return AluT_XYZW;
+
+ return AluAny;
}
int R600SchedStrategy::getInstKind(SUnit* SU) {
@@ -324,9 +323,8 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
It != E; ++It) {
SUnit *SU = *It;
InstructionsGroupCandidate.push_back(SU->getInstr());
- if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)
- && (!AnyALU || !TII->isVectorOnly(SU->getInstr()))
- ) {
+ if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
+ (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) {
InstructionsGroupCandidate.pop_back();
Q.erase((It + 1).base());
return SU;
@@ -350,7 +348,7 @@ void R600SchedStrategy::PrepareNextSlot() {
DEBUG(dbgs() << "New Slot\n");
assert (OccupedSlotsMask && "Slot wasn't filled");
OccupedSlotsMask = 0;
-// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
+// if (HwGen == R600Subtarget::NORTHERN_ISLANDS)
// OccupedSlotsMask |= 16;
InstructionsGroupCandidate.clear();
LoadAlu();
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.h b/lib/Target/AMDGPU/R600MachineScheduler.h
index fc5b95c28e71..16d5d939708c 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.h
+++ b/lib/Target/AMDGPU/R600MachineScheduler.h
@@ -12,20 +12,19 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H
-#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H
-#include "R600InstrInfo.h"
-#include "llvm/ADT/PriorityQueue.h"
#include "llvm/CodeGen/MachineScheduler.h"
-#include "llvm/Support/Debug.h"
using namespace llvm;
namespace llvm {
-class R600SchedStrategy : public MachineSchedStrategy {
+class R600InstrInfo;
+struct R600RegisterInfo;
+class R600SchedStrategy final : public MachineSchedStrategy {
const ScheduleDAGMILive *DAG;
const R600InstrInfo *TII;
const R600RegisterInfo *TRI;
diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 5efb3b9fc20e..ecae27d2233d 100644
--- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -29,6 +29,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -210,9 +211,9 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
(void)Tmp;
SrcVec = DstReg;
}
- Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg)
- .addReg(SrcVec);
- DEBUG(dbgs() << " ->"; Pos->dump(););
+ MachineInstr *NewMI =
+ BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg).addReg(SrcVec);
+ DEBUG(dbgs() << " ->"; NewMI->dump(););
DEBUG(dbgs() << " Updating Swizzle:\n");
for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg),
@@ -224,11 +225,11 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
RSI->Instr->eraseFromParent();
// Update RSI
- RSI->Instr = Pos;
+ RSI->Instr = NewMI;
RSI->RegToChan = UpdatedRegToChan;
RSI->UndefReg = UpdatedUndef;
- return Pos;
+ return NewMI;
}
void R600VectorRegMerger::RemoveMI(MachineInstr *MI) {
@@ -314,8 +315,13 @@ void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) {
}
bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
- TII = static_cast<const R600InstrInfo *>(Fn.getSubtarget().getInstrInfo());
- MRI = &(Fn.getRegInfo());
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
+ const R600Subtarget &ST = Fn.getSubtarget<R600Subtarget>();
+ TII = ST.getInstrInfo();
+ MRI = &Fn.getRegInfo();
+
for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
MBB != MBBe; ++MBB) {
MachineBasicBlock *MB = &*MBB;
@@ -325,10 +331,10 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end();
MII != MIIE; ++MII) {
- MachineInstr *MI = MII;
- if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) {
- if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
- unsigned Reg = MI->getOperand(1).getReg();
+ MachineInstr &MI = *MII;
+ if (MI.getOpcode() != AMDGPU::REG_SEQUENCE) {
+ if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
+ unsigned Reg = MI.getOperand(1).getReg();
for (MachineRegisterInfo::def_instr_iterator
It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end();
It != E; ++It) {
@@ -338,17 +344,17 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
continue;
}
-
- RegSeqInfo RSI(*MRI, MI);
+ RegSeqInfo RSI(*MRI, &MI);
// Are all uses of MI swizzleable?
- unsigned Reg = MI->getOperand(0).getReg();
+ unsigned Reg = MI.getOperand(0).getReg();
if (!areAllUsesSwizzeable(Reg))
continue;
- DEBUG (dbgs() << "Trying to optimize ";
- MI->dump();
- );
+ DEBUG({
+ dbgs() << "Trying to optimize ";
+ MI.dump();
+ });
RegSeqInfo CandidateRSI;
std::vector<std::pair<unsigned, unsigned> > RemapChan;
diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp
index 21269613a305..c84866469ae8 100644
--- a/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -56,15 +56,14 @@ public:
char R600Packetizer::ID = 0;
class R600PacketizerList : public VLIWPacketizerList {
-
private:
const R600InstrInfo *TII;
const R600RegisterInfo &TRI;
bool VLIW5;
bool ConsideredInstUsesAlreadyWrittenVectorElement;
- unsigned getSlot(const MachineInstr *MI) const {
- return TRI.getHWRegChan(MI->getOperand(0).getReg());
+ unsigned getSlot(const MachineInstr &MI) const {
+ return TRI.getHWRegChan(MI.getOperand(0).getReg());
}
/// \returns register to PV chan mapping for bundle/single instructions that
@@ -81,11 +80,11 @@ private:
int LastDstChan = -1;
do {
bool isTrans = false;
- int BISlot = getSlot(&*BI);
+ int BISlot = getSlot(*BI);
if (LastDstChan >= BISlot)
isTrans = true;
LastDstChan = BISlot;
- if (TII->isPredicated(&*BI))
+ if (TII->isPredicated(*BI))
continue;
int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
@@ -95,7 +94,7 @@ private:
continue;
}
unsigned Dst = BI->getOperand(DstIdx).getReg();
- if (isTrans || TII->isTransOnly(&*BI)) {
+ if (isTrans || TII->isTransOnly(*BI)) {
Result[Dst] = AMDGPU::PS;
continue;
}
@@ -129,7 +128,7 @@ private:
return Result;
}
- void substitutePV(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PVs)
+ void substitutePV(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PVs)
const {
unsigned Ops[] = {
AMDGPU::OpName::src0,
@@ -137,23 +136,23 @@ private:
AMDGPU::OpName::src2
};
for (unsigned i = 0; i < 3; i++) {
- int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]);
+ int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]);
if (OperandIdx < 0)
continue;
- unsigned Src = MI->getOperand(OperandIdx).getReg();
+ unsigned Src = MI.getOperand(OperandIdx).getReg();
const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src);
if (It != PVs.end())
- MI->getOperand(OperandIdx).setReg(It->second);
+ MI.getOperand(OperandIdx).setReg(It->second);
}
}
public:
// Ctor.
- R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI)
+ R600PacketizerList(MachineFunction &MF, const R600Subtarget &ST,
+ MachineLoopInfo &MLI)
: VLIWPacketizerList(MF, MLI, nullptr),
- TII(static_cast<const R600InstrInfo *>(
- MF.getSubtarget().getInstrInfo())),
+ TII(ST.getInstrInfo()),
TRI(TII->getRegisterInfo()) {
- VLIW5 = !MF.getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
+ VLIW5 = !ST.hasCaymanISA();
}
// initPacketizerState - initialize some internal flags.
@@ -162,32 +161,30 @@ public:
}
// ignorePseudoInstruction - Ignore bundling of pseudo instructions.
- bool ignorePseudoInstruction(const MachineInstr *MI,
+ bool ignorePseudoInstruction(const MachineInstr &MI,
const MachineBasicBlock *MBB) override {
return false;
}
// isSoloInstruction - return true if instruction MI cannot be packetized
// with any other instruction, which means that MI itself is a packet.
- bool isSoloInstruction(const MachineInstr *MI) override {
- if (TII->isVector(*MI))
+ bool isSoloInstruction(const MachineInstr &MI) override {
+ if (TII->isVector(MI))
return true;
- if (!TII->isALUInstr(MI->getOpcode()))
+ if (!TII->isALUInstr(MI.getOpcode()))
return true;
- if (MI->getOpcode() == AMDGPU::GROUP_BARRIER)
+ if (MI.getOpcode() == AMDGPU::GROUP_BARRIER)
return true;
// XXX: This can be removed once the packetizer properly handles all the
// LDS instruction group restrictions.
- if (TII->isLDSInstr(MI->getOpcode()))
- return true;
- return false;
+ return TII->isLDSInstr(MI.getOpcode());
}
// isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
// together.
bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override {
MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr();
- if (getSlot(MII) == getSlot(MIJ))
+ if (getSlot(*MII) == getSlot(*MIJ))
ConsideredInstUsesAlreadyWrittenVectorElement = true;
// Do MII and MIJ share the same pred_sel?
int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel),
@@ -210,14 +207,12 @@ public:
}
}
- bool ARDef = TII->definesAddressRegister(MII) ||
- TII->definesAddressRegister(MIJ);
- bool ARUse = TII->usesAddressRegister(MII) ||
- TII->usesAddressRegister(MIJ);
- if (ARDef && ARUse)
- return false;
+ bool ARDef =
+ TII->definesAddressRegister(*MII) || TII->definesAddressRegister(*MIJ);
+ bool ARUse =
+ TII->usesAddressRegister(*MII) || TII->usesAddressRegister(*MIJ);
- return true;
+ return !ARDef || !ARUse;
}
// isLegalToPruneDependencies - Is it legal to prune dependece between SUI
@@ -231,7 +226,7 @@ public:
MI->getOperand(LastOp).setImm(Bit);
}
- bool isBundlableWithCurrentPMI(MachineInstr *MI,
+ bool isBundlableWithCurrentPMI(MachineInstr &MI,
const DenseMap<unsigned, unsigned> &PV,
std::vector<R600InstrInfo::BankSwizzle> &BS,
bool &isTransSlot) {
@@ -240,11 +235,14 @@ public:
// Is the dst reg sequence legal?
if (!isTransSlot && !CurrentPacketMIs.empty()) {
- if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) {
- if (ConsideredInstUsesAlreadyWrittenVectorElement &&
+ if (getSlot(MI) <= getSlot(*CurrentPacketMIs.back())) {
+ if (ConsideredInstUsesAlreadyWrittenVectorElement &&
!TII->isVectorOnly(MI) && VLIW5) {
isTransSlot = true;
- DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump(););
+ DEBUG({
+ dbgs() << "Considering as Trans Inst :";
+ MI.dump();
+ });
}
else
return false;
@@ -252,18 +250,18 @@ public:
}
// Are the constant read limitations met?
- CurrentPacketMIs.push_back(MI);
+ CurrentPacketMIs.push_back(&MI);
if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) {
- DEBUG(
+ DEBUG({
dbgs() << "Couldn't pack :\n";
- MI->dump();
+ MI.dump();
dbgs() << "with the following packets :\n";
for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
CurrentPacketMIs[i]->dump();
dbgs() << "\n";
}
dbgs() << "because of Consts read limitations\n";
- );
+ });
CurrentPacketMIs.pop_back();
return false;
}
@@ -271,16 +269,16 @@ public:
// Is there a BankSwizzle set that meets read port limitations?
if (!TII->fitsReadPortLimitations(CurrentPacketMIs,
PV, BS, isTransSlot)) {
- DEBUG(
+ DEBUG({
dbgs() << "Couldn't pack :\n";
- MI->dump();
+ MI.dump();
dbgs() << "with the following packets :\n";
for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
CurrentPacketMIs[i]->dump();
dbgs() << "\n";
}
dbgs() << "because of Read port limitations\n";
- );
+ });
CurrentPacketMIs.pop_back();
return false;
}
@@ -293,9 +291,9 @@ public:
return true;
}
- MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override {
+ MachineBasicBlock::iterator addToPacket(MachineInstr &MI) override {
MachineBasicBlock::iterator FirstInBundle =
- CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front();
+ CurrentPacketMIs.empty() ? &MI : CurrentPacketMIs.front();
const DenseMap<unsigned, unsigned> &PV =
getPreviousVector(FirstInBundle);
std::vector<R600InstrInfo::BankSwizzle> BS;
@@ -308,9 +306,9 @@ public:
AMDGPU::OpName::bank_swizzle);
MI->getOperand(Op).setImm(BS[i]);
}
- unsigned Op = TII->getOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::bank_swizzle);
- MI->getOperand(Op).setImm(BS.back());
+ unsigned Op =
+ TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::bank_swizzle);
+ MI.getOperand(Op).setImm(BS.back());
if (!CurrentPacketMIs.empty())
setIsLastBit(CurrentPacketMIs.back(), 0);
substitutePV(MI, PV);
@@ -320,7 +318,7 @@ public:
}
return It;
}
- endPacket(MI->getParent(), MI);
+ endPacket(MI.getParent(), MI);
if (TII->isTransOnly(MI))
return MI;
return VLIWPacketizerList::addToPacket(MI);
@@ -328,15 +326,20 @@ public:
};
bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
- const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
+ const R600Subtarget &ST = Fn.getSubtarget<R600Subtarget>();
+ const R600InstrInfo *TII = ST.getInstrInfo();
+
MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
// Instantiate the packetizer.
- R600PacketizerList Packetizer(Fn, MLI);
+ R600PacketizerList Packetizer(Fn, ST, MLI);
// DFA state table should not be empty.
assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+ if (Packetizer.getResourceTracker()->getInstrItins()->isEmpty())
+ return false;
+
//
// Loop over all basic blocks and remove KILL pseudo-instructions
// These instructions confuse the dependence analysis. Consider:
@@ -375,7 +378,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
// instruction stream until we find the nearest boundary.
MachineBasicBlock::iterator I = RegionEnd;
for(;I != MBB->begin(); --I, --RemainingCount) {
- if (TII->isSchedulingBoundary(&*std::prev(I), &*MBB, Fn))
+ if (TII->isSchedulingBoundary(*std::prev(I), &*MBB, Fn))
break;
}
I = MBB->begin();
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp
index fb0359cfc651..dfdc602b80cd 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -28,8 +28,8 @@ R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() {
BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
- const R600InstrInfo *TII =
- static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+ const R600InstrInfo *TII = ST.getInstrInfo();
Reserved.set(AMDGPU::ZERO);
Reserved.set(AMDGPU::HALF);
@@ -89,3 +89,10 @@ bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
return true;
}
}
+
+void R600RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ llvm_unreachable("Subroutines not supported yet");
+}
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h
index 4f8a129ce4a6..9dfb3106c6cc 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H
-#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H
#include "AMDGPURegisterInfo.h"
@@ -21,7 +21,7 @@ namespace llvm {
class AMDGPUSubtarget;
-struct R600RegisterInfo : public AMDGPURegisterInfo {
+struct R600RegisterInfo final : public AMDGPURegisterInfo {
RegClassWeight RCW;
R600RegisterInfo();
@@ -31,7 +31,7 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
/// \brief get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
- unsigned getHWRegIndex(unsigned Reg) const override;
+ unsigned getHWRegIndex(unsigned Reg) const;
/// \brief get the register class of the specified type to use in the
/// CFGStructurizer
@@ -40,8 +40,13 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
const RegClassWeight &
getRegClassWeight(const TargetRegisterClass *RC) const override;
- // \returns true if \p Reg can be defined in one ALU caluse and used in another.
+ // \returns true if \p Reg can be defined in one ALU clause and used in
+ // another.
bool isPhysRegLiveAcrossClauses(unsigned Reg) const;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/R600Schedule.td b/lib/Target/AMDGPU/R600Schedule.td
index df62bf85c0ad..70fb46c1a7d6 100644
--- a/lib/Target/AMDGPU/R600Schedule.td
+++ b/lib/Target/AMDGPU/R600Schedule.td
@@ -9,7 +9,7 @@
//
// R600 has a VLIW architecture. On pre-cayman cards there are 5 instruction
// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS
-// slot has been removed.
+// slot has been removed.
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp b/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp
deleted file mode 100644
index 2fc7b02f673f..000000000000
--- a/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp
+++ /dev/null
@@ -1,303 +0,0 @@
-//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass translates tgsi-like texture intrinsics into R600 texture
-/// closer to hardware intrinsics.
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-
-using namespace llvm;
-
-namespace {
-class R600TextureIntrinsicsReplacer :
- public FunctionPass, public InstVisitor<R600TextureIntrinsicsReplacer> {
- static char ID;
-
- Module *Mod;
- Type *FloatType;
- Type *Int32Type;
- Type *V4f32Type;
- Type *V4i32Type;
- FunctionType *TexSign;
- FunctionType *TexQSign;
-
- void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD,
- unsigned SrcSelect[4], unsigned CT[4],
- bool &useShadowVariant) {
- enum TextureTypes {
- TEXTURE_1D = 1,
- TEXTURE_2D,
- TEXTURE_3D,
- TEXTURE_CUBE,
- TEXTURE_RECT,
- TEXTURE_SHADOW1D,
- TEXTURE_SHADOW2D,
- TEXTURE_SHADOWRECT,
- TEXTURE_1D_ARRAY,
- TEXTURE_2D_ARRAY,
- TEXTURE_SHADOW1D_ARRAY,
- TEXTURE_SHADOW2D_ARRAY,
- TEXTURE_SHADOWCUBE,
- TEXTURE_2D_MSAA,
- TEXTURE_2D_ARRAY_MSAA,
- TEXTURE_CUBE_ARRAY,
- TEXTURE_SHADOWCUBE_ARRAY
- };
-
- switch (TextureType) {
- case 0:
- useShadowVariant = false;
- return;
- case TEXTURE_RECT:
- case TEXTURE_1D:
- case TEXTURE_2D:
- case TEXTURE_3D:
- case TEXTURE_CUBE:
- case TEXTURE_1D_ARRAY:
- case TEXTURE_2D_ARRAY:
- case TEXTURE_CUBE_ARRAY:
- case TEXTURE_2D_MSAA:
- case TEXTURE_2D_ARRAY_MSAA:
- useShadowVariant = false;
- break;
- case TEXTURE_SHADOW1D:
- case TEXTURE_SHADOW2D:
- case TEXTURE_SHADOWRECT:
- case TEXTURE_SHADOW1D_ARRAY:
- case TEXTURE_SHADOW2D_ARRAY:
- case TEXTURE_SHADOWCUBE:
- case TEXTURE_SHADOWCUBE_ARRAY:
- useShadowVariant = true;
- break;
- default:
- llvm_unreachable("Unknow Texture Type");
- }
-
- if (TextureType == TEXTURE_RECT ||
- TextureType == TEXTURE_SHADOWRECT) {
- CT[0] = 0;
- CT[1] = 0;
- }
-
- if (TextureType == TEXTURE_CUBE_ARRAY ||
- TextureType == TEXTURE_SHADOWCUBE_ARRAY)
- CT[2] = 0;
-
- if (TextureType == TEXTURE_1D_ARRAY ||
- TextureType == TEXTURE_SHADOW1D_ARRAY) {
- if (hasLOD && useShadowVariant) {
- CT[1] = 0;
- } else {
- CT[2] = 0;
- SrcSelect[2] = 1;
- }
- } else if (TextureType == TEXTURE_2D_ARRAY ||
- TextureType == TEXTURE_SHADOW2D_ARRAY) {
- CT[2] = 0;
- }
-
- if ((TextureType == TEXTURE_SHADOW1D ||
- TextureType == TEXTURE_SHADOW2D ||
- TextureType == TEXTURE_SHADOWRECT ||
- TextureType == TEXTURE_SHADOW1D_ARRAY) &&
- !(hasLOD && useShadowVariant))
- SrcSelect[3] = 2;
- }
-
- void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name,
- unsigned SrcSelect[4], Value *Offset[3], Value *Resource,
- Value *Sampler, unsigned CT[4], Value *Coord) {
- IRBuilder<> Builder(&I);
- Constant *Mask[] = {
- ConstantInt::get(Int32Type, SrcSelect[0]),
- ConstantInt::get(Int32Type, SrcSelect[1]),
- ConstantInt::get(Int32Type, SrcSelect[2]),
- ConstantInt::get(Int32Type, SrcSelect[3])
- };
- Value *SwizzleMask = ConstantVector::get(Mask);
- Value *SwizzledCoord =
- Builder.CreateShuffleVector(Coord, Coord, SwizzleMask);
-
- Value *Args[] = {
- SwizzledCoord,
- Offset[0],
- Offset[1],
- Offset[2],
- Resource,
- Sampler,
- ConstantInt::get(Int32Type, CT[0]),
- ConstantInt::get(Int32Type, CT[1]),
- ConstantInt::get(Int32Type, CT[2]),
- ConstantInt::get(Int32Type, CT[3])
- };
-
- Function *F = Mod->getFunction(Name);
- if (!F) {
- F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod);
- F->addFnAttr(Attribute::ReadNone);
- }
- I.replaceAllUsesWith(Builder.CreateCall(F, Args));
- I.eraseFromParent();
- }
-
- void ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT,
- const char *VanillaInt,
- const char *ShadowInt) {
- Value *Coord = I.getArgOperand(0);
- Value *ResourceId = I.getArgOperand(1);
- Value *SamplerId = I.getArgOperand(2);
-
- unsigned TextureType =
- cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
-
- unsigned SrcSelect[4] = { 0, 1, 2, 3 };
- unsigned CT[4] = {1, 1, 1, 1};
- Value *Offset[3] = {
- ConstantInt::get(Int32Type, 0),
- ConstantInt::get(Int32Type, 0),
- ConstantInt::get(Int32Type, 0)
- };
- bool useShadowVariant;
-
- getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT,
- useShadowVariant);
-
- ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect,
- Offset, ResourceId, SamplerId, CT, Coord);
- }
-
- void ReplaceTXF(CallInst &I) {
- Value *Coord = I.getArgOperand(0);
- Value *ResourceId = I.getArgOperand(4);
- Value *SamplerId = I.getArgOperand(5);
-
- unsigned TextureType =
- cast<ConstantInt>(I.getArgOperand(6))->getZExtValue();
-
- unsigned SrcSelect[4] = { 0, 1, 2, 3 };
- unsigned CT[4] = {1, 1, 1, 1};
- Value *Offset[3] = {
- I.getArgOperand(1),
- I.getArgOperand(2),
- I.getArgOperand(3),
- };
- bool useShadowVariant;
-
- getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT,
- useShadowVariant);
-
- ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect,
- Offset, ResourceId, SamplerId, CT, Coord);
- }
-
-public:
- R600TextureIntrinsicsReplacer():
- FunctionPass(ID) {
- }
-
- bool doInitialization(Module &M) override {
- LLVMContext &Ctx = M.getContext();
- Mod = &M;
- FloatType = Type::getFloatTy(Ctx);
- Int32Type = Type::getInt32Ty(Ctx);
- V4f32Type = VectorType::get(FloatType, 4);
- V4i32Type = VectorType::get(Int32Type, 4);
- Type *ArgsType[] = {
- V4f32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- };
- TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false);
- Type *ArgsQType[] = {
- V4i32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- };
- TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false);
- return false;
- }
-
- bool runOnFunction(Function &F) override {
- visit(F);
- return false;
- }
-
- const char *getPassName() const override {
- return "R600 Texture Intrinsics Replacer";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- }
-
- void visitCallInst(CallInst &I) {
- if (!I.getCalledFunction())
- return;
-
- StringRef Name = I.getCalledFunction()->getName();
- if (Name == "llvm.AMDGPU.tex") {
- ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc");
- return;
- }
- if (Name == "llvm.AMDGPU.txl") {
- ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txl", "llvm.R600.txlc");
- return;
- }
- if (Name == "llvm.AMDGPU.txb") {
- ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txb", "llvm.R600.txbc");
- return;
- }
- if (Name == "llvm.AMDGPU.txf") {
- ReplaceTXF(I);
- return;
- }
- if (Name == "llvm.AMDGPU.txq") {
- ReplaceTexIntrinsic(I, false, TexQSign, "llvm.R600.txq", "llvm.R600.txq");
- return;
- }
- if (Name == "llvm.AMDGPU.ddx") {
- ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddx", "llvm.R600.ddx");
- return;
- }
- if (Name == "llvm.AMDGPU.ddy") {
- ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddy", "llvm.R600.ddy");
- return;
- }
- }
-
-};
-
-char R600TextureIntrinsicsReplacer::ID = 0;
-
-}
-
-FunctionPass *llvm::createR600TextureIntrinsicsReplacer() {
- return new R600TextureIntrinsicsReplacer();
-}
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index fa4d24a2f25a..5f182c5304c6 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -14,6 +14,7 @@
#include "AMDGPU.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
@@ -34,17 +35,16 @@ typedef std::pair<BasicBlock *, Value *> StackEntry;
typedef SmallVector<StackEntry, 16> StackVector;
// Intrinsic names the control flow is annotated with
-static const char *const IfIntrinsic = "llvm.SI.if";
-static const char *const ElseIntrinsic = "llvm.SI.else";
-static const char *const BreakIntrinsic = "llvm.SI.break";
-static const char *const IfBreakIntrinsic = "llvm.SI.if.break";
-static const char *const ElseBreakIntrinsic = "llvm.SI.else.break";
-static const char *const LoopIntrinsic = "llvm.SI.loop";
-static const char *const EndCfIntrinsic = "llvm.SI.end.cf";
+static const char *const IfIntrinsic = "llvm.amdgcn.if";
+static const char *const ElseIntrinsic = "llvm.amdgcn.else";
+static const char *const BreakIntrinsic = "llvm.amdgcn.break";
+static const char *const IfBreakIntrinsic = "llvm.amdgcn.if.break";
+static const char *const ElseBreakIntrinsic = "llvm.amdgcn.else.break";
+static const char *const LoopIntrinsic = "llvm.amdgcn.loop";
+static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf";
class SIAnnotateControlFlow : public FunctionPass {
-
- static char ID;
+ DivergenceAnalysis *DA;
Type *Boolean;
Type *Void;
@@ -69,6 +69,8 @@ class SIAnnotateControlFlow : public FunctionPass {
LoopInfo *LI;
+ bool isUniform(BranchInst *T);
+
bool isTopOfStack(BasicBlock *BB);
Value *popSaved();
@@ -83,13 +85,16 @@ class SIAnnotateControlFlow : public FunctionPass {
void insertElse(BranchInst *Term);
- Value *handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L);
+ Value *handleLoopCondition(Value *Cond, PHINode *Broken,
+ llvm::Loop *L, BranchInst *Term);
void handleLoop(BranchInst *Term);
void closeControlFlow(BasicBlock *BB);
public:
+ static char ID;
+
SIAnnotateControlFlow():
FunctionPass(ID) { }
@@ -104,6 +109,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<DivergenceAnalysis>();
AU.addPreserved<DominatorTreeWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
@@ -112,6 +118,12 @@ public:
} // end anonymous namespace
+INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
+ "Annotate SI Control Flow", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
+ "Annotate SI Control Flow", false, false)
+
char SIAnnotateControlFlow::ID = 0;
/// \brief Initialize all the types and constants used in the pass
@@ -152,6 +164,13 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
return false;
}
+/// \brief Is the branch condition uniform or did the StructurizeCFG pass
+/// consider it as such?
+bool SIAnnotateControlFlow::isUniform(BranchInst *T) {
+ return DA->isUniform(T->getCondition()) ||
+ T->getMetadata("structurizecfg.uniform") != nullptr;
+}
+
/// \brief Is BB the last block saved on the stack?
bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) {
return !Stack.empty() && Stack.back().first == BB;
@@ -194,6 +213,9 @@ void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
/// \brief Open a new "If" block
void SIAnnotateControlFlow::openIf(BranchInst *Term) {
+ if (isUniform(Term)) {
+ return;
+ }
Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
@@ -201,6 +223,9 @@ void SIAnnotateControlFlow::openIf(BranchInst *Term) {
/// \brief Close the last "If" block and open a new "Else" block
void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
+ if (isUniform(Term)) {
+ return;
+ }
Value *Ret = CallInst::Create(Else, popSaved(), "", Term);
Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
@@ -208,7 +233,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
/// \brief Recursively handle the condition leading to a loop
Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
- llvm::Loop *L) {
+ llvm::Loop *L, BranchInst *Term) {
// Only search through PHI nodes which are inside the loop. If we try this
// with PHI nodes that are outside of the loop, we end up inserting new PHI
@@ -232,7 +257,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
}
Phi->setIncomingValue(i, BoolFalse);
- Value *PhiArg = handleLoopCondition(Incoming, Broken, L);
+ Value *PhiArg = handleLoopCondition(Incoming, Broken, L, Term);
NewPhi->addIncoming(PhiArg, From);
}
@@ -246,7 +271,23 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
BasicBlock *From = Phi->getIncomingBlock(i);
if (From == IDom) {
+ // We're in the following situation:
+ //   IDom/From
+ //      |   \
+ //      |    If-block
+ //      |   /
+ //     Parent
+ // where we want to break out of the loop if the If-block is not taken.
+ // Due to the depth-first traversal, there should be an end.cf
+ // intrinsic in Parent, and we insert an else.break before it.
+ //
+ // Note that the end.cf need not be the first non-phi instruction
+ // of parent, particularly when we're dealing with a multi-level
+ // break, but it should occur within a group of intrinsic calls
+ // at the beginning of the block.
CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
+ while (OldEnd && OldEnd->getCalledFunction() != EndCf)
+ OldEnd = dyn_cast<CallInst>(OldEnd->getNextNode());
if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
Value *Args[] = { OldEnd->getArgOperand(0), NewPhi };
Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
@@ -271,14 +312,23 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
Value *Args[] = { Cond, Broken };
return CallInst::Create(IfBreak, Args, "", Insert);
+ // Insert IfBreak before TERM for constant COND.
+ } else if (isa<ConstantInt>(Cond)) {
+ Value *Args[] = { Cond, Broken };
+ return CallInst::Create(IfBreak, Args, "", Term);
+
} else {
llvm_unreachable("Unhandled loop condition!");
}
- return 0;
+ return nullptr;
}
/// \brief Handle a back edge (loop)
void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
+ if (isUniform(Term)) {
+ return;
+ }
+
BasicBlock *BB = Term->getParent();
llvm::Loop *L = LI->getLoopFor(BB);
BasicBlock *Target = Term->getSuccessor(1);
@@ -286,7 +336,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
Value *Cond = Term->getCondition();
Term->setCondition(BoolTrue);
- Value *Arg = handleLoopCondition(Cond, Broken, L);
+ Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
PI != PE; ++PI) {
@@ -300,6 +350,8 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
llvm::Loop *L = LI->getLoopFor(BB);
+ assert(Stack.back().first == BB);
+
if (L && L->getHeader() == BB) {
// We can't insert an EndCF call into a loop header, because it will
// get executed on every iteration of the loop, when it should be
@@ -315,14 +367,18 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
}
- CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt());
+ Value *Exec = popSaved();
+ if (!isa<UndefValue>(Exec))
+ CallInst::Create(EndCf, Exec, "", &*BB->getFirstInsertionPt());
}
/// \brief Annotate the control flow with intrinsics so the backend can
/// recognize if/then/else and loops.
bool SIAnnotateControlFlow::runOnFunction(Function &F) {
+
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DA = &getAnalysis<DivergenceAnalysis>();
for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
E = df_end(&F.getEntryBlock()); I != E; ++I) {
@@ -332,12 +388,14 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
if (!Term || Term->isUnconditional()) {
if (isTopOfStack(*I))
closeControlFlow(*I);
+
continue;
}
if (I.nodeVisited(Term->getSuccessor(1))) {
if (isTopOfStack(*I))
closeControlFlow(*I);
+
handleLoop(Term);
continue;
}
diff --git a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
new file mode 100644
index 000000000000..65ceff3930ac
--- /dev/null
+++ b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
@@ -0,0 +1,96 @@
+//===--- SIDebuggerInsertNops.cpp - Inserts nops for debugger usage -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Inserts one nop instruction for each high level source statement for
+/// debugger usage.
+///
+/// Tools, such as a debugger, need to pause execution based on user input
+/// (e.g. a breakpoint). In order to do this, one nop instruction is inserted
+/// before the first ISA instruction of each high level source statement.
+/// Further, the debugger may replace nop instructions with trap instructions
+/// based on user input.
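+///
+/// For example (illustrative): if three machine instructions map to source
+/// line 42, a single "s_nop 0" is emitted before the first of them, and the
+/// debugger may later rewrite that nop with a trap to form a breakpoint.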
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "si-debugger-insert-nops"
+#define PASS_NAME "SI Debugger Insert Nops"
+
+namespace {
+
+class SIDebuggerInsertNops : public MachineFunctionPass {
+public:
+ static char ID;
+
+ SIDebuggerInsertNops() : MachineFunctionPass(ID) { }
+ const char *getPassName() const override { return PASS_NAME; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // anonymous namespace
+
+INITIALIZE_PASS(SIDebuggerInsertNops, DEBUG_TYPE, PASS_NAME, false, false)
+
+char SIDebuggerInsertNops::ID = 0;
+char &llvm::SIDebuggerInsertNopsID = SIDebuggerInsertNops::ID;
+
+FunctionPass *llvm::createSIDebuggerInsertNopsPass() {
+ return new SIDebuggerInsertNops();
+}
+
+bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
+ // Skip this pass if "amdgpu-debugger-insert-nops" attribute was not
+ // specified.
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ if (!ST.debuggerInsertNops())
+ return false;
+
+ // Skip machine functions without debug info.
+ if (!MF.getMMI().hasDebugInfo())
+ return false;
+
+ // Target instruction info.
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ // Set of line numbers that already have a nop inserted.
+ DenseSet<unsigned> NopInserted;
+
+ for (auto &MBB : MF) {
+ for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
+ // Skip DBG_VALUE instructions and instructions without location.
+ if (MI->isDebugValue() || !MI->getDebugLoc())
+ continue;
+
+ // Insert a nop if this line number does not have one inserted yet.
+ auto DL = MI->getDebugLoc();
+ if (NopInserted.find(DL.getLine()) == NopInserted.end()) {
+ BuildMI(MBB, *MI, DL, TII->get(AMDGPU::S_NOP))
+ .addImm(0);
+ NopInserted.insert(DL.getLine());
+ }
+ }
+ }
+
+ return true;
+}
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index aa1e352ed748..54efdc0a0466 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -10,8 +10,8 @@
#include "llvm/MC/MCInstrDesc.h"
-#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H
-#define LLVM_LIB_TARGET_R600_SIDEFINES_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
+#define LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
namespace SIInstrFlags {
// This needs to be kept in sync with the field bits in InstSI.
@@ -29,16 +29,19 @@ enum {
VOP2 = 1 << 11,
VOP3 = 1 << 12,
VOPC = 1 << 13,
+ SDWA = 1 << 14,
+ DPP = 1 << 15,
- MUBUF = 1 << 14,
- MTBUF = 1 << 15,
- SMRD = 1 << 16,
- DS = 1 << 17,
- MIMG = 1 << 18,
- FLAT = 1 << 19,
- WQM = 1 << 20,
- VGPRSpill = 1 << 21,
- VOPAsmPrefer32Bit = 1 << 22
+ MUBUF = 1 << 16,
+ MTBUF = 1 << 17,
+ SMRD = 1 << 18,
+ DS = 1 << 19,
+ MIMG = 1 << 20,
+ FLAT = 1 << 21,
+ WQM = 1 << 22,
+ VGPRSpill = 1 << 23,
+ VOPAsmPrefer32Bit = 1 << 24,
+ Gather4 = 1 << 25
};
}
@@ -46,9 +49,14 @@ namespace llvm {
namespace AMDGPU {
enum OperandType {
/// Operand with register or 32-bit immediate
- OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET,
+ OPERAND_REG_IMM32 = MCOI::OPERAND_FIRST_TARGET,
/// Operand with register or inline constant
- OPERAND_REG_INLINE_C
+ OPERAND_REG_INLINE_C,
+
+ /// Operand with 32-bit immediate that uses the constant bus. The standard
+ /// OPERAND_IMMEDIATE should be used for special immediates such as source
+ /// modifiers.
+ OPERAND_KIMM32
};
}
}
@@ -77,10 +85,13 @@ namespace SIInstrFlags {
};
}
+// Input operand modifier bit-masks.
+// NEG and SEXT share the same bit-mask because they can't be set
+// simultaneously.
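+// For example (illustrative, not part of this change): a source modifier of
+// (NEG | ABS) on a floating-point operand selects -|x|, while on an integer
+// operand bit 0 is read as SEXT rather than NEG.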
namespace SISrcMods {
enum {
- NEG = 1 << 0,
- ABS = 1 << 1
+ NEG = 1 << 0, // Floating-point negate modifier
+ ABS = 1 << 1, // Floating-point absolute modifier
+ SEXT = 1 << 0 // Integer sign-extend modifier
};
}
@@ -93,6 +104,109 @@ namespace SIOutMods {
};
}
+namespace llvm {
+namespace AMDGPU {
+namespace EncValues { // Encoding values of enum9/8/7 operands
+
+enum {
+ SGPR_MIN = 0,
+ SGPR_MAX = 101,
+ TTMP_MIN = 112,
+ TTMP_MAX = 123,
+ INLINE_INTEGER_C_MIN = 128,
+ INLINE_INTEGER_C_POSITIVE_MAX = 192, // encodes the integer 64
+ INLINE_INTEGER_C_MAX = 208,
+ INLINE_FLOATING_C_MIN = 240,
+ INLINE_FLOATING_C_MAX = 248,
+ LITERAL_CONST = 255,
+ VGPR_MIN = 256,
+ VGPR_MAX = 511
+};
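+
+// Worked example (illustrative only, not used by this change): a src operand
+// value of 263 decodes as VGPR_MIN + 7, i.e. v7; 135 decodes as
+// INLINE_INTEGER_C_MIN + 7, i.e. the inline integer constant 7; values in
+// (INLINE_INTEGER_C_POSITIVE_MAX, INLINE_INTEGER_C_MAX] map to -1..-16.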
+
+} // namespace EncValues
+} // namespace AMDGPU
+} // namespace llvm
+
+namespace llvm {
+namespace AMDGPU {
+namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns.
+
+enum Id { // Message ID, width(4) [3:0].
+ ID_UNKNOWN_ = -1,
+ ID_INTERRUPT = 1,
+ ID_GS,
+ ID_GS_DONE,
+ ID_SYSMSG = 15,
+ ID_GAPS_LAST_, // Indicates that the sequence has gaps.
+ ID_GAPS_FIRST_ = ID_INTERRUPT,
+ ID_SHIFT_ = 0,
+ ID_WIDTH_ = 4,
+ ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
+};
+
+enum Op { // Both GS and SYS operation IDs.
+ OP_UNKNOWN_ = -1,
+ OP_SHIFT_ = 4,
+ // width(2) [5:4]
+ OP_GS_NOP = 0,
+ OP_GS_CUT,
+ OP_GS_EMIT,
+ OP_GS_EMIT_CUT,
+ OP_GS_LAST_,
+ OP_GS_FIRST_ = OP_GS_NOP,
+ OP_GS_WIDTH_ = 2,
+ OP_GS_MASK_ = (((1 << OP_GS_WIDTH_) - 1) << OP_SHIFT_),
+ // width(3) [6:4]
+ OP_SYS_ECC_ERR_INTERRUPT = 1,
+ OP_SYS_REG_RD,
+ OP_SYS_HOST_TRAP_ACK,
+ OP_SYS_TTRACE_PC,
+ OP_SYS_LAST_,
+ OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT,
+ OP_SYS_WIDTH_ = 3,
+ OP_SYS_MASK_ = (((1 << OP_SYS_WIDTH_) - 1) << OP_SHIFT_)
+};
+
+enum StreamId { // Stream ID, width(2) [9:8].
+ STREAM_ID_DEFAULT_ = 0,
+ STREAM_ID_LAST_ = 4,
+ STREAM_ID_FIRST_ = STREAM_ID_DEFAULT_,
+ STREAM_ID_SHIFT_ = 8,
+ STREAM_ID_WIDTH_ = 2,
+ STREAM_ID_MASK_ = (((1 << STREAM_ID_WIDTH_) - 1) << STREAM_ID_SHIFT_)
+};
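+
+// Worked example (illustrative only): a GS "emit" message on the default
+// stream encodes as
+//   SIMM16 = ID_GS | (OP_GS_EMIT << OP_SHIFT_)
+//         | (STREAM_ID_DEFAULT_ << STREAM_ID_SHIFT_) == 0x22.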
+
+} // namespace SendMsg
+
+namespace Hwreg { // Encoding of SIMM16 used in s_setreg/getreg* insns.
+
+enum Id { // HwRegCode, (6) [5:0]
+ ID_UNKNOWN_ = -1,
+ ID_SYMBOLIC_FIRST_ = 1, // There are corresponding symbolic names defined.
+ ID_SYMBOLIC_LAST_ = 8,
+ ID_SHIFT_ = 0,
+ ID_WIDTH_ = 6,
+ ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
+};
+
+enum Offset { // Offset, (5) [10:6]
+ OFFSET_DEFAULT_ = 0,
+ OFFSET_SHIFT_ = 6,
+ OFFSET_WIDTH_ = 5,
+ OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_)
+};
+
+enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
+ WIDTH_M1_DEFAULT_ = 31,
+ WIDTH_M1_SHIFT_ = 11,
+ WIDTH_M1_WIDTH_ = 5,
+ WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_)
+};
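+
+// Worked example (illustrative only): reading 6 bits starting at bit 0 of
+// hardware register id 4 encodes as
+//   SIMM16 = 4 | (0 << OFFSET_SHIFT_) | ((6 - 1) << WIDTH_M1_SHIFT_) == 0x2804.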
+
+} // namespace Hwreg
+} // namespace AMDGPU
+} // namespace llvm
+
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C
#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8)
@@ -134,7 +248,7 @@ namespace SIOutMods {
#define C_00B84C_LDS_SIZE 0xFF007FFF
#define S_00B84C_EXCP_EN(x) (((x) & 0x7F) << 24)
#define G_00B84C_EXCP_EN(x) (((x) >> 24) & 0x7F)
-#define C_00B84C_EXCP_EN
+#define C_00B84C_EXCP_EN
#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
#define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0
@@ -194,5 +308,7 @@ namespace SIOutMods {
#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+#define R_SPILLED_SGPRS 0x4
+#define R_SPILLED_VGPRS 0x8
#endif
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index f59d9948f98e..9e0086b79087 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -77,7 +77,7 @@
using namespace llvm;
-#define DEBUG_TYPE "sgpr-copies"
+#define DEBUG_TYPE "si-fix-sgpr-copies"
namespace {
@@ -237,11 +237,10 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
}
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
SmallVector<MachineInstr *, 16> Worklist;
diff --git a/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp
deleted file mode 100644
index 8bda283f0fca..000000000000
--- a/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file SALU instructions ignore the execution mask, so we need to modify the
-/// live ranges of the registers they define in some cases.
-///
-/// The main case we need to handle is when a def is used in one side of a
-/// branch and not another. For example:
-///
-/// %def
-/// IF
-/// ...
-/// ...
-/// ELSE
-/// %use
-/// ...
-/// ENDIF
-///
-/// Here we need the register allocator to avoid assigning any of the defs
-/// inside of the IF to the same register as %def. In traditional live
-/// interval analysis %def is not live inside the IF branch, however, since
-/// SALU instructions inside of IF will be executed even if the branch is not
-/// taken, there is the chance that one of the instructions will overwrite the
-/// value of %def, so the use in ELSE will see the wrong value.
-///
-/// The strategy we use for solving this is to add an extra use after the ENDIF:
-///
-/// %def
-/// IF
-/// ...
-/// ...
-/// ELSE
-/// %use
-/// ...
-/// ENDIF
-/// %use
-///
-/// Adding this use will make the def live throughout the IF branch, which is
-/// what we want.
-
-#include "AMDGPU.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/LiveVariables.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-fix-sgpr-live-ranges"
-
-namespace {
-
-class SIFixSGPRLiveRanges : public MachineFunctionPass {
-public:
- static char ID;
-
-public:
- SIFixSGPRLiveRanges() : MachineFunctionPass(ID) {
- initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- const char *getPassName() const override {
- return "SI Fix SGPR live ranges";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LiveVariables>();
- AU.addPreserved<LiveVariables>();
-
- AU.addRequired<MachinePostDominatorTree>();
- AU.addPreserved<MachinePostDominatorTree>();
- AU.setPreservesCFG();
-
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE,
- "SI Fix SGPR Live Ranges", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveVariables)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
-INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE,
- "SI Fix SGPR Live Ranges", false, false)
-
-char SIFixSGPRLiveRanges::ID = 0;
-
-char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID;
-
-FunctionPass *llvm::createSIFixSGPRLiveRangesPass() {
- return new SIFixSGPRLiveRanges();
-}
-
-bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
- const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
- bool MadeChange = false;
-
- MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>();
- SmallVector<unsigned, 16> SGPRLiveRanges;
-
- LiveVariables *LV = &getAnalysis<LiveVariables>();
- MachineBasicBlock *Entry = &MF.front();
-
- // Use a depth first order so that in SSA, we encounter all defs before
- // uses. Once the defs of the block have been found, attempt to insert
- // SGPR_USE instructions in successor blocks if required.
- for (MachineBasicBlock *MBB : depth_first(Entry)) {
- for (const MachineInstr &MI : *MBB) {
- for (const MachineOperand &MO : MI.defs()) {
- // We should never see a live out def of a physical register, so we also
- // do not need to worry about implicit_defs().
- unsigned Def = MO.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Def)) {
- if (TRI->isSGPRClass(MRI.getRegClass(Def))) {
- // Only consider defs that are live outs. We don't care about def /
- // use within the same block.
-
- // LiveVariables does not consider registers that are only used in a
- // phi in a sucessor block as live out, unlike LiveIntervals.
- //
- // This is OK because SIFixSGPRCopies replaced any SGPR phis with
- // VGPRs.
- if (LV->isLiveOut(Def, *MBB))
- SGPRLiveRanges.push_back(Def);
- }
- }
- }
- }
-
- if (MBB->succ_size() < 2)
- continue;
-
- // We have structured control flow, so the number of successors should be
- // two.
- assert(MBB->succ_size() == 2);
- MachineBasicBlock *SuccA = *MBB->succ_begin();
- MachineBasicBlock *SuccB = *(++MBB->succ_begin());
- MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB);
-
- if (!NCD)
- continue;
-
- MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator();
-
- if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) {
- assert(NCD->succ_size() == 2);
- // We want to make sure we insert the Use after the ENDIF, not after
- // the ELSE.
- NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(),
- *(++NCD->succ_begin()));
- }
-
- for (unsigned Reg : SGPRLiveRanges) {
- // FIXME: We could be smarter here. If the register is Live-In to one
- // block, but the other doesn't have any SGPR defs, then there won't be a
- // conflict. Also, if the branch condition is uniform then there will be
- // no conflict.
- bool LiveInToA = LV->isLiveIn(Reg, *SuccA);
- bool LiveInToB = LV->isLiveIn(Reg, *SuccB);
-
- if (!LiveInToA && !LiveInToB) {
- DEBUG(dbgs() << PrintReg(Reg, TRI, 0)
- << " is live into neither successor\n");
- continue;
- }
-
- if (LiveInToA && LiveInToB) {
- DEBUG(dbgs() << PrintReg(Reg, TRI, 0)
- << " is live into both successors\n");
- continue;
- }
-
- // This interval is live in to one successor, but not the other, so
- // we need to update its range so it is live in to both.
- DEBUG(dbgs() << "Possible SGPR conflict detected for "
- << PrintReg(Reg, TRI, 0)
- << " BB#" << SuccA->getNumber()
- << ", BB#" << SuccB->getNumber()
- << " with NCD = BB#" << NCD->getNumber() << '\n');
-
- assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
- "Not expecting to extend live range of physreg");
-
- // FIXME: Need to figure out how to update LiveRange here so this pass
- // will be able to preserve LiveInterval analysis.
- MachineInstr *NCDSGPRUse =
- BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(),
- TII->get(AMDGPU::SGPR_USE))
- .addReg(Reg, RegState::Implicit);
-
- MadeChange = true;
- LV->HandleVirtRegUse(Reg, NCD, NCDSGPRUse);
-
- DEBUG(NCDSGPRUse->dump());
- }
- }
-
- return MadeChange;
-}
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 6230d1e28b74..4ecc0fcc6232 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -13,12 +13,9 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -44,8 +41,6 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -76,11 +71,8 @@ struct FoldCandidate {
} // End anonymous namespace.
-INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE,
- "SI Fold Operands", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE,
- "SI Fold Operands", false, false)
+INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
+ "SI Fold Operands", false, false)
char SIFoldOperands::ID = 0;
@@ -140,7 +132,7 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold,
const SIInstrInfo *TII) {
- if (!TII->isOperandLegal(MI, OpNo, OpToFold)) {
+ if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
// Special case for v_mac_f32_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
@@ -167,7 +159,7 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
// see if this makes it possible to fold.
unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
- bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1);
+ bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
if (CanCommute) {
if (CommuteIdx0 == OpNo)
@@ -185,10 +177,10 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
return false;
if (!CanCommute ||
- !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1))
+ !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
return false;
- if (!TII->isOperandLegal(MI, OpNo, OpToFold))
+ if (!TII->isOperandLegal(*MI, OpNo, OpToFold))
return false;
}
@@ -301,9 +293,13 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
}
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index 7d20509c464d..03b11f0fd38d 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -11,6 +11,8 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -21,24 +23,13 @@ using namespace llvm;
static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
const MachineFrameInfo *FrameInfo) {
- if (!FuncInfo->hasSpilledSGPRs())
- return false;
-
- if (FuncInfo->hasSpilledVGPRs())
- return false;
-
- for (int I = FrameInfo->getObjectIndexBegin(),
- E = FrameInfo->getObjectIndexEnd(); I != E; ++I) {
- if (!FrameInfo->isSpillSlotObjectIndex(I))
- return false;
- }
-
- return true;
+ return FuncInfo->hasSpilledSGPRs() &&
+ (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects());
}
static ArrayRef<MCPhysReg> getAllSGPR128() {
- return makeArrayRef(AMDGPU::SReg_128RegClass.begin(),
- AMDGPU::SReg_128RegClass.getNumRegs());
+ return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
+ AMDGPU::SGPR_128RegClass.getNumRegs());
}
static ArrayRef<MCPhysReg> getAllSGPRs() {
@@ -48,6 +39,12 @@ static ArrayRef<MCPhysReg> getAllSGPRs() {
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
+ // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
+ // specified.
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ if (ST.debuggerEmitPrologue())
+ emitDebuggerPrologue(MF, MBB);
+
if (!MF.getFrameInfo()->hasStackObjects())
return;
@@ -63,10 +60,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
return;
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineBasicBlock::iterator I = MBB.begin();
// We need to insert initialization of the scratch resource descriptor.
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
@@ -84,6 +81,46 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}
+ if (MFI->hasFlatScratchInit()) {
+ // We don't need this if we only have spills since there is no user facing
+ // scratch.
+
+ // TODO: If we know we don't have flat instructions earlier, we can omit
+ // this from the input registers.
+ //
+ // TODO: We only need to know if we access scratch space through a flat
+ // pointer. Because we only detect if flat instructions are used at all,
+ // this will be used more often than necessary on VI.
+
+ // Debug location must be unknown since the first debug location is used to
+ // determine the end of the prologue.
+ DebugLoc DL;
+
+ unsigned FlatScratchInitReg
+ = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+
+ MRI.addLiveIn(FlatScratchInitReg);
+ MBB.addLiveIn(FlatScratchInitReg);
+
+ // Copy the size in bytes.
+ unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO)
+ .addReg(FlatScrInitHi, RegState::Kill);
+
+ unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+
+ // Add wave offset in bytes to private base offset.
+ // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
+ .addReg(FlatScrInitLo)
+ .addReg(ScratchWaveOffsetReg);
+
+ // Convert offset to 256-byte units.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
+ .addReg(FlatScrInitLo, RegState::Kill)
+ .addImm(8);
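+
+ // (Illustrative: with FlatScrInitLo == 0 and a scratch wave offset of
+ // 0x1000 bytes, FLAT_SCR_HI becomes 0x1000 >> 8 == 0x10, i.e. 16 units of
+ // 256 bytes.)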
+ }
+
// If we reserved the original input registers, we don't need to copy to the
// reserved registers.
if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
@@ -96,7 +133,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// We added live-ins during argument lowering, but since they were not used
// they were deleted. We're adding the uses now, so add them back.
- MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
@@ -137,15 +173,28 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- // Skip the last 2 elements because the last one is reserved for VCC, and
- // this is the 2nd to last element already.
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
- for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
+
+ // We need to drop registers from the end of the list that we cannot use
+ // for the scratch wave offset.
+ // + 2 because s102 and s103 do not exist on VI.
+ // + 2 for vcc
+ // + 2 for xnack_mask
+ // + 2 for flat_scratch
+ // + 4 for registers reserved for the scratch resource register
+ // + 1 for the register reserved for the scratch wave offset. (By excluding
+ //     this register from the list to consider, when it is being used for
+ //     the scratch wave offset and there are no other free SGPRs, the value
+ //     will stay in this register.)
+ // ----
+ // 13
+ for (MCPhysReg Reg : getAllSGPRs().drop_back(13).slice(NumPreloaded)) {
// Pick the first unallocated SGPR. Be careful not to pick an alias of the
// scratch descriptor, since we haven't added its uses yet.
if (!MRI.isPhysRegUsed(Reg)) {
- assert(MRI.isAllocatable(Reg) &&
- !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
+ if (!MRI.isAllocatable(Reg) ||
+ TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
+ continue;
MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
ScratchWaveOffsetReg = Reg;
@@ -160,7 +209,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
- MachineBasicBlock::iterator I = MBB.begin();
DebugLoc DL;
if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
@@ -223,6 +271,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
}
}
+void SIFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+
+}
+
void SIFrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF,
RegScavenger *RS) const {
@@ -243,3 +296,44 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
RS->addScavengingFrameIndex(ScavengeFI);
}
}
+
+void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ MachineBasicBlock::iterator I = MBB.begin();
+ DebugLoc DL;
+
+ // For each dimension:
+ for (unsigned i = 0; i < 3; ++i) {
+ // Get work group ID SGPR, and make it live-in again.
+ unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
+ MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
+ MBB.addLiveIn(WorkGroupIDSGPR);
+
+ // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in
+ // order to spill it to scratch.
+ unsigned WorkGroupIDVGPR =
+ MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
+ .addReg(WorkGroupIDSGPR);
+
+ // Spill work group ID.
+ int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
+ TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
+ WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
+
+ // Get work item ID VGPR, and make it live-in again.
+ unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
+ MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
+ MBB.addLiveIn(WorkItemIDVGPR);
+
+ // Spill work item ID.
+ int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
+ TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
+ WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
+ }
+}
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index a9152fd8b2aa..37417d098f31 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -23,10 +23,16 @@ public:
void emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const override;
void processFunctionBeforeFrameFinalized(
MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
+
+private:
+ /// \brief Emits debugger prologue.
+ void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
};
}
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 544867513d9c..51241cf0a432 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18,33 +18,46 @@
#include <cmath>
#endif
-#include "SIISelLowering.h"
#include "AMDGPU.h"
-#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
+#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
-#include "llvm/ADT/SmallString.h"
using namespace llvm;
-SITargetLowering::SITargetLowering(TargetMachine &TM,
- const AMDGPUSubtarget &STI)
+// -amdgpu-fast-fdiv - Command line option to enable faster 2.5 ulp fdiv.
+static cl::opt<bool> EnableAMDGPUFastFDIV(
+ "amdgpu-fast-fdiv",
+ cl::desc("Enable faster 2.5 ulp fdiv"),
+ cl::init(false));
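+// (Usage sketch, assuming the usual cl::opt plumbing: passing
+// -amdgpu-fast-fdiv to llc opts in; the flag is consulted when lowering
+// f32 FDIV.)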
+
+static unsigned findFirstFreeSGPR(CCState &CCInfo) {
+ unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+ for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
+ if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
+ return AMDGPU::SGPR0 + Reg;
+ }
+ }
+ llvm_unreachable("Cannot allocate sgpr");
+}
+
+SITargetLowering::SITargetLowering(const TargetMachine &TM,
+ const SISubtarget &STI)
: AMDGPUTargetLowering(TM, STI) {
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
- addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
- addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
-
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
@@ -66,34 +79,25 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
computeRegisterProperties(STI.getRegisterInfo());
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
-
- setOperationAction(ISD::ADD, MVT::i32, Legal);
- setOperationAction(ISD::ADDC, MVT::i32, Legal);
- setOperationAction(ISD::ADDE, MVT::i32, Legal);
- setOperationAction(ISD::SUBC, MVT::i32, Legal);
- setOperationAction(ISD::SUBE, MVT::i32, Legal);
-
- setOperationAction(ISD::FSIN, MVT::f32, Custom);
- setOperationAction(ISD::FCOS, MVT::f32, Custom);
-
- setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
-
// We need to custom lower vector stores from local memory
+ setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
-
setOperationAction(ISD::STORE, MVT::i1, Custom);
- setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::SELECT, MVT::i1, Promote);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
@@ -102,109 +106,39 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SETCC, MVT::i1, Promote);
setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
- setOperationAction(ISD::BSWAP, MVT::i32, Legal);
- setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
-
- for (MVT VT : MVT::integer_valuetypes()) {
- if (VT == MVT::i64)
- continue;
-
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
-
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
-
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
- }
-
- for (MVT VT : MVT::integer_vector_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand);
- }
-
- for (MVT VT : MVT::fp_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
-
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
-
- setTruncStoreAction(MVT::i64, MVT::i32, Expand);
- setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
- setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
- setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
-
-
- setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
-
- setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
- setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
-
- setOperationAction(ISD::LOAD, MVT::i1, Custom);
-
- setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
-
- setOperationAction(ISD::STORE, MVT::v2i64, Promote);
- AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
-
- setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
-
- setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
- setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
- setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
-
- // These should use UDIVREM, so set them to expand
- setOperationAction(ISD::UDIV, MVT::i64, Expand);
- setOperationAction(ISD::UREM, MVT::i64, Expand);
-
- setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
- setOperationAction(ISD::SELECT, MVT::i1, Promote);
-
- setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
-
-
- setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f64, Expand);
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
- switch(Op) {
+ switch (Op) {
case ISD::LOAD:
case ISD::STORE:
case ISD::BUILD_VECTOR:
@@ -241,13 +175,46 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
}
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+
+ // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
+ // output demarshalling.
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
+ // We can't return success/failure, only the old value, so let LLVM add
+ // the comparison.
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
+
+ if (getSubtarget()->hasFlatAddressSpace()) {
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
+ }
+
+ setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+
+ // On SI this is s_memtime; on VI it is s_memrealtime.
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
+ setOperationAction(ISD::TRAP, MVT::Other, Custom);
+
+ setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+
+ if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
}
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+
+ setOperationAction(ISD::FSIN, MVT::f32, Custom);
+ setOperationAction(ISD::FCOS, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f64, Custom);
@@ -263,6 +230,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::UINT_TO_FP);
+ setTargetDAGCombine(ISD::FCANONICALIZE);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
@@ -287,10 +255,33 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setSchedulingPreference(Sched::RegPressure);
}
+const SISubtarget *SITargetLowering::getSubtarget() const {
+ return static_cast<const SISubtarget *>(Subtarget);
+}
+
//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//
+bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &CI,
+ unsigned IntrID) const {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align = 0;
+ Info.vol = false;
+ Info.readMem = true;
+ Info.writeMem = true;
+ return true;
+ default:
+ return false;
+ }
+}
+
bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
EVT) const {
// SI has some legal vector types, but no legal vector operations. Say no
@@ -348,7 +339,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
switch (AS) {
case AMDGPUAS::GLOBAL_ADDRESS: {
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
// Assume that we will use FLAT for all global memory accesses
// on VI.
// FIXME: This assumption is currently wrong. On VI we still use
@@ -376,16 +367,16 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (DL.getTypeStoreSize(Ty) < 4)
return isLegalMUBUFAddressingMode(AM);
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
// SMRD instructions have an 8-bit, dword offset on SI.
if (!isUInt<8>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
+ } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
// On CI+, this can also be a 32-bit literal constant offset. If it fits
// in 8-bits, it can use a smaller encoding.
if (!isUInt<32>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) {
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
@@ -402,7 +393,6 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
case AMDGPUAS::PRIVATE_ADDRESS:
- case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
return isLegalMUBUFAddressingMode(AM);
case AMDGPUAS::LOCAL_ADDRESS:
@@ -423,6 +413,12 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return false;
}
case AMDGPUAS::FLAT_ADDRESS:
+ case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+ // For an unknown address space, this usually means that this is for some
+ // reason being used for pure arithmetic, and not based on some addressing
+ // computation. We don't have instructions that compute pointers with any
+ // addressing modes, so treat them as having no offset like flat
+ // instructions.
return isLegalFlatAddressingMode(AM);
default:
@@ -442,24 +438,30 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
if (!VT.isSimple() || VT == MVT::Other)
return false;
- // TODO - CI+ supports unaligned memory accesses, but this requires driver
- // support.
-
- // XXX - The only mention I see of this in the ISA manual is for LDS direct
- // reads the "byte address and must be dword aligned". Is it also true for the
- // normal loads and stores?
- if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) {
+ if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS) {
// ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
// aligned, 8 byte access in a single operation using ds_read2/write2_b32
// with adjacent offsets.
bool AlignedBy4 = (Align % 4 == 0);
if (IsFast)
*IsFast = AlignedBy4;
+
return AlignedBy4;
}
+ if (Subtarget->hasUnalignedBufferAccess()) {
+ // If we have a uniform constant load, it still requires using a slow
+ // buffer instruction if unaligned.
+ if (IsFast) {
+ *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
+ (Align % 4 == 0) : true;
+ }
+
+ return true;
+ }
+
// Smaller than dword value must be aligned.
- // FIXME: This should be allowed on CI+
if (VT.bitsLT(MVT::i32))
return false;
@@ -500,21 +502,22 @@ static bool isFlatGlobalAddrSpace(unsigned AS) {
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
- return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
+ return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
}
-
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
const Value *Ptr = MemNode->getMemOperand()->getValue();
// UndefValue means this is a load of a kernel input. These are uniform.
- // Sometimes LDS instructions have constant pointers
- if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) ||
- isa<GlobalValue>(Ptr))
+ // Sometimes LDS instructions have constant pointers.
+ // If Ptr is null, then that means this mem operand contains a
+ // PseudoSourceValue like GOT.
+ if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
+ isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
return true;
- const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
+ const Instruction *I = dyn_cast<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.uniform");
}
@@ -528,29 +531,42 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const {
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
return TII->isInlineConstant(Imm);
}
-SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
- SDLoc SL, SDValue Chain,
- unsigned Offset, bool Signed) const {
+bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
+ // SimplifySetCC uses this function to determine whether or not it should
+ // create setcc with i1 operands. We don't have instructions for i1 setcc.
+ if (VT == MVT::i1 && Op == ISD::SETCC)
+ return false;
+
+ return TargetLowering::isTypeDesirableForOp(Op, VT);
+}
+
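+// Build a pointer into the kernel argument segment: the kernarg base pointer
+// preloaded into an SGPR pair plus a byte offset.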
+SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
+ const SDLoc &SL, SDValue Chain,
+ unsigned Offset) const {
const DataLayout &DL = DAG.getDataLayout();
MachineFunction &MF = DAG.getMachineFunction();
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
- Type *Ty = VT.getTypeForEVT(*DAG.getContext());
-
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
- PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
- SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
- DAG.getConstant(Offset, SL, PtrVT));
+ return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+ DAG.getConstant(Offset, SL, PtrVT));
+}
+
+SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
+ const SDLoc &SL, SDValue Chain,
+ unsigned Offset, bool Signed) const {
+ const DataLayout &DL = DAG.getDataLayout();
+ Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+ MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+ PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
SDValue PtrOffset = DAG.getUNDEF(PtrVT);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
@@ -560,34 +576,35 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
if (MemVT.isFloatingPoint())
ExtTy = ISD::EXTLOAD;
- return DAG.getLoad(ISD::UNINDEXED, ExtTy,
- VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
- false, // isVolatile
- true, // isNonTemporal
- true, // isInvariant
- Align); // Alignment
+ SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
+ return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset,
+ PtrInfo, MemVT, Align, MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
FunctionType *FType = MF.getFunction()->getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) {
+ if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
const Function *Fn = MF.getFunction();
- DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA");
+ DiagnosticInfoUnsupported NoGraphicsHSA(
+ *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
DAG.getContext()->diagnose(NoGraphicsHSA);
- return SDValue();
+ return DAG.getEntryNode();
}
- // FIXME: We currently assume all calling conventions are kernels.
+ // Create stack objects that are used for emitting debugger prologue if the
+ // "amdgpu-debugger-emit-prologue" attribute was specified.
+ if (ST.debuggerEmitPrologue())
+ createDebuggerPrologueStackObjects(MF);
SmallVector<ISD::InputArg, 16> Splits;
BitVector Skipped(Ins.size());
@@ -596,7 +613,7 @@ SDValue SITargetLowering::LowerFormalArguments(
const ISD::InputArg &Arg = Ins[i];
// First check if it's a PS input addr
- if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
+ if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
!Arg.Flags.isByVal() && PSInputNum <= 15) {
if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
@@ -613,25 +630,26 @@ SDValue SITargetLowering::LowerFormalArguments(
++PSInputNum;
}
- // Second split vertices into their elements
- if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) {
- ISD::InputArg NewArg = Arg;
- NewArg.Flags.setSplit();
- NewArg.VT = Arg.VT.getVectorElementType();
-
- // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
- // three or five element vertex only needs three or five registers,
- // NOT four or eight.
- Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
- unsigned NumElements = ParamType->getVectorNumElements();
-
- for (unsigned j = 0; j != NumElements; ++j) {
- Splits.push_back(NewArg);
- NewArg.PartOffset += NewArg.VT.getStoreSize();
+ if (AMDGPU::isShader(CallConv)) {
+ // Second, split vertices into their elements.
+ if (Arg.VT.isVector()) {
+ ISD::InputArg NewArg = Arg;
+ NewArg.Flags.setSplit();
+ NewArg.VT = Arg.VT.getVectorElementType();
+
+ // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
+ // three or five element vertex only needs three or five registers,
+ // NOT four or eight.
+ Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+ unsigned NumElements = ParamType->getVectorNumElements();
+
+ for (unsigned j = 0; j != NumElements; ++j) {
+ Splits.push_back(NewArg);
+ NewArg.PartOffset += NewArg.VT.getStoreSize();
+ }
+ } else {
+ Splits.push_back(Arg);
}
-
- } else if (Info->getShaderType() != ShaderType::COMPUTE) {
- Splits.push_back(Arg);
}
}
@@ -651,19 +669,27 @@ SDValue SITargetLowering::LowerFormalArguments(
// - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
// - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
// enabled too.
- if (Info->getShaderType() == ShaderType::PIXEL &&
+ if (CallConv == CallingConv::AMDGPU_PS &&
((Info->getPSInputAddr() & 0x7F) == 0 ||
- ((Info->getPSInputAddr() & 0xF) == 0 &&
- Info->isPSInputAllocated(11)))) {
+ ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
CCInfo.AllocateReg(AMDGPU::VGPR0);
CCInfo.AllocateReg(AMDGPU::VGPR1);
Info->markPSInputAllocated(0);
Info->PSInputEna |= 1;
}
- if (Info->getShaderType() == ShaderType::COMPUTE) {
+ if (!AMDGPU::isShader(CallConv)) {
getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
Splits);
+
+ assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
+ } else {
+ assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
+ !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
+ !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+ !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
+ !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+ !Info->hasWorkItemIDZ());
}
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
@@ -679,12 +705,24 @@ SDValue SITargetLowering::LowerFormalArguments(
CCInfo.AllocateReg(DispatchPtrReg);
}
+ if (Info->hasQueuePtr()) {
+ unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
+ MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(QueuePtrReg);
+ }
+
if (Info->hasKernargSegmentPtr()) {
unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
CCInfo.AllocateReg(InputPtrReg);
}
+ if (Info->hasFlatScratchInit()) {
+ unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+ MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(FlatScratchInitReg);
+ }
+
AnalyzeFormalArguments(CCInfo, Splits);
SmallVector<SDValue, 16> Chains;
@@ -713,7 +751,7 @@ SDValue SITargetLowering::LowerFormalArguments(
auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
// On SI local pointers are just offsets into LDS, so they are always
// less than 16-bits. On CI and newer they could potentially be
@@ -765,7 +803,7 @@ SDValue SITargetLowering::LowerFormalArguments(
NumElements = Arg.VT.getVectorNumElements() - NumElements;
Regs.append(NumElements, DAG.getUNDEF(VT));
- InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
+ InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
continue;
}
@@ -780,8 +818,7 @@ SDValue SITargetLowering::LowerFormalArguments(
unsigned Reg = Info->addWorkGroupIDX();
MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
CCInfo.AllocateReg(Reg);
- } else
- llvm_unreachable("work group id x is always enabled");
+ }
if (Info->hasWorkGroupIDY()) {
unsigned Reg = Info->addWorkGroupIDY();
@@ -803,8 +840,13 @@ SDValue SITargetLowering::LowerFormalArguments(
if (Info->hasPrivateSegmentWaveByteOffset()) {
// Scratch wave offset passed in system SGPR.
- unsigned PrivateSegmentWaveByteOffsetReg
- = Info->addPrivateSegmentWaveByteOffset();
+ unsigned PrivateSegmentWaveByteOffsetReg;
+
+ if (AMDGPU::isShader(CallConv)) {
+ PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
+ Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
+ } else
+ PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
@@ -812,8 +854,11 @@ SDValue SITargetLowering::LowerFormalArguments(
// Now that we've figured out where the scratch register inputs are, see if
// we should reserve the arguments and use them directly.
-
bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
+ // Record that we know we have non-spill stack objects so we don't need to
+ // check all stack objects later.
+ if (HasStackObjects)
+ Info->setHasNonSpillStackObjects(true);
if (ST.isAmdHsaOS()) {
// TODO: Assume we will spill without optimizations.
@@ -866,8 +911,7 @@ SDValue SITargetLowering::LowerFormalArguments(
unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
CCInfo.AllocateReg(Reg);
- } else
- llvm_unreachable("workitem id x should always be enabled");
+ }
if (Info->hasWorkItemIDY()) {
unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
@@ -887,16 +931,16 @@ SDValue SITargetLowering::LowerFormalArguments(
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
-SDValue SITargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
+SDValue
+SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- if (Info->getShaderType() == ShaderType::COMPUTE)
+ if (!AMDGPU::isShader(CallConv))
return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
OutVals, DL, DAG);
@@ -975,17 +1019,131 @@ SDValue SITargetLowering::LowerReturn(SDValue Chain,
if (Flag.getNode())
RetOps.push_back(Flag);
- return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
+ unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
+ return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
-MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
- MachineInstr * MI, MachineBasicBlock * BB) const {
-
- switch (MI->getOpcode()) {
+unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
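+ // Map names used by llvm.read_register / llvm.write_register to physical
+ // registers; only a few special registers may be named.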
+ unsigned Reg = StringSwitch<unsigned>(RegName)
+ .Case("m0", AMDGPU::M0)
+ .Case("exec", AMDGPU::EXEC)
+ .Case("exec_lo", AMDGPU::EXEC_LO)
+ .Case("exec_hi", AMDGPU::EXEC_HI)
+ .Case("flat_scratch", AMDGPU::FLAT_SCR)
+ .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
+ .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
+ .Default(AMDGPU::NoRegister);
+
+ if (Reg == AMDGPU::NoRegister) {
+ report_fatal_error(Twine("invalid register name \""
+ + StringRef(RegName) + "\"."));
+ }
+
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
+ report_fatal_error(Twine("invalid register \""
+ + StringRef(RegName) + "\" for subtarget."));
+ }
+
+ switch (Reg) {
+ case AMDGPU::M0:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
+ if (VT.getSizeInBits() == 32)
+ return Reg;
+ break;
+ case AMDGPU::EXEC:
+ case AMDGPU::FLAT_SCR:
+ if (VT.getSizeInBits() == 64)
+ return Reg;
+ break;
default:
- return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+ llvm_unreachable("missing register type checking");
+ }
+
+ report_fatal_error(Twine("invalid type for register \""
+ + StringRef(RegName) + "\"."));
+}
+
+// If kill is not the last instruction, split the block so kill is always a
+// proper terminator.
+MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+ MachineBasicBlock::iterator SplitPoint(&MI);
+ ++SplitPoint;
+
+ if (SplitPoint == BB->end()) {
+ // Don't bother with a new block.
+ MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+ return BB;
+ }
+
+ MachineFunction *MF = BB->getParent();
+ MachineBasicBlock *SplitBB
+ = MF->CreateMachineBasicBlock(BB->getBasicBlock());
+
+ // Fix the block phi references to point to the new block for the defs in the
+ // second piece of the block.
+ for (MachineBasicBlock *Succ : BB->successors()) {
+ for (MachineInstr &Phi : *Succ) {
+ if (!Phi.isPHI())
+ break;
+
+ for (unsigned I = 2, E = Phi.getNumOperands(); I != E; I += 2) {
+ MachineOperand &FromBB = Phi.getOperand(I);
+ if (BB == FromBB.getMBB()) {
+ FromBB.setMBB(SplitBB);
+ break;
+ }
+ }
+ }
+ }
+
+ MF->insert(++MachineFunction::iterator(BB), SplitBB);
+ SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
+
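+ // SplitBB takes over BB's successors, and BB falls through to SplitBB.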
+ SplitBB->transferSuccessors(BB);
+ BB->addSuccessor(SplitBB);
+
+ MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+ return SplitBB;
+}
+
+MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
+ MachineInstr &MI, MachineBasicBlock *BB) const {
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_INIT_M0: {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
+ TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addOperand(MI.getOperand(0));
+ MI.eraseFromParent();
+ break;
+ }
case AMDGPU::BRANCH:
return BB;
+ case AMDGPU::GET_GROUPSTATICSIZE: {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+ MachineFunction *MF = BB->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ DebugLoc DL = MI.getDebugLoc();
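+ // The kernel's static LDS allocation is known by now, so the pseudo can
+ // be expanded to a move of that size as an immediate.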
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32))
+ .addOperand(MI.getOperand(0))
+ .addImm(MFI->LDSSize);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::SI_KILL:
+ return splitKillBlock(MI, BB);
+ default:
+ return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
return BB;
}
@@ -1072,6 +1230,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerTrig(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::FDIV: return LowerFDIV(Op, DAG);
+ case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::GlobalAddress: {
MachineFunction &MF = DAG.getMachineFunction();
@@ -1079,7 +1238,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerGlobalAddress(MFI, Op, DAG);
}
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
+ case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
+ case ISD::TRAP: return lowerTRAP(Op, DAG);
}
return SDValue();
}
@@ -1106,25 +1268,78 @@ SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FINode->getIndex();
- // A FrameIndex node represents a 32-bit offset into scratch memory. If
- // the high bit of a frame index offset were to be set, this would mean
- // that it represented an offset of ~2GB * 64 = ~128GB from the start of the
- // scratch buffer, with 64 being the number of threads per wave.
+ // A FrameIndex node represents a 32-bit offset into scratch memory. If the
+ // high bit of a frame index offset were to be set, this would mean that it
+ // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
+ // buffer, with 64 being the number of threads per wave.
//
- // If we know the machine uses less than 128GB of scratch, then we can
- // amrk the high bit of the FrameIndex node as known zero,
- // which is important, because it means in most situations we can
- // prove that values derived from FrameIndex nodes are non-negative.
- // This enables us to take advantage of more addressing modes when
- // accessing scratch buffers, since for scratch reads/writes, the register
- // offset must always be positive.
+ // The maximum private allocation for the entire GPU is 4G, and we are
+ // concerned with the largest value the index could ever have for an
+ // individual workitem. This will occur with the minimum dispatch size. If
+ // a program requires more, the dispatch size will be reduced.
+ //
+ // With this limit, we can mark the high bit of the FrameIndex node as known
+ // zero, which is important, because it means in most situations we can prove
+ // that values derived from FrameIndex nodes are non-negative. This enables us
+ // to take advantage of more addressing modes when accessing scratch buffers,
+ // since for scratch reads/writes, the register offset must always be
+ // positive.
- SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
- if (Subtarget->enableHugeScratchBuffer())
- return TFI;
+ uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
+ // XXX - It is unclear if partial dispatch works. Assume it works at half wave
+ // granularity. It is probably a full wave.
+ uint64_t MinGranularity = 32;
+
+ unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
+ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
+
+ SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
- DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
+ DAG.getValueType(ExtVT));
+}
+
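+// Return true for the control flow pseudo-intrinsics that LowerBRCOND must
+// rewrite to make the branch target an explicit operand.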
+bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
+ if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return false;
+
+ switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
+ default: return false;
+ case AMDGPUIntrinsic::amdgcn_if:
+ case AMDGPUIntrinsic::amdgcn_else:
+ case AMDGPUIntrinsic::amdgcn_break:
+ case AMDGPUIntrinsic::amdgcn_if_break:
+ case AMDGPUIntrinsic::amdgcn_else_break:
+ case AMDGPUIntrinsic::amdgcn_loop:
+ case AMDGPUIntrinsic::amdgcn_end_cf:
+ return true;
+ }
+}
+
+void SITargetLowering::createDebuggerPrologueStackObjects(
+ MachineFunction &MF) const {
+ // Create stack objects that are used for emitting debugger prologue.
+ //
+ // Debugger prologue writes work group IDs and work item IDs to scratch memory
+ // at fixed location in the following format:
+ // offset 0: work group ID x
+ // offset 4: work group ID y
+ // offset 8: work group ID z
+ // offset 16: work item ID x
+ // offset 20: work item ID y
+ // offset 24: work item ID z
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ int ObjectIdx = 0;
+
+ // For each dimension:
+ for (unsigned i = 0; i < 3; ++i) {
+ // Create fixed stack object for work group ID.
+ ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4, true);
+ Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
+ // Create fixed stack object for work item ID.
+ ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4 + 16, true);
+ Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
+ }
}
/// This transforms the control flow intrinsics to get the branch destination as
@@ -1137,13 +1352,11 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
SDNode *Intr = BRCOND.getOperand(1).getNode();
SDValue Target = BRCOND.getOperand(2);
SDNode *BR = nullptr;
+ SDNode *SetCC = nullptr;
if (Intr->getOpcode() == ISD::SETCC) {
// As long as we negate the condition everything is fine
- SDNode *SetCC = Intr;
- assert(SetCC->getConstantOperandVal(1) == 1);
- assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
- ISD::SETNE);
+ SetCC = Intr;
Intr = SetCC->getOperand(0).getNode();
} else {
@@ -1152,7 +1365,15 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
Target = BR->getOperand(1);
}
- assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+ if (!isCFIntrinsic(Intr)) {
+ // This is a uniform branch so we don't need to legalize.
+ return BRCOND;
+ }
+
+ assert(!SetCC ||
+ (SetCC->getConstantOperandVal(1) == 1 &&
+ cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
+ ISD::SETNE));
// Build the result and
ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
@@ -1204,37 +1425,185 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
return Chain;
}
+SDValue SITargetLowering::getSegmentAperture(unsigned AS,
+ SelectionDAG &DAG) const {
+ SDLoc SL;
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ assert(UserSGPR != AMDGPU::NoRegister);
+
+ SDValue QueuePtr = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+
+ // Offset into amd_queue_t for group_segment_aperture_base_hi /
+ // private_segment_aperture_base_hi.
+ uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+
+ SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
+ DAG.getConstant(StructOffset, SL, MVT::i64));
+
+ // TODO: Use custom target PseudoSourceValue.
+ // TODO: We should use the value from the IR intrinsic call, but it might not
+ // be available and how do we get it?
+ Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS));
+
+ MachinePointerInfo PtrInfo(V, StructOffset);
+ return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo,
+ MinAlign(64, StructOffset),
+ MachineMemOperand::MOInvariant);
+}
+
+SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
+
+ SDValue Src = ASC->getOperand(0);
+
+ // FIXME: Really support non-0 null pointers.
+ SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
+ SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
+
+ // flat -> local/private
+ if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+ if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
+ SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+
+ return DAG.getNode(ISD::SELECT, SL, MVT::i32,
+ NonNull, Ptr, SegmentNullPtr);
+ }
+ }
+
+ // local/private -> flat
+ if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+ if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ SDValue NonNull
+ = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
+
+ SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
+ SDValue CvtPtr
+ = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+
+ return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
+ DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
+ FlatNullPtr);
+ }
+ }
+
+ // global <-> flat are no-ops and never emitted.
+
+ const MachineFunction &MF = DAG.getMachineFunction();
+ DiagnosticInfoUnsupported InvalidAddrSpaceCast(
+ *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
+ DAG.getContext()->diagnose(InvalidAddrSpaceCast);
+
+ return DAG.getUNDEF(ASC->getValueType(0));
+}
+
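+// A global requires a GOT access if it is in the global address space and we
+// cannot assume it is defined within this module.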
+static bool shouldEmitGOTReloc(const GlobalValue *GV,
+ const TargetMachine &TM) {
+ return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+ !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
+}
+
+bool
+SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // We can fold offsets for anything that doesn't require a GOT relocation.
+ return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+ !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine());
+}
+
+static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
+ SDLoc DL, unsigned Offset, EVT PtrVT,
+ unsigned GAFlags = SIInstrInfo::MO_NONE) {
+ // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
+ // lowered to the following code sequence:
+ // s_getpc_b64 s[0:1]
+ // s_add_u32 s0, s0, $symbol
+ // s_addc_u32 s1, s1, 0
+ //
+ // s_getpc_b64 returns the address of the s_add_u32 instruction and then
+ // a fixup or relocation is emitted to replace $symbol with a literal
+ // constant, which is a pc-relative offset from the encoding of the $symbol
+ // operand to the global variable.
+ //
+ // What we want here is an offset from the value returned by s_getpc
+ // (which is the address of the s_add_u32 instruction) to the global
+ // variable, but since the encoding of $symbol starts 4 bytes after the start
+ // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
+ // small. This requires us to add 4 to the global variable offset in order to
+ // compute the correct address.
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
+ GAFlags);
+ return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, GA);
+}
+
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
- if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+ if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
+ GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
SDLoc DL(GSD);
const GlobalValue *GV = GSD->getGlobal();
- MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());
+ EVT PtrVT = Op.getValueType();
+
+ if (!shouldEmitGOTReloc(GV, getTargetMachine()))
+ return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
+
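+ // Otherwise form a pc-relative address of the GOT entry and load the
+ // global's real address from it.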
+ SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
+ SIInstrInfo::MO_GOTPCREL);
+
+ Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
+ PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
+ const DataLayout &DataLayout = DAG.getDataLayout();
+ unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
+ // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
+ MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
- SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
- return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA);
+ return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
+ MachineMemOperand::MOInvariant);
}
-SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
- SDValue V) const {
+SDValue SITargetLowering::lowerTRAP(SDValue Op,
+ SelectionDAG &DAG) const {
+ const MachineFunction &MF = DAG.getMachineFunction();
+ DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
+ "trap handler not supported",
+ Op.getDebugLoc(),
+ DS_Warning);
+ DAG.getContext()->diagnose(NoTrap);
+
+ // Emit s_endpgm.
+
+ // FIXME: This should really be selected to s_trap, but that requires
+ // setting up the trap handler for it to do anything.
+ return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
+ Op.getOperand(0));
+}
+
+SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
+ const SDLoc &DL, SDValue V) const {
+ // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
+ // the destination register.
+ //
// We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
// so we will end up with redundant moves to m0.
//
- // We can't use S_MOV_B32, because there is no way to specify m0 as the
- // destination register.
- //
- // We have to use them both. Machine cse will combine all the S_MOV_B32
- // instructions and the register coalescer eliminate the extra copies.
- SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V);
- return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32),
- SDValue(M0, 0), SDValue()); // Glue
- // A Null SDValue creates
- // a glue result.
+ // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
+
+ // A Null SDValue creates a glue result.
+ SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
+ V, Chain);
+ return SDValue(M0, 0);
}
SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
@@ -1249,12 +1618,27 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
DAG.getValueType(VT));
}
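+// Helpers that diagnose invalid intrinsic uses and recover with an undef
+// value so compilation can continue.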
+static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, SDLoc DL, EVT VT) {
+ DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
+ "non-hsa intrinsic with hsa target",
+ DL.getDebugLoc());
+ DAG.getContext()->diagnose(BadIntrin);
+ return DAG.getUNDEF(VT);
+}
+
+static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, SDLoc DL, EVT VT) {
+ DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
+ "intrinsic not supported on subtarget",
+ DL.getDebugLoc());
+ DAG.getContext()->diagnose(BadIntrin);
+ return DAG.getUNDEF(VT);
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
auto MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
EVT VT = Op.getValueType();
SDLoc DL(Op);
@@ -1264,62 +1648,134 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
case Intrinsic::amdgcn_dispatch_ptr:
+ case Intrinsic::amdgcn_queue_ptr: {
if (!Subtarget->isAmdHsaOS()) {
- DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(),
- "hsa intrinsic without hsa target");
+ DiagnosticInfoUnsupported BadIntrin(
+ *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
+ DL.getDebugLoc());
DAG.getContext()->diagnose(BadIntrin);
return DAG.getUNDEF(VT);
}
+ auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
+ SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT);
-
+ TRI->getPreloadedValue(MF, Reg), VT);
+ }
+ case Intrinsic::amdgcn_implicitarg_ptr: {
+ unsigned Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
+ }
+ case Intrinsic::amdgcn_kernarg_segment_ptr: {
+ unsigned Reg
+ = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ }
+ case Intrinsic::amdgcn_rcp:
+ return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
+ case Intrinsic::amdgcn_rsq:
+ case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
+ return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+ case Intrinsic::amdgcn_rsq_legacy: {
+ if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ return emitRemovedIntrinsicError(DAG, DL, VT);
+
+ return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
+ }
+ case Intrinsic::amdgcn_rsq_clamp: {
+ if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
+
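+ // v_rsq_clamp was removed on VI; emulate it by clamping the result of
+ // v_rsq to the largest magnitude finite value.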
+ Type *Type = VT.getTypeForEVT(*DAG.getContext());
+ APFloat Max = APFloat::getLargest(Type->getFltSemantics());
+ APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
+
+ SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+ SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
+ DAG.getConstantFP(Max, DL, VT));
+ return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
+ DAG.getConstantFP(Min, DL, VT));
+ }
case Intrinsic::r600_read_ngroups_x:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_X, false);
case Intrinsic::r600_read_ngroups_y:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_Y, false);
case Intrinsic::r600_read_ngroups_z:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_Z, false);
case Intrinsic::r600_read_global_size_x:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
case Intrinsic::r600_read_global_size_y:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
case Intrinsic::r600_read_global_size_z:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
case Intrinsic::r600_read_local_size_x:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return lowerImplicitZextParam(DAG, Op, MVT::i16,
SI::KernelInputOffsets::LOCAL_SIZE_X);
case Intrinsic::r600_read_local_size_y:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return lowerImplicitZextParam(DAG, Op, MVT::i16,
SI::KernelInputOffsets::LOCAL_SIZE_Y);
case Intrinsic::r600_read_local_size_z:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return lowerImplicitZextParam(DAG, Op, MVT::i16,
SI::KernelInputOffsets::LOCAL_SIZE_Z);
- case Intrinsic::AMDGPU_read_workdim:
+ case Intrinsic::amdgcn_read_workdim:
+ case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name.
// Really only 2 bits.
return lowerImplicitZextParam(DAG, Op, MVT::i8,
getImplicitParameterOffset(MFI, GRID_DIM));
+ case Intrinsic::amdgcn_workgroup_id_x:
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
+ case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::r600_read_tgid_y:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
+ case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
+ case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::r600_read_tidig_x:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
+ case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
+ case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
@@ -1336,24 +1792,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
Op->getVTList(), Ops, VT, MMO);
}
- case AMDGPUIntrinsic::SI_sample:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
- case AMDGPUIntrinsic::SI_sampleb:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
- case AMDGPUIntrinsic::SI_sampled:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
- case AMDGPUIntrinsic::SI_samplel:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
case AMDGPUIntrinsic::SI_vs_load_input:
return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
- case AMDGPUIntrinsic::AMDGPU_fract:
- case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
- return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1),
- DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1)));
case AMDGPUIntrinsic::SI_fs_constant: {
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
SDValue Glue = M0.getValue(1);
@@ -1393,11 +1837,93 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
Glue);
}
+ case Intrinsic::amdgcn_sin:
+ return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::amdgcn_cos:
+ return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::amdgcn_log_clamp: {
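+ // v_log_clamp was likewise removed on VI; older subtargets select the
+ // intrinsic directly.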
+ if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ return SDValue();
+
+ DiagnosticInfoUnsupported BadIntrin(
+ *MF.getFunction(), "intrinsic not supported on subtarget",
+ DL.getDebugLoc());
+ DAG.getContext()->diagnose(BadIntrin);
+ return DAG.getUNDEF(VT);
+ }
+ case Intrinsic::amdgcn_ldexp:
+ return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::amdgcn_fract:
+ return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::amdgcn_class:
+ return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::amdgcn_div_fmas:
+ return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(4));
+
+ case Intrinsic::amdgcn_div_fixup:
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::amdgcn_trig_preop:
+ return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::amdgcn_div_scale: {
+ // 3rd parameter required to be a constant.
+ const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+ if (!Param)
+ return DAG.getUNDEF(VT);
+
+ // Translate to the operands expected by the machine instruction. The first
+ // operand must match one of the other operands; the constant third
+ // parameter selects which.
+ SDValue Numerator = Op.getOperand(1);
+ SDValue Denominator = Op.getOperand(2);
+
+ // Note this order is opposite of the machine instruction's operations,
+ // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
+ // intrinsic has the numerator as the first operand to match a normal
+ // division operation.
+
+ SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
+
+ return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
+ Denominator, Numerator);
+ }
default:
return AMDGPUTargetLowering::LowerOperation(Op, DAG);
}
}
+SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ switch (IntrID) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
+ AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
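+ // Rebuild as a target memory node so the memory operand set up by
+ // getTgtMemIntrinsic is preserved.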
+ SDValue Ops[] = {
+ M->getOperand(0), // Chain
+ M->getOperand(2), // Ptr
+ M->getOperand(3) // Value
+ };
+
+ return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
+ }
+ default:
+ return SDValue();
+ }
+}
+
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -1439,6 +1965,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
Op->getVTList(), Ops, VT, MMO);
}
+ case AMDGPUIntrinsic::AMDGPU_kill: {
+ if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Op.getOperand(2))) {
+ if (!K->isNegative())
+ return Chain;
+ }
+
+ return Op;
+ }
default:
return SDValue();
}
@@ -1447,48 +1981,92 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
+ ISD::LoadExtType ExtType = Load->getExtensionType();
+ EVT MemVT = Load->getMemoryVT();
- if (Op.getValueType().isVector()) {
- assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
- "Custom lowering for non-i32 vectors hasn't been implemented.");
- unsigned NumElements = Op.getValueType().getVectorNumElements();
- assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
+ if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
+ assert(MemVT == MVT::i1 && "Only i1 non-extloads expected");
+ // FIXME: Copied from PPC
+ // First, load into 32 bits, then truncate to 1 bit.
- switch (Load->getAddressSpace()) {
- default: break;
- case AMDGPUAS::CONSTANT_ADDRESS:
- if (isMemOpUniform(Load))
- break;
- // Non-uniform loads will be selected to MUBUF instructions, so they
- // have the same legalization requires ments as global and private
- // loads.
- //
- // Fall-through
- case AMDGPUAS::GLOBAL_ADDRESS:
- case AMDGPUAS::PRIVATE_ADDRESS:
- if (NumElements >= 8)
- return SplitVectorLoad(Op, DAG);
-
- // v4 loads are supported for private and global memory.
- if (NumElements <= 4)
- break;
- // fall-through
- case AMDGPUAS::LOCAL_ADDRESS:
- // If properly aligned, if we split we might be able to use ds_read_b64.
+ SDValue Chain = Load->getChain();
+ SDValue BasePtr = Load->getBasePtr();
+ MachineMemOperand *MMO = Load->getMemOperand();
+
+ SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
+ BasePtr, MVT::i8, MMO);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
+ NewLD.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ if (!MemVT.isVector())
+ return SDValue();
+
+ assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
+ "Custom lowering for non-i32 vectors hasn't been implemented.");
+
+ unsigned AS = Load->getAddressSpace();
+ if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+ AS, Load->getAlignment())) {
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ unsigned NumElements = MemVT.getVectorNumElements();
+ switch (AS) {
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ if (isMemOpUniform(Load))
+ return SDValue();
+ // Non-uniform loads will be selected to MUBUF instructions, so they
+ // have the same legalization requirements as global and private
+ // loads.
+ //
+ // Fall-through
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::FLAT_ADDRESS:
+ if (NumElements > 4)
+ return SplitVectorLoad(Op, DAG);
+ // v4 loads are supported for private and global memory.
+ return SDValue();
+ case AMDGPUAS::PRIVATE_ADDRESS: {
+ // Depending on the setting of the private_element_size field in the
+ // resource descriptor, we can only make private accesses up to a certain
+ // size.
+ switch (Subtarget->getMaxPrivateElementSize()) {
+ case 4:
+ return scalarizeVectorLoad(Load, DAG);
+ case 8:
+ if (NumElements > 2)
+ return SplitVectorLoad(Op, DAG);
+ return SDValue();
+ case 16:
+ // Same as global/flat
+ if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
+ return SDValue();
+ default:
+ llvm_unreachable("unsupported private_element_size");
}
}
+ case AMDGPUAS::LOCAL_ADDRESS: {
+ if (NumElements > 2)
+ return SplitVectorLoad(Op, DAG);
- return AMDGPUTargetLowering::LowerLOAD(Op, DAG);
-}
+ if (NumElements == 2)
+ return SDValue();
-SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
- const SDValue &Op,
- SelectionDAG &DAG) const {
- return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3),
- Op.getOperand(4));
+ // If properly aligned, splitting might let us use ds_read_b64.
+ return SplitVectorLoad(Op, DAG);
+ }
+ default:
+ return SDValue();
+ }
}
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -1514,7 +2092,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
- SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi);
+ SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
}
@@ -1547,7 +2125,9 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
}
}
- if (Unsafe) {
+ const SDNodeFlags *Flags = Op->getFlags();
+
+ if (Unsafe || Flags->hasAllowReciprocal()) {
// Turn into multiply by the reciprocal.
// x / y -> x * (1.0 / y)
SDNodeFlags Flags;
@@ -1560,45 +2140,71 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
- SDValue FastLowered = LowerFastFDIV(Op, DAG);
- if (FastLowered.getNode())
+ if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
return FastLowered;
- // This uses v_rcp_f32 which does not handle denormals. Let this hit a
- // selection error for now rather than do something incorrect.
- if (Subtarget->hasFP32Denormals())
- return SDValue();
-
SDLoc SL(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+ // Faster 2.5 ULP fdiv when the -amdgpu-fast-fdiv flag is used.
+ if (EnableAMDGPUFastFDIV) {
+ // This does not support denormals.
+ SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+
+ const APFloat K0Val(BitsToFloat(0x6f800000));
+ const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
+
+ const APFloat K1Val(BitsToFloat(0x2f800000));
+ const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
+
+ const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
+
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+
+ SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+
+ SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+
+ // TODO: Should this propagate fast-math-flags?
- const APFloat K0Val(BitsToFloat(0x6f800000));
- const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
+ r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
- const APFloat K1Val(BitsToFloat(0x2f800000));
- const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
+ // rcp does not support denormals.
+ SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+
+ return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+ }
+
+ // Generates more precise fpdiv32.
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
- EVT SetCCVT =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+ SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
- SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+ SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS);
+ SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS);
- SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+ // Denominator is scaled to not be denormal, so using rcp is ok.
+ SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);
- // TODO: Should this propagate fast-math-flags?
+ SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
- r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
+ SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One);
+ SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp);
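+ // Refine the reciprocal and the quotient with Newton-Raphson FMA steps;
+ // v_div_fmas and v_div_fixup then apply the scale and handle the special
+ // cases.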
- SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+ SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled);
+ SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
+ SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled);
- return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+ SDValue Scale = NumeratorScaled.getValue(1);
+ SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale);
+
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
}
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
@@ -1635,7 +2241,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
SDValue Scale;
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
// Workaround a hardware bug on SI where the condition output from div_scale
// is not usable.
@@ -1685,26 +2291,57 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
EVT VT = Store->getMemoryVT();
- // These stores are legal.
- if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
- if (VT.isVector() && VT.getVectorNumElements() > 4)
- return ScalarizeVectorStore(Op, DAG);
- return SDValue();
+ if (VT == MVT::i1) {
+ return DAG.getTruncStore(Store->getChain(), DL,
+ DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
+ Store->getBasePtr(), MVT::i1, Store->getMemOperand());
}
- SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
- if (Ret.getNode())
- return Ret;
+ assert(VT.isVector() &&
+ Store->getValue().getValueType().getScalarType() == MVT::i32);
- if (VT.isVector() && VT.getVectorNumElements() >= 8)
+ unsigned AS = Store->getAddressSpace();
+ if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ AS, Store->getAlignment())) {
+ return expandUnalignedStore(Store, DAG);
+ }
+
+ unsigned NumElements = VT.getVectorNumElements();
+ switch (AS) {
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::FLAT_ADDRESS:
+ if (NumElements > 4)
+ return SplitVectorStore(Op, DAG);
+ return SDValue();
+ case AMDGPUAS::PRIVATE_ADDRESS: {
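+ // As with loads, the maximum legal private access size depends on the
+ // private_element_size field in the scratch resource descriptor.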
+ switch (Subtarget->getMaxPrivateElementSize()) {
+ case 4:
+ return scalarizeVectorStore(Store, DAG);
+ case 8:
+ if (NumElements > 2)
+ return SplitVectorStore(Op, DAG);
+ return SDValue();
+ case 16:
+ if (NumElements > 4)
+ return SplitVectorStore(Op, DAG);
+ return SDValue();
+ default:
+ llvm_unreachable("unsupported private_element_size");
+ }
+ }
+ case AMDGPUAS::LOCAL_ADDRESS: {
+ if (NumElements > 2)
return SplitVectorStore(Op, DAG);
- if (VT == MVT::i1)
- return DAG.getTruncStore(Store->getChain(), DL,
- DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
- Store->getBasePtr(), MVT::i1, Store->getMemOperand());
+ if (NumElements == 2)
+ return Op;
- return SDValue();
+ // If properly aligned, splitting may let us use ds_write_b64.
+ return SplitVectorStore(Op, DAG);
+ }
+ default:
+ llvm_unreachable("unhandled address space");
+ }
}
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
@@ -1727,6 +2364,33 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
}
}
+SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
+ AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
+ assert(AtomicNode->isCompareAndSwap());
+ unsigned AS = AtomicNode->getAddressSpace();
+
+ // No custom lowering required for local address space
+ if (!isFlatGlobalAddrSpace(AS))
+ return Op;
+
+ // Non-local address spaces require custom lowering for atomic compare and
+ // swap; the compare and swap values are packed into a v2i32 (v2i64 for _X2).
+ SDLoc DL(Op);
+ SDValue ChainIn = Op.getOperand(0);
+ SDValue Addr = Op.getOperand(1);
+ SDValue Old = Op.getOperand(2);
+ SDValue New = Op.getOperand(3);
+ EVT VT = Op.getValueType();
+ MVT SimpleVT = VT.getSimpleVT();
+ MVT VecType = MVT::getVectorVT(SimpleVT, 2);
+
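+ // The replacement value goes in the low element and the compare value in
+ // the high element, matching the data/compare operand layout of the
+ // hardware cmpswap instructions.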
+ SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
+ SDValue Ops[] = { ChainIn, Addr, NewOld };
+
+ return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
+ Ops, VT, AtomicNode->getMemOperand());
+}
+
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
@@ -1756,88 +2420,13 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
}
}
- // We are primarily trying to catch operations on illegal vector types
- // before they are expanded.
- // For scalars, we can use the more flexible method of checking masked bits
- // after legalization.
- if (!DCI.isBeforeLegalize() ||
- !SrcVT.isVector() ||
- SrcVT.getVectorElementType() != MVT::i8) {
- return SDValue();
- }
-
- assert(DCI.isBeforeLegalize() && "Unexpected legal type");
-
- // Weird sized vectors are a pain to handle, but we know 3 is really the same
- // size as 4.
- unsigned NElts = SrcVT.getVectorNumElements();
- if (!SrcVT.isSimple() && NElts != 3)
- return SDValue();
-
- // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
- // prevent a mess from expanding to v4i32 and repacking.
- if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
- EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
- EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
- EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
- LoadSDNode *Load = cast<LoadSDNode>(Src);
-
- unsigned AS = Load->getAddressSpace();
- unsigned Align = Load->getAlignment();
- Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
-
- // Don't try to replace the load if we have to expand it due to alignment
- // problems. Otherwise we will end up scalarizing the load, and trying to
- // repack into the vector for no real reason.
- if (Align < ABIAlignment &&
- !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
- return SDValue();
- }
-
- SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
- Load->getChain(),
- Load->getBasePtr(),
- LoadVT,
- Load->getMemOperand());
-
- // Make sure successors of the original load stay after it by updating
- // them to use the new Chain.
- DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
-
- SmallVector<SDValue, 4> Elts;
- if (RegVT.isVector())
- DAG.ExtractVectorElements(NewLoad, Elts);
- else
- Elts.push_back(NewLoad);
-
- SmallVector<SDValue, 4> Ops;
-
- unsigned EltIdx = 0;
- for (SDValue Elt : Elts) {
- unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
- for (unsigned I = 0; I < ComponentsInElt; ++I) {
- unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
- SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
- DCI.AddToWorklist(Cvt.getNode());
- Ops.push_back(Cvt);
- }
-
- ++EltIdx;
- }
-
- assert(Ops.size() == NElts);
-
- return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
- }
-
return SDValue();
}
/// \brief Return true if the given offset Size in bytes can be folded into
/// the immediate offsets of a memory instruction for the given address space.
static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
- const AMDGPUSubtarget &STI) {
+ const SISubtarget &STI) {
switch (AS) {
case AMDGPUAS::GLOBAL_ADDRESS: {
// MUBUF instructions have a 12-bit offset in bytes.
@@ -1846,7 +2435,7 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
case AMDGPUAS::CONSTANT_ADDRESS: {
// SMRD instructions have an 8-bit offset in dwords on SI and
// a 20-bit offset in bytes on VI.
- if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
return isUInt<20>(OffsetSize);
else
return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
@@ -1897,7 +2486,7 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
// If the resulting offset is too large, we can't fold it into the addressing
// mode offset.
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
- if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget))
+ if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -1915,6 +2504,9 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
if (DCI.isBeforeLegalize())
return SDValue();
+ if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI))
+ return Base;
+
SelectionDAG &DAG = DCI.DAG;
// (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
@@ -1970,6 +2562,36 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ if (VT == MVT::i64) {
+ // TODO: This could be a generic combine with a predicate for extracting the
+ // high half of an integer being free.
+
+ // (or i64:x, (zero_extend i32:y)) ->
+ // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
+ if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getOpcode() != ISD::ZERO_EXTEND)
+ std::swap(LHS, RHS);
+
+ if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
+ SDValue ExtSrc = RHS.getOperand(0);
+ EVT SrcVT = ExtSrc.getValueType();
+ if (SrcVT == MVT::i32) {
+ SDLoc SL(N);
+ SDValue LowLHS, HiBits;
+ std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
+ SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
+
+ DCI.AddToWorklist(LowOr.getNode());
+ DCI.AddToWorklist(HiBits.getNode());
+
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+ LowOr, HiBits);
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+ }
+ }
+ }
+
// or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
@@ -2005,9 +2627,52 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
return DAG.getConstant(0, SDLoc(N), MVT::i1);
}
+ if (N->getOperand(0).isUndef())
+ return DAG.getUNDEF(MVT::i1);
+
return SDValue();
}
+// Constant fold canonicalize.
+SDValue SITargetLowering::performFCanonicalizeCombine(
+ SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ if (!CFP)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ const APFloat &C = CFP->getValueAPF();
+
+ // Flush denormals to 0 when denormal support is not enabled.
+ if (C.isDenormal()) {
+ EVT VT = N->getValueType(0);
+ if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
+ return DAG.getConstantFP(0.0, SDLoc(N), VT);
+
+ if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
+ return DAG.getConstantFP(0.0, SDLoc(N), VT);
+ }
+
+ if (C.isNaN()) {
+ EVT VT = N->getValueType(0);
+ APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
+ if (C.isSignaling()) {
+ // Quiet a signaling NaN.
+ return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
+ }
+
+ // Make sure it is the canonical NaN bitpattern.
+ //
+ // TODO: Can we use -1 as the canonical NaN value since it's an inline
+ // immediate?
+ if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
+ return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
+ }
+
+ return SDValue(CFP, 0);
+}
+
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
switch (Opc) {
case ISD::FMAXNUM:
@@ -2027,8 +2692,64 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
}
}
-SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
- DAGCombinerInfo &DCI) const {
+static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1, bool Signed) {
+ ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
+ if (!K1)
+ return SDValue();
+
+ ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
+ if (!K0)
+ return SDValue();
+
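+ // min(max(x, K0), K1) only matches med3(x, K0, K1) when K0 < K1; if
+ // K0 >= K1 the clamp collapses to the constant K1, so bail out.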
+ if (Signed) {
+ if (K0->getAPIntValue().sge(K1->getAPIntValue()))
+ return SDValue();
+ } else {
+ if (K0->getAPIntValue().uge(K1->getAPIntValue()))
+ return SDValue();
+ }
+
+ EVT VT = K0->getValueType(0);
+ return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
+ Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+}
+
+static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
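+ // If FP exceptions are not observable, quieting a signaling NaN has no
+ // visible effect, so the value can be treated as never being an sNaN.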
+ if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
+ return true;
+
+ return DAG.isKnownNeverNaN(Op);
+}
+
+static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1) {
+ ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
+ if (!K1)
+ return SDValue();
+
+ ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
+ if (!K0)
+ return SDValue();
+
+ // Ordered >= (although NaN inputs should have folded away by now).
+ APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
+ if (Cmp == APFloat::cmpGreaterThan)
+ return SDValue();
+
+ // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
+ // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
+ // give the other result, which is different from med3 with a NaN input.
+ SDValue Var = Op0.getOperand(0);
+ if (!isKnownNeverSNan(DAG, Var))
+ return SDValue();
+
+ return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
+ Var, SDValue(K0, 0), SDValue(K1, 0));
+}
+
+SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
unsigned Opc = N->getOpcode();
@@ -2038,26 +2759,51 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
// Only do this if the inner op has one use since this will just increase
// register pressure for no benefit.
- // max(max(a, b), c)
- if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
- SDLoc DL(N);
- return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
- DL,
- N->getValueType(0),
- Op0.getOperand(0),
- Op0.getOperand(1),
- Op1);
+ if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
+ // max(max(a, b), c) -> max3(a, b, c)
+ // min(min(a, b), c) -> min3(a, b, c)
+ if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
+ SDLoc DL(N);
+ return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+ DL,
+ N->getValueType(0),
+ Op0.getOperand(0),
+ Op0.getOperand(1),
+ Op1);
+ }
+
+ // Try commuted.
+ // max(a, max(b, c)) -> max3(a, b, c)
+ // min(a, min(b, c)) -> min3(a, b, c)
+ if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
+ SDLoc DL(N);
+ return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+ DL,
+ N->getValueType(0),
+ Op0,
+ Op1.getOperand(0),
+ Op1.getOperand(1));
+ }
}
- // max(a, max(b, c))
- if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
- SDLoc DL(N);
- return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
- DL,
- N->getValueType(0),
- Op0,
- Op1.getOperand(0),
- Op1.getOperand(1));
+ // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
+ if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
+ if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
+ return Med3;
+ }
+
+ if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
+ if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
+ return Med3;
+ }
+
+ // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
+ if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
+ (Opc == AMDGPUISD::FMIN_LEGACY &&
+ Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
+ N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
+ if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
+ return Res;
}
return SDValue();
@@ -2104,16 +2850,18 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
case ISD::SETCC:
return performSetCCCombine(N, DCI);
- case ISD::FMAXNUM: // TODO: What about fmax_legacy?
+ case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
- case ISD::UMIN: {
+ case ISD::UMIN:
+ case AMDGPUISD::FMIN_LEGACY:
+ case AMDGPUISD::FMAX_LEGACY: {
if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
N->getValueType(0) != MVT::f64 &&
getTargetMachine().getOptLevel() > CodeGenOpt::None)
- return performMin3Max3Combine(N, DCI);
+ return performMinMaxCombine(N, DCI);
break;
}
@@ -2122,8 +2870,23 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::CVT_F32_UBYTE2:
case AMDGPUISD::CVT_F32_UBYTE3: {
unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
-
SDValue Src = N->getOperand(0);
+
+ // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
+ if (Src.getOpcode() == ISD::SRL) {
+ // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
+ // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
+ // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
+
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
+ unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
+ if (SrcOffset < 32 && SrcOffset % 8 == 0) {
+ return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL,
+ MVT::f32, Src.getOperand(0));
+ }
+ }
+ }
+
APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
APInt KnownZero, KnownOne;
@@ -2238,7 +3001,9 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ATOMIC_LOAD_MIN:
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
- case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics.
+ case ISD::ATOMIC_LOAD_UMAX:
+ case AMDGPUISD::ATOMIC_INC:
+ case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics.
if (DCI.isBeforeLegalize())
break;
@@ -2264,6 +3029,19 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performOrCombine(N, DCI);
case AMDGPUISD::FP_CLASS:
return performClassCombine(N, DCI);
+ case ISD::FCANONICALIZE:
+ return performFCanonicalizeCombine(N, DCI);
+ case AMDGPUISD::FRACT:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RSQ:
+ case AMDGPUISD::RSQ_LEGACY:
+ case AMDGPUISD::RSQ_CLAMP:
+ case AMDGPUISD::LDEXP: {
+ SDValue Src = N->getOperand(0);
+ if (Src.isUndef())
+ return Src;
+ break;
+ }
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
@@ -2273,9 +3051,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate,
/// and the immediate value if it's a literal immediate.
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
-
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
if (TII->isInlineConstant(Node->getAPIntValue()))
@@ -2314,7 +3090,8 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
SelectionDAG &DAG) const {
SDNode *Users[4] = { };
unsigned Lane = 0;
- unsigned OldDmask = Node->getConstantOperandVal(0);
+ unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
+ unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
// Try to figure out the used register components
@@ -2354,8 +3131,9 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Adjust the writemask in the node
std::vector<SDValue> Ops;
+ Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
- Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end());
+ Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
// If we only got one lane, replace it with a copy
@@ -2421,14 +3199,15 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
/// \brief Fold the instructions after selecting them.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ unsigned Opcode = Node->getMachineOpcode();
- if (TII->isMIMG(Node->getMachineOpcode()))
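+ // Image stores produce no masked result and gather4 always returns four
+ // components, so the dmask is only adjusted for ordinary MIMG loads.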
+ if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
+ !TII->isGather4(Opcode))
adjustWritemask(Node, DAG);
- if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG ||
- Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) {
+ if (Opcode == AMDGPU::INSERT_SUBREG ||
+ Opcode == AMDGPU::REG_SEQUENCE) {
legalizeTargetIndependentNode(Node, DAG);
return Node;
}
@@ -2437,22 +3216,22 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
/// \brief Assign the register class depending on the number of
/// bits set in the writemask
-void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
- if (TII->isVOP3(MI->getOpcode())) {
+ if (TII->isVOP3(MI.getOpcode())) {
// Make sure constant bus requirements are respected.
TII->legalizeOperandsVOP3(MRI, MI);
return;
}
- if (TII->isMIMG(*MI)) {
- unsigned VReg = MI->getOperand(0).getReg();
- unsigned Writemask = MI->getOperand(1).getImm();
+ if (TII->isMIMG(MI)) {
+ unsigned VReg = MI.getOperand(0).getReg();
+ unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
+ unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
unsigned BitsSet = 0;
for (unsigned i = 0; i < 4; ++i)
BitsSet += Writemask & (1 << i) ? 1 : 0;
@@ -2465,34 +3244,58 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
case 3: RC = &AMDGPU::VReg_96RegClass; break;
}
- unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
- MI->setDesc(TII->get(NewOpcode));
+ unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
+ MI.setDesc(TII->get(NewOpcode));
MRI.setRegClass(VReg, RC);
return;
}
// Replace unused atomics with the no return version.
- int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode());
+ int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
if (NoRetAtomicOp != -1) {
if (!Node->hasAnyUseOfValue(0)) {
- MI->setDesc(TII->get(NoRetAtomicOp));
- MI->RemoveOperand(0);
+ MI.setDesc(TII->get(NoRetAtomicOp));
+ MI.RemoveOperand(0);
+ return;
}
+ // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
+ // instruction, because the return type of these instructions is a vec2 of
+ // the memory type, so it can be tied to the input operand.
+ // This means these instructions always have a use, so we need to add a
+ // special case to check if the atomic has only one extract_subreg use,
+ // which itself has no uses.
+ if ((Node->hasNUsesOfValue(1, 0) &&
+ Node->use_begin()->isMachineOpcode() &&
+ Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
+ !Node->use_begin()->hasAnyUseOfValue(0))) {
+ unsigned Def = MI.getOperand(0).getReg();
+
+ // Change this into a noret atomic.
+ MI.setDesc(TII->get(NoRetAtomicOp));
+ MI.RemoveOperand(0);
+
+ // If we only remove the def operand from the atomic instruction, the
+ // extract_subreg will be left with a use of a vreg without a def.
+ // So we need to insert an implicit_def to avoid machine verifier
+ // errors.
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(AMDGPU::IMPLICIT_DEF), Def);
+ }
return;
}
}
-static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) {
+static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
+ uint64_t Val) {
SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}
MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
- SDLoc DL,
+ const SDLoc &DL,
SDValue Ptr) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
// Build the half of the subregister with the constants before building the
// full 128-bit register. If we are building multiple resource descriptors,
@@ -2524,10 +3327,8 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
-MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
- SDLoc DL,
- SDValue Ptr,
- uint32_t RsrcDword1,
+MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Ptr, uint32_t RsrcDword1,
uint64_t RsrcDword2And3) const {
SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index f01b2c0d09f3..8e055eea58c2 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -12,26 +12,26 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H
-#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
#include "AMDGPUISelLowering.h"
#include "SIInstrInfo.h"
namespace llvm {
-class SITargetLowering : public AMDGPUTargetLowering {
- SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL,
+class SITargetLowering final : public AMDGPUTargetLowering {
+ SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain,
+ unsigned Offset) const;
+ SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
SDValue Chain, unsigned Offset, bool Signed) const;
- SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
- SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
-
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
MVT VT, unsigned Offset) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
@@ -43,8 +43,13 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
+ SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
+
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
SDValue performUCharToFloatCombine(SDNode *N,
@@ -55,14 +60,25 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
+
+ bool isCFIntrinsic(const SDNode *Intr) const;
+
+ void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
public:
- SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);
+ SITargetLowering(const TargetMachine &tm, const SISubtarget &STI);
+
+ const SISubtarget *getSubtarget() const;
+
+ bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
+ unsigned IntrinsicID) const override;
bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
EVT /*VT*/) const override;
@@ -89,21 +105,30 @@ public:
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
+ bool isTypeDesirableForOp(unsigned Op, EVT VT) const override;
+
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
+ const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
- SDValue LowerReturn(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const override;
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
+
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+
+ MachineBasicBlock *splitKillBlock(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
- MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
- MachineBasicBlock * BB) const override;
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const override;
bool enableAggressiveFMAFusion(EVT VT) const override;
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -112,7 +137,7 @@ public:
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
- void AdjustInstrPostInstrSelection(MachineInstr *MI,
+ void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const override;
int32_t analyzeImmediate(const SDNode *N) const;
@@ -120,17 +145,16 @@ public:
unsigned Reg, EVT VT) const override;
void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const;
- MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const;
- MachineSDNode *buildRSRC(SelectionDAG &DAG,
- SDLoc DL,
- SDValue Ptr,
- uint32_t RsrcDword1,
- uint64_t RsrcDword2And3) const;
+ MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Ptr) const;
+ MachineSDNode *buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr,
+ uint32_t RsrcDword1, uint64_t RsrcDword2And3) const;
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
- SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const;
+ SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
+ SDValue V) const;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
index 94e614750d2f..d24588d6c143 100644
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -26,6 +26,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#define DEBUG_TYPE "si-insert-waits"
+
using namespace llvm;
namespace {
@@ -53,7 +55,7 @@ typedef std::pair<unsigned, unsigned> RegInterval;
class SIInsertWaits : public MachineFunctionPass {
private:
- static char ID;
+ const SISubtarget *ST;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const MachineRegisterInfo *MRI;
@@ -67,6 +69,10 @@ private:
/// \brief Counter values we have already waited on.
Counters WaitedOn;
+ /// \brief Counter values that we must wait on before the next counter
+ /// increase.
+ Counters DelayedWaitOn;
+
/// \brief Counter values for last instruction issued.
Counters LastIssued;
@@ -87,6 +93,9 @@ private:
/// \brief Whether the machine function returns void
bool ReturnsVoid;
+ /// Whether the VCCZ bit is possibly corrupt
+ bool VCCZCorrupt;
+
/// \brief Get increment/decrement amount for this instruction.
Counters getHwCounts(MachineInstr &MI);
@@ -99,13 +108,17 @@ private:
/// \brief Handle an instruction's async components
void pushInstruction(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I);
+ MachineBasicBlock::iterator I,
+ const Counters& Increment);
/// \brief Insert the actual wait instruction
bool insertWait(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const Counters &Counts);
+ /// \brief Handle existing wait instructions (from intrinsics)
+ void handleExistingWait(MachineBasicBlock::iterator I);
+
/// \brief Do we need def2def checks?
bool unorderedDefines(MachineInstr &MI);
@@ -115,12 +128,20 @@ private:
/// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+ /// Return true if there are LGKM instructions that haven't been waited on
+ /// yet.
+ bool hasOutstandingLGKM() const;
+
public:
- SIInsertWaits(TargetMachine &tm) :
+ static char ID;
+
+ SIInsertWaits() :
MachineFunctionPass(ID),
+ ST(nullptr),
TII(nullptr),
TRI(nullptr),
- ExpInstrTypesSeen(0) { }
+ ExpInstrTypesSeen(0),
+ VCCZCorrupt(false) { }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -136,13 +157,28 @@ public:
} // End anonymous namespace
+INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
+ "SI Insert Waits", false, false)
+INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
+ "SI Insert Waits", false, false)
+
char SIInsertWaits::ID = 0;
-const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
+char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
+
+FunctionPass *llvm::createSIInsertWaitsPass() {
+ return new SIInsertWaits();
+}
+
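+// Maximum counts encodable in an s_waitcnt immediate: 4 bits for VM, 3 for
+// EXP, and (now) 4 for LGKM.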
+const Counters SIInsertWaits::WaitCounts = { { 15, 7, 15 } };
const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
-FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
- return new SIInsertWaits(tm);
+static bool readsVCCZ(unsigned Opcode) {
+ return Opcode == AMDGPU::S_CBRANCH_VCCNZ || Opcode == AMDGPU::S_CBRANCH_VCCZ;
+}
+
+bool SIInsertWaits::hasOutstandingLGKM() const {
+ return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
}
Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
@@ -205,24 +241,23 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
return false;
// Check if this operand is the value being stored.
- // Special case for DS instructions, since the address
+ // Special case for DS/FLAT instructions, since the address
// operand comes before the value operand and it may have
// multiple data operands.
- if (TII->isDS(MI)) {
+ if (TII->isDS(MI) || TII->isFLAT(MI)) {
MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
if (Data && Op.isIdenticalTo(*Data))
return true;
+ }
+ if (TII->isDS(MI)) {
MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
if (Data0 && Op.isIdenticalTo(*Data0))
return true;
MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
- if (Data1 && Op.isIdenticalTo(*Data1))
- return true;
-
- return false;
+ return Data1 && Op.isIdenticalTo(*Data1);
}
// NOTE: This assumes that the value operand is before the
@@ -250,10 +285,10 @@ RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
}
void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) {
+ MachineBasicBlock::iterator I,
+ const Counters &Increment) {
// Get the hardware counter increments and sum them up
- Counters Increment = getHwCounts(*I);
Counters Limit = ZeroCounts;
unsigned Sum = 0;
@@ -270,8 +305,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
return;
}
- if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
- AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
// Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
// or SMEM clause, respectively.
//
@@ -281,8 +315,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
// and destination registers don't overlap, e.g. this is illegal:
// r0 = load r2
// r2 = load r0
- if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) ||
- (LastOpcodeType == VMEM && Increment.Named.VM)) {
+ if (LastOpcodeType == VMEM && Increment.Named.VM) {
// Insert a NOP to break the clause.
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
.addImm(0);
@@ -379,7 +412,7 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
.addImm((Counts.Named.VM & 0xF) |
((Counts.Named.EXP & 0x7) << 4) |
- ((Counts.Named.LGKM & 0x7) << 8));
+ ((Counts.Named.LGKM & 0xF) << 8));
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
@@ -393,16 +426,38 @@ static void increaseCounters(Counters &Dst, const Counters &Src) {
Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
}
+/// \brief Check whether any of the counters is non-zero.
+static bool countersNonZero(const Counters &Counter) {
+ for (unsigned i = 0; i < 3; ++i)
+ if (Counter.Array[i])
+ return true;
+ return false;
+}
+
+void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
+ assert(I->getOpcode() == AMDGPU::S_WAITCNT);
+
+ unsigned Imm = I->getOperand(0).getImm();
+ Counters Counts, WaitOn;
+
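+ // Decode the s_waitcnt immediate: VM in bits [3:0], EXP in bits [6:4],
+ // LGKM in bits [11:8].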
+ Counts.Named.VM = Imm & 0xF;
+ Counts.Named.EXP = (Imm >> 4) & 0x7;
+ Counts.Named.LGKM = (Imm >> 8) & 0xF;
+
+ for (unsigned i = 0; i < 3; ++i) {
+ if (Counts.Array[i] <= LastIssued.Array[i])
+ WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
+ else
+ WaitOn.Array[i] = 0;
+ }
+
+ increaseCounters(DelayedWaitOn, WaitOn);
+}
+
Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
Counters Result = ZeroCounts;
- // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
- // but we also want to wait for any other outstanding transfers before
- // signalling other hardware blocks
- if (MI.getOpcode() == AMDGPU::S_SENDMSG)
- return LastIssued;
-
// For each register affected by this instruction, increase the result
// sequence.
//
@@ -432,8 +487,7 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) {
- if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <
- AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
return;
// There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
@@ -460,13 +514,13 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
bool Changes = false;
- TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
- TRI =
- static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
-
+ ST = &MF.getSubtarget<SISubtarget>();
+ TII = ST->getInstrInfo();
+ TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
WaitedOn = ZeroCounts;
+ DelayedWaitOn = ZeroCounts;
LastIssued = ZeroCounts;
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
@@ -475,6 +529,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
memset(&UsedRegs, 0, sizeof(UsedRegs));
memset(&DefinedRegs, 0, sizeof(DefinedRegs));
+ SmallVector<MachineInstr *, 4> RemoveMI;
+
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
@@ -482,27 +538,81 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
+ if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
+ // There is a hardware bug on CI/SI where an SMRD instruction may
+ // corrupt the vccz bit, so when we detect that an instruction may read
+ // from a corrupt vccz bit, we need to:
+ // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
+ // complete.
+ // 2. Restore the correct value of vccz by writing the current value
+ // of vcc back to vcc.
+
+ if (TII->isSMRD(I->getOpcode())) {
+ VCCZCorrupt = true;
+ } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
+ // FIXME: We only care about SMRD instructions here, not LDS or GDS.
+ // Whenever we store a value in vcc, the correct value of vccz is
+ // restored.
+ VCCZCorrupt = false;
+ }
+
+ // Check if we need to apply the bug work-around
+ if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) {
+ DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
+
+ // Wait on everything, not just LGKM. vccz reads usually come from
+ // terminators, and we always wait on everything at the end of the
+ // block, so if we only wait on LGKM here, we might end up with
+ // another s_waitcnt inserted right after this if there are non-LGKM
+ // instructions still outstanding.
+ insertWait(MBB, I, LastIssued);
+
+ // Restore the vccz bit. Any time a value is written to vcc, the vcc
+ // bit is updated, so we can restore the bit by reading the value of
+ // vcc and then writing it back to the register.
+ BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
+ AMDGPU::VCC)
+ .addReg(AMDGPU::VCC);
+ }
+ }
+
+ // Record pre-existing, explicitly requested waits
+ if (I->getOpcode() == AMDGPU::S_WAITCNT) {
+ handleExistingWait(*I);
+ RemoveMI.push_back(&*I);
+ continue;
+ }
+
+ Counters Required;
+
// Wait for everything before a barrier.
- if (I->getOpcode() == AMDGPU::S_BARRIER)
- Changes |= insertWait(MBB, I, LastIssued);
+ //
+ // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
+ // but we also want to wait for any other outstanding transfers before
+ // signalling other hardware blocks
+ if (I->getOpcode() == AMDGPU::S_BARRIER ||
+ I->getOpcode() == AMDGPU::S_SENDMSG)
+ Required = LastIssued;
else
- Changes |= insertWait(MBB, I, handleOperands(*I));
+ Required = handleOperands(*I);
+
+ Counters Increment = getHwCounts(*I);
- pushInstruction(MBB, I);
+ if (countersNonZero(Required) || countersNonZero(Increment))
+ increaseCounters(Required, DelayedWaitOn);
+
+ Changes |= insertWait(MBB, I, Required);
+
+ pushInstruction(MBB, I, Increment);
handleSendMsg(MBB, I);
}
// Wait for everything at the end of the MBB
Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
-
- // Functions returning something shouldn't contain S_ENDPGM, because other
- // bytecode will be appended after it.
- if (!ReturnsVoid) {
- MachineBasicBlock::iterator I = MBB.getFirstTerminator();
- if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
- I->eraseFromParent();
- }
}
+ for (MachineInstr *I : RemoveMI)
+ I->eraseFromParent();
+
return Changes;
}
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 0e883f64caa3..2f63d4ed13b3 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -11,8 +11,9 @@
//
//===----------------------------------------------------------------------===//
-class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
- AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+class InstSI <dag outs, dag ins, string asm = "",
+ list<dag> pattern = []> :
+ AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
field bits<1> VM_CNT = 0;
field bits<1> EXP_CNT = 0;
@@ -31,6 +32,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
field bits<1> VOP2 = 0;
field bits<1> VOP3 = 0;
field bits<1> VOPC = 0;
+ field bits<1> SDWA = 0;
+ field bits<1> DPP = 0;
field bits<1> MUBUF = 0;
field bits<1> MTBUF = 0;
@@ -45,6 +48,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
// is unable to infer the encoding from the operands.
field bits<1> VOPAsmPrefer32Bit = 0;
+ field bits<1> Gather4 = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = VM_CNT;
let TSFlags{1} = EXP_CNT;
@@ -63,18 +68,33 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
let TSFlags{11} = VOP2;
let TSFlags{12} = VOP3;
let TSFlags{13} = VOPC;
-
- let TSFlags{14} = MUBUF;
- let TSFlags{15} = MTBUF;
- let TSFlags{16} = SMRD;
- let TSFlags{17} = DS;
- let TSFlags{18} = MIMG;
- let TSFlags{19} = FLAT;
- let TSFlags{20} = WQM;
- let TSFlags{21} = VGPRSpill;
- let TSFlags{22} = VOPAsmPrefer32Bit;
+ let TSFlags{14} = SDWA;
+ let TSFlags{15} = DPP;
+
+ let TSFlags{16} = MUBUF;
+ let TSFlags{17} = MTBUF;
+ let TSFlags{18} = SMRD;
+ let TSFlags{19} = DS;
+ let TSFlags{20} = MIMG;
+ let TSFlags{21} = FLAT;
+ let TSFlags{22} = WQM;
+ let TSFlags{23} = VGPRSpill;
+ let TSFlags{24} = VOPAsmPrefer32Bit;
+ let TSFlags{25} = Gather4;
let SchedRW = [Write32Bit];
+
+ field bits<1> DisableSIDecoder = 0;
+ field bits<1> DisableVIDecoder = 0;
+ field bits<1> DisableDecoder = 0;
+
+ let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1);
+}
+
+class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
+ : InstSI<outs, ins, "", pattern> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
}
class Enc32 {
@@ -123,8 +143,10 @@ class VOP2Common <dag outs, dag ins, string asm, list<dag> pattern> :
let Size = 4;
}
-class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
- VOPAnyCommon <outs, ins, asm, pattern> {
+class VOP3Common <dag outs, dag ins, string asm = "",
+ list<dag> pattern = [], bit HasMods = 0,
+ bit VOP3Only = 0> :
+ VOPAnyCommon <outs, ins, asm, pattern> {
// Using complex patterns gives VOP3 patterns a very high complexity rating,
// but standalone patterns are almost always preferred, so we need to adjust the
@@ -135,7 +157,11 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
let VOP3 = 1;
let VALU = 1;
- let AsmMatchConverter = "cvtVOP3";
+ let AsmMatchConverter =
+ !if(!eq(VOP3Only,1),
+ "cvtVOP3",
+ !if(!eq(HasMods,1), "cvtVOP3_2_mod", ""));
+
let isCodeGenOnly = 0;
int Size = 8;
@@ -154,9 +180,9 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
class SOP1e <bits<8> op> : Enc32 {
bits<7> sdst;
- bits<8> ssrc0;
+ bits<8> src0;
- let Inst{7-0} = ssrc0;
+ let Inst{7-0} = src0;
let Inst{15-8} = op;
let Inst{22-16} = sdst;
let Inst{31-23} = 0x17d; //encoding;
@@ -164,22 +190,22 @@ class SOP1e <bits<8> op> : Enc32 {
class SOP2e <bits<7> op> : Enc32 {
bits<7> sdst;
- bits<8> ssrc0;
- bits<8> ssrc1;
+ bits<8> src0;
+ bits<8> src1;
- let Inst{7-0} = ssrc0;
- let Inst{15-8} = ssrc1;
+ let Inst{7-0} = src0;
+ let Inst{15-8} = src1;
let Inst{22-16} = sdst;
let Inst{29-23} = op;
let Inst{31-30} = 0x2; // encoding
}
class SOPCe <bits<7> op> : Enc32 {
- bits<8> ssrc0;
- bits<8> ssrc1;
+ bits<8> src0;
+ bits<8> src1;
- let Inst{7-0} = ssrc0;
- let Inst{15-8} = ssrc1;
+ let Inst{7-0} = src0;
+ let Inst{15-8} = src1;
let Inst{22-16} = op;
let Inst{31-23} = 0x17e;
}
@@ -218,9 +244,7 @@ class SOPPe <bits<7> op> : Enc32 {
class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
bits<7> sdst;
bits<7> sbase;
- bits<8> offset;
- let Inst{7-0} = offset;
let Inst{8} = imm;
let Inst{14-9} = sbase{6-1};
let Inst{21-15} = sdst;
@@ -228,6 +252,18 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
let Inst{31-27} = 0x18; //encoding
}
+class SMRD_IMMe <bits<5> op> : SMRDe<op, 1> {
+ bits<8> offset;
+ let Inst{7-0} = offset;
+}
+
+class SMRD_SOFFe <bits<5> op> : SMRDe<op, 0> {
+ bits<8> soff;
+ let Inst{7-0} = soff;
+}
+
class SMRD_IMMe_ci <bits<5> op> : Enc64 {
bits<7> sdst;
bits<7> sbase;
@@ -348,19 +384,18 @@ class VOP2_MADKe <bits<6> op> : Enc64 {
bits<8> vdst;
bits<9> src0;
- bits<8> vsrc1;
- bits<32> src2;
+ bits<8> src1;
+ bits<32> imm;
let Inst{8-0} = src0;
- let Inst{16-9} = vsrc1;
+ let Inst{16-9} = src1;
let Inst{24-17} = vdst;
let Inst{30-25} = op;
let Inst{31} = 0x0; // encoding
- let Inst{63-32} = src2;
+ let Inst{63-32} = imm;
}
-class VOP3e <bits<9> op> : Enc64 {
- bits<8> vdst;
+class VOP3a <bits<9> op> : Enc64 {
bits<2> src0_modifiers;
bits<9> src0;
bits<2> src1_modifiers;
@@ -370,7 +405,6 @@ class VOP3e <bits<9> op> : Enc64 {
bits<1> clamp;
bits<2> omod;
- let Inst{7-0} = vdst;
let Inst{8} = src0_modifiers{1};
let Inst{9} = src1_modifiers{1};
let Inst{10} = src2_modifiers{1};
@@ -386,6 +420,20 @@ class VOP3e <bits<9> op> : Enc64 {
let Inst{63} = src2_modifiers{0};
}
+class VOP3e <bits<9> op> : VOP3a <op> {
+ bits<8> vdst;
+
+ let Inst{7-0} = vdst;
+}
+
+// Encoding used for VOPC instructions encoded as VOP3
+// Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst
+class VOP3ce <bits<9> op> : VOP3a <op> {
+ bits<8> sdst;
+
+ let Inst{7-0} = sdst;
+}
+
class VOP3be <bits<9> op> : Enc64 {
bits<8> vdst;
bits<2> src0_modifiers;
@@ -412,10 +460,10 @@ class VOP3be <bits<9> op> : Enc64 {
class VOPCe <bits<8> op> : Enc32 {
bits<9> src0;
- bits<8> vsrc1;
+ bits<8> src1;
let Inst{8-0} = src0;
- let Inst{16-9} = vsrc1;
+ let Inst{16-9} = src1;
let Inst{24-17} = op;
let Inst{31-25} = 0x3e;
}
@@ -675,17 +723,17 @@ class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let UseNamedOperandTable = 1;
let hasSideEffects = 0;
- let AsmMatchConverter = "cvtFlat";
let SchedRW = [WriteVMEM];
}
-class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern>, MIMGe <op> {
+class MIMG <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
let VM_CNT = 1;
let EXP_CNT = 1;
let MIMG = 1;
let Uses = [EXEC];
+ let UseNamedOperandTable = 1;
let hasSideEffects = 0; // XXX ????
}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1e10d25e8fb7..d171e21c8a4f 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -12,14 +12,15 @@
//
//===----------------------------------------------------------------------===//
-
#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/Function.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -27,8 +28,8 @@
using namespace llvm;
-SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
- : AMDGPUInstrInfo(st), RI() {}
+SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
+ : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
@@ -74,12 +75,12 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}
-bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
+bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
AliasAnalysis *AA) const {
// TODO: The generic check fails for VALU instructions that should be
// rematerializable due to implicit reads of exec. We really want all of the
// generic logic for this except for this.
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
@@ -201,18 +202,18 @@ static bool isStride64(unsigned Opc) {
}
}
-bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
- unsigned &Offset,
+bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset,
const TargetRegisterInfo *TRI) const {
- unsigned Opc = LdSt->getOpcode();
+ unsigned Opc = LdSt.getOpcode();
- if (isDS(*LdSt)) {
- const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
- AMDGPU::OpName::offset);
+ if (isDS(LdSt)) {
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
if (OffsetImm) {
// Normal, single offset LDS instruction.
- const MachineOperand *AddrReg = getNamedOperand(*LdSt,
- AMDGPU::OpName::addr);
+ const MachineOperand *AddrReg =
+ getNamedOperand(LdSt, AMDGPU::OpName::addr);
BaseReg = AddrReg->getReg();
Offset = OffsetImm->getImm();
@@ -222,10 +223,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
// The 2 offset instructions use offset0 and offset1 instead. We can treat
// these as a load with a single offset if the 2 offsets are consecutive. We
// will use this for some partially aligned loads.
- const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
- AMDGPU::OpName::offset0);
- const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
- AMDGPU::OpName::offset1);
+ const MachineOperand *Offset0Imm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset0);
+ const MachineOperand *Offset1Imm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset1);
uint8_t Offset0 = Offset0Imm->getImm();
uint8_t Offset1 = Offset1Imm->getImm();
@@ -235,19 +236,19 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
// to bytes of the individual reads.
unsigned EltSize;
- if (LdSt->mayLoad())
- EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
+ if (LdSt.mayLoad())
+ EltSize = getOpRegClass(LdSt, 0)->getSize() / 2;
else {
- assert(LdSt->mayStore());
+ assert(LdSt.mayStore());
int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
- EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
+ EltSize = getOpRegClass(LdSt, Data0Idx)->getSize();
}
if (isStride64(Opc))
EltSize *= 64;
- const MachineOperand *AddrReg = getNamedOperand(*LdSt,
- AMDGPU::OpName::addr);
+ const MachineOperand *AddrReg =
+ getNamedOperand(LdSt, AMDGPU::OpName::addr);
BaseReg = AddrReg->getReg();
Offset = EltSize * Offset0;
return true;
@@ -256,63 +257,91 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
return false;
}
- if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) {
+ if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
return false;
- const MachineOperand *AddrReg = getNamedOperand(*LdSt,
- AMDGPU::OpName::vaddr);
+ const MachineOperand *AddrReg =
+ getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
if (!AddrReg)
return false;
- const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
- AMDGPU::OpName::offset);
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
BaseReg = AddrReg->getReg();
Offset = OffsetImm->getImm();
return true;
}
- if (isSMRD(*LdSt)) {
- const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
- AMDGPU::OpName::offset);
+ if (isSMRD(LdSt)) {
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
if (!OffsetImm)
return false;
- const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
- AMDGPU::OpName::sbase);
+ const MachineOperand *SBaseReg =
+ getNamedOperand(LdSt, AMDGPU::OpName::sbase);
BaseReg = SBaseReg->getReg();
Offset = OffsetImm->getImm();
return true;
}
+ if (isFLAT(LdSt)) {
+ const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::addr);
+ BaseReg = AddrReg->getReg();
+ Offset = 0;
+ return true;
+ }
+
return false;
}
-bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
- MachineInstr *SecondLdSt,
- unsigned NumLoads) const {
- // TODO: This needs finer tuning
- if (NumLoads > 4)
+bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
+ MachineInstr &SecondLdSt,
+ unsigned NumLoads) const {
+ const MachineOperand *FirstDst = nullptr;
+ const MachineOperand *SecondDst = nullptr;
+
+ if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
+ FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
+ SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
+ }
+
+ if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
+ FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
+ SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
+ }
+
+ if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
+ (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) {
+ FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
+ SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
+ }
+
+ if (!FirstDst || !SecondDst)
return false;
- if (isDS(*FirstLdSt) && isDS(*SecondLdSt))
- return true;
+ // Try to limit clustering based on the total number of bytes loaded
+ // rather than the number of instructions. This is done to help reduce
+ // register pressure. The method used is somewhat inexact, though,
+ // because it assumes that all loads in the cluster will load the
+ // same number of bytes as FirstLdSt.
- if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt))
- return true;
+ // The unit of this value is bytes.
+ // FIXME: This needs finer tuning.
+ unsigned LoadClusterThreshold = 16;
- if ((isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) &&
- (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt)))
- return true;
+ const MachineRegisterInfo &MRI =
+ FirstLdSt.getParent()->getParent()->getRegInfo();
+ const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
- return false;
+ return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
}
-void
-SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
// If we are trying to copy to or from SCC, there is a bug somewhere else in
// the backend. While it may be theoretically possible to do this, it should
@@ -361,7 +390,6 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned Opcode;
ArrayRef<int16_t> SubIndices;
- bool Forward;
if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
@@ -445,10 +473,7 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
llvm_unreachable("Can't copy register!");
}
- if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg))
- Forward = true;
- else
- Forward = false;
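+ // Copy sub-registers in an order that reads any overlapping source
+ // registers before they are overwritten.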
+ bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
unsigned SubIdx;
@@ -463,10 +488,12 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
if (Idx == SubIndices.size() - 1)
- Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit);
+ Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
if (Idx == 0)
Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
+
+ Builder.addReg(SrcReg, RegState::Implicit);
}
}
@@ -525,6 +552,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V32_SAVE;
case 8:
return AMDGPU::SI_SPILL_V64_SAVE;
+ case 12:
+ return AMDGPU::SI_SPILL_V96_SAVE;
case 16:
return AMDGPU::SI_SPILL_V128_SAVE;
case 32:
@@ -558,19 +587,25 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) {
+ // m0 may not be allowed for readlane.
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
+ }
+
// We are only allowed to create one new instruction when spilling
// registers, so we need to use pseudo instruction for spilling
// SGPRs.
unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
BuildMI(MBB, MI, DL, get(Opcode))
- .addReg(SrcReg) // src
+ .addReg(SrcReg, getKillRegState(isKill)) // src
.addFrameIndex(FrameIndex) // frame_idx
.addMemOperand(MMO);
return;
}
- if (!ST.isVGPRSpillingEnabled(MFI)) {
+ if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
LLVMContext &Ctx = MF->getFunction()->getContext();
Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
" spill register");
@@ -585,10 +620,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
- .addReg(SrcReg) // src
+ .addReg(SrcReg, getKillRegState(isKill)) // src
.addFrameIndex(FrameIndex) // frame_idx
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(0) // offset
.addMemOperand(MMO);
}
@@ -615,6 +651,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V32_RESTORE;
case 8:
return AMDGPU::SI_SPILL_V64_RESTORE;
+ case 12:
+ return AMDGPU::SI_SPILL_V96_RESTORE;
case 16:
return AMDGPU::SI_SPILL_V128_RESTORE;
case 32:
@@ -648,6 +686,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
// FIXME: Maybe this should not include a memoperand because it will be
// lowered to non-memory instructions.
unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());
+
+ if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) {
+ // m0 may not be allowed for readlane.
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
+ }
+
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // frame_idx
.addMemOperand(MMO);
@@ -655,7 +700,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
return;
}
- if (!ST.isVGPRSpillingEnabled(MFI)) {
+ if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
LLVMContext &Ctx = MF->getFunction()->getContext();
Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
" restore register");
@@ -671,20 +716,18 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
.addFrameIndex(FrameIndex) // frame_idx
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(0) // offset
.addMemOperand(MMO);
}
/// \param Offset Offset in bytes of the FrameIndex being spilled
-unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- RegScavenger *RS, unsigned TmpReg,
- unsigned FrameOffset,
- unsigned Size) const {
+unsigned SIInstrInfo::calculateLDSSpillAddress(
+ MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
+ unsigned FrameOffset, unsigned Size) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
unsigned WavefrontSize = ST.getWavefrontSize();
@@ -699,8 +742,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
if (TIDReg == AMDGPU::NoRegister)
return TIDReg;
-
- if (MFI->getShaderType() == ShaderType::COMPUTE &&
+ if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
WorkGroupSize > WavefrontSize) {
unsigned TIDIGXReg
@@ -716,7 +758,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
Entry.addLiveIn(Reg);
}
- RS->enterBasicBlock(&Entry);
+ RS->enterBasicBlock(Entry);
// FIXME: Can we scavenge an SReg_64 and access the subregs?
unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
@@ -773,8 +815,10 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
return TmpReg;
}
-void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI,
+void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
int Count) const {
+ DebugLoc DL = MBB.findDebugLoc(MI);
while (Count > 0) {
int Arg;
if (Count >= 8)
@@ -782,76 +826,87 @@ void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI,
else
Arg = Count - 1;
Count -= 8;
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
.addImm(Arg);
}
}
-bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
- MachineBasicBlock &MBB = *MI->getParent();
+void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ insertWaitStates(MBB, MI, 1);
+}
+
+unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default: return 1; // FIXME: Do wait states equal cycles?
+
+ case AMDGPU::S_NOP:
+ return MI.getOperand(0).getImm() + 1;
+ }
+}
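+
insertWaitStates packs an arbitrary wait-state count into S_NOP
instructions, and getNumWaitStates inverts the encoding: each S_NOP covers
imm + 1 wait states with imm in [0, 7]. A runnable sketch of the same
decomposition (standalone, not the backend's code):

    #include <cassert>
    #include <vector>

    // Decompose Count wait states into S_NOP immediates; one S_NOP covers
    // at most 8 states, so emit 7s until fewer than 8 remain.
    static std::vector<int> nopImms(int Count) {
      std::vector<int> Imms;
      while (Count > 0) {
        Imms.push_back(Count >= 8 ? 7 : Count - 1);
        Count -= 8;
      }
      return Imms;
    }

    int main() {
      assert(nopImms(1) == std::vector<int>({0}));     // s_nop 0
      assert(nopImms(10) == std::vector<int>({7, 1})); // 8 + 2 wait states
      return 0;
    }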
+
+bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
- case AMDGPU::SGPR_USE:
- // This is just a placeholder for register allocation.
- MI->eraseFromParent();
- break;
-
case AMDGPU::V_MOV_B64_PSEUDO: {
- unsigned Dst = MI->getOperand(0).getReg();
+ unsigned Dst = MI.getOperand(0).getReg();
unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
- const MachineOperand &SrcOp = MI->getOperand(1);
+ const MachineOperand &SrcOp = MI.getOperand(1);
// FIXME: Will this work for 64-bit floating point immediates?
assert(!SrcOp.isFPImm());
if (SrcOp.isImm()) {
APInt Imm(64, SrcOp.getImm());
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
- .addImm(Imm.getLoBits(32).getZExtValue())
- .addReg(Dst, RegState::Implicit);
+ .addImm(Imm.getLoBits(32).getZExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
- .addImm(Imm.getHiBits(32).getZExtValue())
- .addReg(Dst, RegState::Implicit);
+ .addImm(Imm.getHiBits(32).getZExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
} else {
assert(SrcOp.isReg());
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
- .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
- .addReg(Dst, RegState::Implicit);
+ .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
+ .addReg(Dst, RegState::Implicit | RegState::Define);
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
- .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
- .addReg(Dst, RegState::Implicit);
+ .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
+ .addReg(Dst, RegState::Implicit | RegState::Define);
}
- MI->eraseFromParent();
+ MI.eraseFromParent();
break;
}
case AMDGPU::V_CNDMASK_B64_PSEUDO: {
- unsigned Dst = MI->getOperand(0).getReg();
+ unsigned Dst = MI.getOperand(0).getReg();
unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
- unsigned Src0 = MI->getOperand(1).getReg();
- unsigned Src1 = MI->getOperand(2).getReg();
- const MachineOperand &SrcCond = MI->getOperand(3);
+ unsigned Src0 = MI.getOperand(1).getReg();
+ unsigned Src1 = MI.getOperand(2).getReg();
+ const MachineOperand &SrcCond = MI.getOperand(3);
BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
- .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
- .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
- .addOperand(SrcCond);
+ .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
+ .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
+ .addReg(SrcCond.getReg())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
- .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
- .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
- .addOperand(SrcCond);
- MI->eraseFromParent();
+ .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
+ .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
+ .addReg(SrcCond.getReg(), getKillRegState(SrcCond.isKill()))
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ MI.eraseFromParent();
break;
}
- case AMDGPU::SI_CONSTDATA_PTR: {
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
+ case AMDGPU::SI_PC_ADD_REL_OFFSET: {
+ const SIRegisterInfo *TRI
+ = static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
MachineFunction &MF = *MBB.getParent();
- unsigned Reg = MI->getOperand(0).getReg();
+ unsigned Reg = MI.getOperand(0).getReg();
unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0);
unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1);
@@ -863,15 +918,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
// Add 32-bit offset from this instruction to the start of the
// constant data.
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
- .addReg(RegLo)
- .addOperand(MI->getOperand(1)));
+ .addReg(RegLo)
+ .addOperand(MI.getOperand(1)));
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
.addReg(RegHi)
.addImm(0));
llvm::finalizeBundle(MBB, Bundler.begin());
- MI->eraseFromParent();
+ MI.eraseFromParent();
break;
}
}
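
The V_MOV_B64_PSEUDO expansion above splits one 64-bit move into two
V_MOV_B32_e32 of the sub0/sub1 halves. For the immediate case that is just
lo/hi extraction, sketched here without the APInt machinery:

    #include <cassert>
    #include <cstdint>

    // Split a 64-bit immediate into the two 32-bit pieces fed to the
    // V_MOV_B32_e32 pair.
    static void splitImm64(uint64_t Imm, uint32_t &Lo, uint32_t &Hi) {
      Lo = static_cast<uint32_t>(Imm & 0xFFFFFFFFu);
      Hi = static_cast<uint32_t>(Imm >> 32);
    }

    int main() {
      uint32_t Lo, Hi;
      splitImm64(0x123456789ABCDEF0ull, Lo, Hi);
      assert(Lo == 0x9ABCDEF0u && Hi == 0x12345678u);
      return 0;
    }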
@@ -885,22 +940,21 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
/// non-commutable pair of operand indices OpIdx0 and OpIdx1.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands; a null pointer is returned in such cases.
-MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
- bool NewMI,
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx0,
unsigned OpIdx1) const {
- int CommutedOpcode = commuteOpcode(*MI);
+ int CommutedOpcode = commuteOpcode(MI);
if (CommutedOpcode == -1)
return nullptr;
- int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::src0);
- MachineOperand &Src0 = MI->getOperand(Src0Idx);
+ int Src0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
if (!Src0.isReg())
return nullptr;
- int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::src1);
+ int Src1Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
if ((OpIdx0 != static_cast<unsigned>(Src0Idx) ||
OpIdx1 != static_cast<unsigned>(Src1Idx)) &&
@@ -908,33 +962,32 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
OpIdx1 != static_cast<unsigned>(Src0Idx)))
return nullptr;
- MachineOperand &Src1 = MI->getOperand(Src1Idx);
+ MachineOperand &Src1 = MI.getOperand(Src1Idx);
-
- if (isVOP2(*MI)) {
- const MCInstrDesc &InstrDesc = MI->getDesc();
- // For VOP2 instructions, any operand type is valid to use for src0. Make
- // sure we can use the src1 as src0.
+ if (isVOP2(MI) || isVOPC(MI)) {
+ const MCInstrDesc &InstrDesc = MI.getDesc();
+ // For VOP2 and VOPC instructions, any operand type is valid to use for
+ // src0. Make sure we can use the src0 as src1.
//
// We could be stricter here and only allow commuting if there is a reason
// to do so, e.g. if both operands are VGPRs there is no real benefit,
// although MachineCSE attempts to find matches by commuting.
- const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0))
return nullptr;
}
+ MachineInstr *CommutedMI = &MI;
if (!Src1.isReg()) {
// Allow commuting instructions with Imm operands.
- if (NewMI || !Src1.isImm() ||
- (!isVOP2(*MI) && !isVOP3(*MI))) {
+ if (NewMI || !Src1.isImm() || (!isVOP2(MI) && !isVOP3(MI))) {
return nullptr;
}
// Be sure to copy the source modifiers to the right place.
- if (MachineOperand *Src0Mods
- = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
- MachineOperand *Src1Mods
- = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers);
+ if (MachineOperand *Src0Mods =
+ getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) {
+ MachineOperand *Src1Mods =
+ getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
int Src0ModsVal = Src0Mods->getImm();
if (!Src1Mods && Src0ModsVal != 0)
@@ -959,26 +1012,26 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
Src1.ChangeToRegister(Reg, false);
Src1.setSubReg(SubReg);
} else {
- MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
+ CommutedMI =
+ TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
}
- if (MI)
- MI->setDesc(get(CommutedOpcode));
+ if (CommutedMI)
+ CommutedMI->setDesc(get(CommutedOpcode));
- return MI;
+ return CommutedMI;
}
// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
-bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
- unsigned &SrcOpIdx0,
+bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
unsigned &SrcOpIdx1) const {
- const MCInstrDesc &MCID = MI->getDesc();
+ const MCInstrDesc &MCID = MI.getDesc();
if (!MCID.isCommutable())
return false;
- unsigned Opc = MI->getOpcode();
+ unsigned Opc = MI.getOpcode();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
if (Src0Idx == -1)
return false;
@@ -986,24 +1039,24 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
// FIXME: Work around TargetInstrInfo::commuteInstruction asserting on
// an immediate. Also, an immediate src0 operand is not handled in
// SIInstrInfo::commuteInstruction().
- if (!MI->getOperand(Src0Idx).isReg())
+ if (!MI.getOperand(Src0Idx).isReg())
return false;
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx == -1)
return false;
- MachineOperand &Src1 = MI->getOperand(Src1Idx);
+ MachineOperand &Src1 = MI.getOperand(Src1Idx);
if (Src1.isImm()) {
// SIInstrInfo::commuteInstruction() does support commuting the immediate
// operand src1 in 2 and 3 operand instructions.
- if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))
+ if (!isVOP2(MI.getOpcode()) && !isVOP3(MI.getOpcode()))
return false;
} else if (Src1.isReg()) {
// If any source modifiers are set, the generic instruction commuting won't
// understand how to copy the source modifiers.
- if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
- hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
+ if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+ hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))
return false;
} else
return false;
@@ -1011,23 +1064,135 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}
-MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned DstReg,
- unsigned SrcReg) const {
- return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),
- DstReg) .addReg(SrcReg);
+unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
+ switch (Cond) {
+ case SIInstrInfo::SCC_TRUE:
+ return AMDGPU::S_CBRANCH_SCC1;
+ case SIInstrInfo::SCC_FALSE:
+ return AMDGPU::S_CBRANCH_SCC0;
+ case SIInstrInfo::VCCNZ:
+ return AMDGPU::S_CBRANCH_VCCNZ;
+ case SIInstrInfo::VCCZ:
+ return AMDGPU::S_CBRANCH_VCCZ;
+ case SIInstrInfo::EXECNZ:
+ return AMDGPU::S_CBRANCH_EXECNZ;
+ case SIInstrInfo::EXECZ:
+ return AMDGPU::S_CBRANCH_EXECZ;
+ default:
+ llvm_unreachable("invalid branch predicate");
+ }
+}
+
+SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
+ switch (Opcode) {
+ case AMDGPU::S_CBRANCH_SCC0:
+ return SCC_FALSE;
+ case AMDGPU::S_CBRANCH_SCC1:
+ return SCC_TRUE;
+ case AMDGPU::S_CBRANCH_VCCNZ:
+ return VCCNZ;
+ case AMDGPU::S_CBRANCH_VCCZ:
+ return VCCZ;
+ case AMDGPU::S_CBRANCH_EXECNZ:
+ return EXECNZ;
+ case AMDGPU::S_CBRANCH_EXECZ:
+ return EXECZ;
+ default:
+ return INVALID_BR;
+ }
}
-bool SIInstrInfo::isMov(unsigned Opcode) const {
- switch(Opcode) {
- default: return false;
- case AMDGPU::S_MOV_B32:
- case AMDGPU::S_MOV_B64:
- case AMDGPU::V_MOV_B32_e32:
- case AMDGPU::V_MOV_B32_e64:
+bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+
+ if (I == MBB.end())
+ return false;
+
+ if (I->getOpcode() == AMDGPU::S_BRANCH) {
+ // Unconditional Branch
+ TBB = I->getOperand(0).getMBB();
+ return false;
+ }
+
+ BranchPredicate Pred = getBranchPredicate(I->getOpcode());
+ if (Pred == INVALID_BR)
return true;
+
+ MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(Pred));
+
+ ++I;
+
+ if (I == MBB.end()) {
+ // Conditional branch followed by fall-through.
+ TBB = CondBB;
+ return false;
+ }
+
+ if (I->getOpcode() == AMDGPU::S_BRANCH) {
+ TBB = CondBB;
+ FBB = I->getOperand(0).getMBB();
+ return false;
+ }
+
+ return true;
+}
+
+unsigned SIInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+
+ unsigned Count = 0;
+ while (I != MBB.end()) {
+ MachineBasicBlock::iterator Next = std::next(I);
+ I->eraseFromParent();
+ ++Count;
+ I = Next;
+ }
+
+ return Count;
+}
+
+unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
+
+ if (!FBB && Cond.empty()) {
+ BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+ .addMBB(TBB);
+ return 1;
+ }
+
+ assert(TBB && Cond[0].isImm());
+
+ unsigned Opcode
+ = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
+
+ if (!FBB) {
+ BuildMI(&MBB, DL, get(Opcode))
+ .addMBB(TBB);
+ return 1;
}
+
+ assert(TBB && FBB);
+
+ BuildMI(&MBB, DL, get(Opcode))
+ .addMBB(TBB);
+ BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+ .addMBB(FBB);
+
+ return 2;
+}
+
+bool SIInstrInfo::ReverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 1);
+ Cond[0].setImm(-Cond[0].getImm());
+ return false;
}
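
ReverseBranchCondition can get away with negating the immediate only if
BranchPredicate encodes each condition and its inverse as a +/- pair. The
enum values are not visible in this diff, so the pairing below is an
assumption used purely to illustrate the trick:

    #include <cassert>

    // Hypothetical +/- pairing that makes Cond = -Cond flip a predicate.
    enum BranchPredicate { SCC_FALSE = -1, INVALID_BR = 0, SCC_TRUE = 1,
                           VCCZ = -2, VCCNZ = 2 };

    int main() {
      int Cond = SCC_TRUE;
      Cond = -Cond; // what ReverseBranchCondition does
      assert(Cond == SCC_FALSE);
      return 0;
    }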
static void removeModOperands(MachineInstr &MI) {
@@ -1044,81 +1209,76 @@ static void removeModOperands(MachineInstr &MI) {
MI.RemoveOperand(Src0ModIdx);
}
-bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+// TODO: Maybe this should be removed and everything custom folded in
+// SIFoldOperands?
+bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned Reg, MachineRegisterInfo *MRI) const {
if (!MRI->hasOneNonDBGUse(Reg))
return false;
- unsigned Opc = UseMI->getOpcode();
+ unsigned Opc = UseMI.getOpcode();
if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
// Don't fold if we are using source modifiers. The new VOP2 instructions
// don't have them.
- if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
- hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
- hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
+ if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
+ hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) ||
+ hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) {
return false;
}
- MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
- MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
- MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);
+ const MachineOperand &ImmOp = DefMI.getOperand(1);
+
+ // If this is a free constant, there's no reason to do this.
+ // TODO: We could fold this here instead of letting SIFoldOperands do it
+ // later.
+ if (isInlineConstant(ImmOp, 4))
+ return false;
+
+ MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
+ MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
// Multiplied part is the constant: Use v_madmk_f32
// We should only expect these to be on src0 due to canonicalizations.
if (Src0->isReg() && Src0->getReg() == Reg) {
- if (!Src1->isReg() ||
- (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+ if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
return false;
- if (!Src2->isReg() ||
- (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))))
+ if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
return false;
- // We need to do some weird looking operand shuffling since the madmk
- // operands are out of the normal expected order with the multiplied
- // constant as the last operand.
- //
- // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1
- // src0 -> src2 K
- // src1 -> src0
- // src2 -> src1
+      // We need to swap operands 0 and 1 since the madmk constant is at operand 1.
- const int64_t Imm = DefMI->getOperand(1).getImm();
+ const int64_t Imm = DefMI.getOperand(1).getImm();
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
// Remove these first since they are at the end.
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::omod));
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::clamp));
+ UseMI.RemoveOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
+ UseMI.RemoveOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
unsigned Src1Reg = Src1->getReg();
unsigned Src1SubReg = Src1->getSubReg();
- unsigned Src2Reg = Src2->getReg();
- unsigned Src2SubReg = Src2->getSubReg();
Src0->setReg(Src1Reg);
Src0->setSubReg(Src1SubReg);
Src0->setIsKill(Src1->isKill());
- Src1->setReg(Src2Reg);
- Src1->setSubReg(Src2SubReg);
- Src1->setIsKill(Src2->isKill());
-
if (Opc == AMDGPU::V_MAC_F32_e64) {
- UseMI->untieRegOperand(
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+ UseMI.untieRegOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
}
- Src2->ChangeToImmediate(Imm);
+ Src1->ChangeToImmediate(Imm);
- removeModOperands(*UseMI);
- UseMI->setDesc(get(AMDGPU::V_MADMK_F32));
+ removeModOperands(UseMI);
+ UseMI.setDesc(get(AMDGPU::V_MADMK_F32));
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
- DefMI->eraseFromParent();
+ DefMI.eraseFromParent();
return true;
}
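
The fold above relies on V_MADMK_F32 computing D = S0 * K + S1 (per the SI
ISA), so moving the old src1 register into src0 and dropping the constant
into src1's slot preserves the value of K * src1 + src2. A scalar model of
the before/after (plain floats, not MachineOperands):

    #include <cassert>

    // Before: v_mad_f32   d = s0 * s1 + s2, with s0 defined by a move of K.
    // After:  v_madmk_f32 d = s0' * K + s1', where s0' = old s1, s1' = old s2.
    static float mad(float S0, float S1, float S2) { return S0 * S1 + S2; }
    static float madmk(float S0, float K, float S1) { return S0 * K + S1; }

    int main() {
      const float K = 4.0f, A = 2.0f, B = 3.0f;
      assert(mad(K, A, B) == madmk(A, K, B)); // the fold preserves the value
      return 0;
    }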
@@ -1131,36 +1291,35 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
(Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
return false;
- if (!Src1->isReg() ||
- (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+ if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
return false;
- const int64_t Imm = DefMI->getOperand(1).getImm();
+ const int64_t Imm = DefMI.getOperand(1).getImm();
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
// Remove these first since they are at the end.
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::omod));
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::clamp));
+ UseMI.RemoveOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
+ UseMI.RemoveOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
if (Opc == AMDGPU::V_MAC_F32_e64) {
- UseMI->untieRegOperand(
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+ UseMI.untieRegOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
}
// ChangeToImmediate adds Src2 back to the instruction.
Src2->ChangeToImmediate(Imm);
// These come before src2.
- removeModOperands(*UseMI);
- UseMI->setDesc(get(AMDGPU::V_MADAK_F32));
+ removeModOperands(UseMI);
+ UseMI.setDesc(get(AMDGPU::V_MADAK_F32));
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
- DefMI->eraseFromParent();
+ DefMI.eraseFromParent();
return true;
}
@@ -1177,17 +1336,20 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
return LowOffset + LowWidth <= HighOffset;
}
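
offsetsDoNotOverlap orders the two accesses by offset and checks that the
lower one ends before the higher one begins. Only its return statement is
visible in this hunk; the surrounding logic below is reconstructed to match
it and is runnable on its own:

    #include <cassert>

    // Accesses [OffsetA, OffsetA + WidthA) and [OffsetB, OffsetB + WidthB)
    // are disjoint iff the lower access ends no later than the higher starts.
    static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                    int WidthB, int OffsetB) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      return LowOffset + LowWidth <= HighOffset;
    }

    int main() {
      assert(offsetsDoNotOverlap(4, 0, 4, 4));  // [0,4) and [4,8): disjoint
      assert(!offsetsDoNotOverlap(8, 0, 4, 4)); // [0,8) overlaps [4,8)
      return 0;
    }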
-bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
- MachineInstr *MIb) const {
- unsigned BaseReg0, Offset0;
- unsigned BaseReg1, Offset1;
+bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
+ MachineInstr &MIb) const {
+ unsigned BaseReg0, BaseReg1;
+ int64_t Offset0, Offset1;
if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
- assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
- "read2 / write2 not expected here yet");
- unsigned Width0 = (*MIa->memoperands_begin())->getSize();
- unsigned Width1 = (*MIb->memoperands_begin())->getSize();
+
+ if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
+ // FIXME: Handle ds_read2 / ds_write2.
+ return false;
+ }
+ unsigned Width0 = (*MIa.memoperands_begin())->getSize();
+ unsigned Width1 = (*MIb.memoperands_begin())->getSize();
if (BaseReg0 == BaseReg1 &&
offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
return true;
@@ -1197,19 +1359,19 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
return false;
}
-bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
- MachineInstr *MIb,
+bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
+ MachineInstr &MIb,
AliasAnalysis *AA) const {
- assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
+ assert((MIa.mayLoad() || MIa.mayStore()) &&
"MIa must load from or modify a memory location");
- assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
+ assert((MIb.mayLoad() || MIb.mayStore()) &&
"MIb must load from or modify a memory location");
- if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
+ if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
return false;
// XXX - Can we relax this between address spaces?
- if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+ if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
// TODO: Should we check the address space from the MachineMemOperand? That
@@ -1217,29 +1379,29 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
// underlying address space, even if it was lowered to a different one,
// e.g. private accesses lowered to use MUBUF instructions on a scratch
// buffer.
- if (isDS(*MIa)) {
- if (isDS(*MIb))
+ if (isDS(MIa)) {
+ if (isDS(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(*MIb);
+ return !isFLAT(MIb);
}
- if (isMUBUF(*MIa) || isMTBUF(*MIa)) {
- if (isMUBUF(*MIb) || isMTBUF(*MIb))
+ if (isMUBUF(MIa) || isMTBUF(MIa)) {
+ if (isMUBUF(MIb) || isMTBUF(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(*MIb) && !isSMRD(*MIb);
+ return !isFLAT(MIb) && !isSMRD(MIb);
}
- if (isSMRD(*MIa)) {
- if (isSMRD(*MIb))
+ if (isSMRD(MIa)) {
+ if (isSMRD(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa);
+ return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
}
- if (isFLAT(*MIa)) {
- if (isFLAT(*MIb))
+ if (isFLAT(MIa)) {
+ if (isFLAT(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
return false;
@@ -1249,35 +1411,49 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
}
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
- MachineBasicBlock::iterator &MI,
- LiveVariables *LV) const {
-
- switch (MI->getOpcode()) {
- default: return nullptr;
- case AMDGPU::V_MAC_F32_e64: break;
- case AMDGPU::V_MAC_F32_e32: {
- const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
- if (Src0->isImm() && !isInlineConstant(*Src0, 4))
- return nullptr;
- break;
- }
+ MachineInstr &MI,
+ LiveVariables *LV) const {
+
+ switch (MI.getOpcode()) {
+ default:
+ return nullptr;
+ case AMDGPU::V_MAC_F32_e64:
+ break;
+ case AMDGPU::V_MAC_F32_e32: {
+ const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
+ if (Src0->isImm() && !isInlineConstant(*Src0, 4))
+ return nullptr;
+ break;
+ }
}
- const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::dst);
- const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
- const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1);
- const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2);
+ const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
+ const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
+ const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
- return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32))
- .addOperand(*Dst)
- .addImm(0) // Src0 mods
- .addOperand(*Src0)
- .addImm(0) // Src1 mods
- .addOperand(*Src1)
- .addImm(0) // Src mods
- .addOperand(*Src2)
- .addImm(0) // clamp
- .addImm(0); // omod
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32))
+ .addOperand(*Dst)
+ .addImm(0) // Src0 mods
+ .addOperand(*Src0)
+ .addImm(0) // Src1 mods
+ .addOperand(*Src1)
+ .addImm(0) // Src mods
+ .addOperand(*Src2)
+ .addImm(0) // clamp
+ .addImm(0); // omod
+}
+
+bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const {
+ // XXX - Do we want the SP check in the base implementation?
+
+ // Target-independent instructions do not have an implicit-use of EXEC, even
+ // when they operate on VGPRs. Treating EXEC modifications as scheduling
+ // boundaries prevents incorrect movements of such instructions.
+ return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
+ MI.modifiesRegister(AMDGPU::EXEC, &RI);
}
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
@@ -1355,9 +1531,9 @@ static bool compareMachineOp(const MachineOperand &Op0,
}
}
-bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
- const MachineOperand &MO) const {
- const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];
+bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
+ const MachineOperand &MO) const {
+ const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
@@ -1418,14 +1594,10 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
return true;
// SGPRs use the constant bus
- if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
- (!MO.isImplicit() &&
- (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
- AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
- return true;
- }
-
- return false;
+ return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
+ (!MO.isImplicit() &&
+ (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
+ AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
}
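
The rewritten predicate classifies VCC, M0, and explicit SGPR operands as
constant-bus reads; the verifier below then enforces that a VOP instruction
makes at most one such read, counting repeated reads of the same register
once. A toy counter under those assumptions (register ids are illustrative):

    #include <cassert>
    #include <set>
    #include <vector>

    // Toy operand encoding: -1 means a VGPR, any other id is a scalar
    // register (SGPR, M0, VCC) that goes through the constant bus.
    static unsigned constantBusUses(const std::vector<int> &Srcs) {
      std::set<int> Regs;
      for (int R : Srcs)
        if (R != -1)
          Regs.insert(R); // repeated reads of one register count once
      return Regs.size();
    }

    int main() {
      assert(constantBusUses({0, -1}) == 1); // s0, v1: legal for a VOP op
      assert(constantBusUses({0, 0}) == 1);  // s0 twice: still one bus read
      assert(constantBusUses({0, 1}) == 2);  // s0 and s1: verifier rejects
      return 0;
    }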
static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
@@ -1448,10 +1620,33 @@ static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
return AMDGPU::NoRegister;
}
-bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
+static bool shouldReadExec(const MachineInstr &MI) {
+ if (SIInstrInfo::isVALU(MI)) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::V_READLANE_B32:
+ case AMDGPU::V_READLANE_B32_si:
+ case AMDGPU::V_READLANE_B32_vi:
+ case AMDGPU::V_WRITELANE_B32:
+ case AMDGPU::V_WRITELANE_B32_si:
+ case AMDGPU::V_WRITELANE_B32_vi:
+ return false;
+ }
+
+ return true;
+ }
+
+ if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
+ SIInstrInfo::isSALU(MI) ||
+ SIInstrInfo::isSMRD(MI))
+ return false;
+
+ return true;
+}
+
+bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const {
- uint16_t Opcode = MI->getOpcode();
- const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ uint16_t Opcode = MI.getOpcode();
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
@@ -1459,14 +1654,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Make sure the number of operands is correct.
const MCInstrDesc &Desc = get(Opcode);
if (!Desc.isVariadic() &&
- Desc.getNumOperands() != MI->getNumExplicitOperands()) {
- ErrInfo = "Instruction has wrong number of operands.";
- return false;
+ Desc.getNumOperands() != MI.getNumExplicitOperands()) {
+ ErrInfo = "Instruction has wrong number of operands.";
+ return false;
}
// Make sure the register classes are correct.
for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
- if (MI->getOperand(i).isFPImm()) {
+ if (MI.getOperand(i).isFPImm()) {
ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
"all fp values to integers.";
return false;
@@ -1476,7 +1671,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
switch (Desc.OpInfo[i].OperandType) {
case MCOI::OPERAND_REGISTER:
- if (MI->getOperand(i).isImm()) {
+ if (MI.getOperand(i).isImm()) {
ErrInfo = "Illegal immediate value for operand.";
return false;
}
@@ -1484,17 +1679,18 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
case AMDGPU::OPERAND_REG_IMM32:
break;
case AMDGPU::OPERAND_REG_INLINE_C:
- if (isLiteralConstant(MI->getOperand(i),
+ if (isLiteralConstant(MI.getOperand(i),
RI.getRegClass(RegClass)->getSize())) {
ErrInfo = "Illegal immediate value for operand.";
return false;
}
break;
case MCOI::OPERAND_IMMEDIATE:
+ case AMDGPU::OPERAND_KIMM32:
// Check if this operand is an immediate.
// FrameIndex operands will be replaced by immediates, so they are
// allowed.
- if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
+ if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
ErrInfo = "Expected immediate, but got non-immediate";
return false;
}
@@ -1503,12 +1699,13 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
continue;
}
- if (!MI->getOperand(i).isReg())
+ if (!MI.getOperand(i).isReg())
continue;
if (RegClass != -1) {
- unsigned Reg = MI->getOperand(i).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ unsigned Reg = MI.getOperand(i).getReg();
+ if (Reg == AMDGPU::NoRegister ||
+ TargetRegisterInfo::isVirtualRegister(Reg))
continue;
const TargetRegisterClass *RC = RI.getRegClass(RegClass);
@@ -1519,23 +1716,26 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
}
}
-
// Verify VOP*
- if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) {
+ if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) {
// Only look at the true operands. Only a real operand can use the constant
// bus, and we don't want to check pseudo-operands like the source modifier
// flags.
const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
unsigned ConstantBusCount = 0;
- unsigned SGPRUsed = findImplicitSGPRRead(*MI);
+
+ if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
+ ++ConstantBusCount;
+
+ unsigned SGPRUsed = findImplicitSGPRRead(MI);
if (SGPRUsed != AMDGPU::NoRegister)
++ConstantBusCount;
for (int OpIdx : OpIndices) {
if (OpIdx == -1)
break;
- const MachineOperand &MO = MI->getOperand(OpIdx);
+ const MachineOperand &MO = MI.getOperand(OpIdx);
if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
if (MO.isReg()) {
if (MO.getReg() != SGPRUsed)
@@ -1555,9 +1755,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Verify misc. restrictions on specific instructions.
if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
- const MachineOperand &Src0 = MI->getOperand(Src0Idx);
- const MachineOperand &Src1 = MI->getOperand(Src1Idx);
- const MachineOperand &Src2 = MI->getOperand(Src2Idx);
+ const MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ const MachineOperand &Src1 = MI.getOperand(Src1Idx);
+ const MachineOperand &Src2 = MI.getOperand(Src2Idx);
if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
if (!compareMachineOp(Src0, Src1) &&
!compareMachineOp(Src0, Src2)) {
@@ -1569,9 +1769,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Make sure we aren't losing exec uses in the td files. This mostly requires
// being careful when using 'let Uses' to add other use registers.
- if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
- const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC);
- if (!Exec || !Exec->isImplicit()) {
+ if (shouldReadExec(MI)) {
+ if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
ErrInfo = "VALU instruction does not implicitly read exec mask";
return false;
}
@@ -1624,22 +1823,18 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
- case AMDGPU::S_LOAD_DWORD_IMM:
- case AMDGPU::S_LOAD_DWORD_SGPR:
- case AMDGPU::S_LOAD_DWORD_IMM_ci:
- return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
- case AMDGPU::S_LOAD_DWORDX2_IMM:
- case AMDGPU::S_LOAD_DWORDX2_SGPR:
- case AMDGPU::S_LOAD_DWORDX2_IMM_ci:
- return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
- case AMDGPU::S_LOAD_DWORDX4_IMM:
- case AMDGPU::S_LOAD_DWORDX4_SGPR:
- case AMDGPU::S_LOAD_DWORDX4_IMM_ci:
- return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
+ case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
+ case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
+ case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
+ case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
+ case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
+ case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
+ case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
+ case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
}
}
@@ -1676,12 +1871,12 @@ bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
}
}
-void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
+void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
MachineBasicBlock::iterator I = MI;
- MachineBasicBlock *MBB = MI->getParent();
- MachineOperand &MO = MI->getOperand(OpIdx);
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineOperand &MO = MI.getOperand(OpIdx);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
+ unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
const TargetRegisterClass *RC = RI.getRegClass(RCID);
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (MO.isReg())
@@ -1689,7 +1884,6 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
else if (RI.isSGPRClass(RC))
Opcode = AMDGPU::S_MOV_B32;
-
const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
VRC = &AMDGPU::VReg_64RegClass;
@@ -1698,8 +1892,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
unsigned Reg = MRI.createVirtualRegister(VRC);
DebugLoc DL = MBB->findDebugLoc(I);
- BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg)
- .addOperand(MO);
+ BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO);
MO.ChangeToRegister(Reg, false);
}
@@ -1758,11 +1951,11 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
}
// Change the order of operands from (0, 1, 2) to (0, 2, 1)
-void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
- assert(Inst->getNumExplicitOperands() == 3);
- MachineOperand Op1 = Inst->getOperand(1);
- Inst->RemoveOperand(1);
- Inst->addOperand(Op1);
+void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
+ assert(Inst.getNumExplicitOperands() == 3);
+ MachineOperand Op1 = Inst.getOperand(1);
+ Inst.RemoveOperand(1);
+ Inst.addOperand(Op1);
}
bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
@@ -1804,26 +1997,32 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
return true;
}
-bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
+bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO) const {
- const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
- const MCInstrDesc &InstDesc = get(MI->getOpcode());
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
const TargetRegisterClass *DefinedRC =
OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
if (!MO)
- MO = &MI->getOperand(OpIdx);
+ MO = &MI.getOperand(OpIdx);
+
+ if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
+ RegSubRegPair SGPRUsed;
+ if (MO->isReg())
+ SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
- if (isVALU(*MI) &&
- usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
- unsigned SGPRUsed =
- MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (i == OpIdx)
continue;
- const MachineOperand &Op = MI->getOperand(i);
- if (Op.isReg() && Op.getReg() != SGPRUsed &&
- usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
+ const MachineOperand &Op = MI.getOperand(i);
+ if (Op.isReg()) {
+ if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
+ usesConstantBus(MRI, Op, getOpSize(MI, i))) {
+ return false;
+ }
+ } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
return false;
}
}
@@ -1834,7 +2033,6 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
return isLegalRegOperand(MRI, OpInfo, *MO);
}
-
// Handle non-register types that are treated like immediates.
assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
@@ -1847,12 +2045,12 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
}
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
- MachineInstr *MI) const {
- unsigned Opc = MI->getOpcode();
+ MachineInstr &MI) const {
+ unsigned Opc = MI.getOpcode();
const MCInstrDesc &InstrDesc = get(Opc);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
- MachineOperand &Src1 = MI->getOperand(Src1Idx);
+ MachineOperand &Src1 = MI.getOperand(Src1Idx);
// If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
// we need to only have one constant bus use.
@@ -1860,10 +2058,10 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
// Note we do not need to worry about literal constants here. They are
// disabled for the operand types of these instructions because they will always
// violate the one constant bus use rule.
- bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister;
+ bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
if (HasImplicitSGPR) {
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- MachineOperand &Src0 = MI->getOperand(Src0Idx);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
legalizeOpWithMove(MI, Src0Idx);
@@ -1878,13 +2076,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
// commute if it is possible. We only want to commute here if it improves
// legality. This can be called a fairly large number of times so don't waste
// compile time pointlessly swapping and checking legality again.
- if (HasImplicitSGPR || !MI->isCommutable()) {
+ if (HasImplicitSGPR || !MI.isCommutable()) {
legalizeOpWithMove(MI, Src1Idx);
return;
}
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- MachineOperand &Src0 = MI->getOperand(Src0Idx);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
// If src0 can be used as src1, commuting will make the operands legal.
// Otherwise we have to give up and insert a move.
@@ -1897,13 +2095,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
return;
}
- int CommutedOpc = commuteOpcode(*MI);
+ int CommutedOpc = commuteOpcode(MI);
if (CommutedOpc == -1) {
legalizeOpWithMove(MI, Src1Idx);
return;
}
- MI->setDesc(get(CommutedOpc));
+ MI.setDesc(get(CommutedOpc));
unsigned Src0Reg = Src0.getReg();
unsigned Src0SubReg = Src0.getSubReg();
@@ -1925,10 +2123,9 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
// operand, and since literal constants are not allowed and should never be
// seen, we only need to worry about inserting copies if we use multiple SGPR
// operands.
-void SIInstrInfo::legalizeOperandsVOP3(
- MachineRegisterInfo &MRI,
- MachineInstr *MI) const {
- unsigned Opc = MI->getOpcode();
+void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
+ MachineInstr &MI) const {
+ unsigned Opc = MI.getOpcode();
int VOP3Idx[3] = {
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
@@ -1943,7 +2140,7 @@ void SIInstrInfo::legalizeOperandsVOP3(
int Idx = VOP3Idx[i];
if (Idx == -1)
break;
- MachineOperand &MO = MI->getOperand(Idx);
+ MachineOperand &MO = MI.getOperand(Idx);
// We should never see a VOP3 instruction with an illegal immediate operand.
if (!MO.isReg())
@@ -1964,32 +2161,78 @@ void SIInstrInfo::legalizeOperandsVOP3(
}
}
-void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
- MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+ MachineRegisterInfo &MRI) const {
+ const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
+ const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
+ unsigned DstReg = MRI.createVirtualRegister(SRC);
+ unsigned SubRegs = VRC->getSize() / 4;
+
+ SmallVector<unsigned, 8> SRegs;
+ for (unsigned i = 0; i < SubRegs; ++i) {
+ unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+ get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
+ .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
+ SRegs.push_back(SGPR);
+ }
+
+ MachineInstrBuilder MIB =
+ BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+ get(AMDGPU::REG_SEQUENCE), DstReg);
+ for (unsigned i = 0; i < SubRegs; ++i) {
+ MIB.addReg(SRegs[i]);
+ MIB.addImm(RI.getSubRegFromChannel(i));
+ }
+ return DstReg;
+}
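+
readlaneVGPRToSGPR widens to however many 32-bit channels the source class
holds: one V_READFIRSTLANE_B32 per dword, then a REG_SEQUENCE to stitch the
scalar result back together. Modeled on plain data (a wavefront as an array
of per-lane dwords), assuming the value really is uniform across lanes:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // v_readfirstlane_b32: with a uniform value, lane 0 holds the answer.
    static uint32_t readFirstLane(const std::vector<uint32_t> &Lanes) {
      return Lanes[0];
    }

    int main() {
      // A uniform 64-bit pointer split into two dword channels, 4 lanes each.
      std::vector<uint32_t> Lo(4, 0xDEADBEEFu), Hi(4, 0x00001234u);
      uint64_t SPtr = (uint64_t(readFirstLane(Hi)) << 32) | readFirstLane(Lo);
      assert(SPtr == 0x00001234DEADBEEFull); // reassembled like REG_SEQUENCE
      return 0;
    }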
+
+void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
+ MachineInstr &MI) const {
+
+  // If the pointer is stored in VGPRs, then we need to move it to
+  // SGPRs using v_readfirstlane. This is safe because we only select
+  // loads with uniform pointers to SMRD instructions, so we know the
+  // pointer value is uniform.
+ MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
+ if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
+ unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+ SBase->setReg(SGPR);
+ }
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
// Legalize VOP2
- if (isVOP2(*MI)) {
+ if (isVOP2(MI) || isVOPC(MI)) {
legalizeOperandsVOP2(MRI, MI);
return;
}
// Legalize VOP3
- if (isVOP3(*MI)) {
+ if (isVOP3(MI)) {
legalizeOperandsVOP3(MRI, MI);
return;
}
+ // Legalize SMRD
+ if (isSMRD(MI)) {
+ legalizeOperandsSMRD(MRI, MI);
+ return;
+ }
+
// Legalize REG_SEQUENCE and PHI
// The register class of the operands must be the same type as the register
// class of the output.
- if (MI->getOpcode() == AMDGPU::PHI) {
+ if (MI.getOpcode() == AMDGPU::PHI) {
const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
- for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
- if (!MI->getOperand(i).isReg() ||
- !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+ if (!MI.getOperand(i).isReg() ||
+ !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
continue;
const TargetRegisterClass *OpRC =
- MRI.getRegClass(MI->getOperand(i).getReg());
+ MRI.getRegClass(MI.getOperand(i).getReg());
if (RI.hasVGPRs(OpRC)) {
VRC = OpRC;
} else {
@@ -2000,7 +2243,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
// If any of the operands are VGPR registers, then they all must be
// VGPRs; otherwise we will create illegal VGPR->SGPR copies when
// legalizing them.
- if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
+ if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
if (!VRC) {
assert(SRC);
VRC = RI.getEquivalentVGPRClass(SRC);
@@ -2011,18 +2254,18 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
}
// Update all the operands so they have the same type.
- for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
- MachineOperand &Op = MI->getOperand(I);
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+ MachineOperand &Op = MI.getOperand(I);
if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
continue;
unsigned DstReg = MRI.createVirtualRegister(RC);
// MI is a PHI instruction.
- MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
+ MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
- BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
- .addOperand(Op);
+ BuildMI(*InsertBB, Insert, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg)
+ .addOperand(Op);
Op.setReg(DstReg);
}
}
@@ -2030,15 +2273,15 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
// REG_SEQUENCE doesn't really require operand legalization, but if one has a
// VGPR dest type and SGPR sources, insert copies so all operands are
// VGPRs. This seems to help operand folding / the register coalescer.
- if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
- MachineBasicBlock *MBB = MI->getParent();
- const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
+ if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
+ MachineBasicBlock *MBB = MI.getParent();
+ const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
if (RI.hasVGPRs(DstRC)) {
// Update all the operands so they are VGPR register classes. These may
// not be the same register class because REG_SEQUENCE supports mixing
// subregister index types e.g. sub0_sub1 + sub2 + sub3
- for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
- MachineOperand &Op = MI->getOperand(I);
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+ MachineOperand &Op = MI.getOperand(I);
if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
continue;
@@ -2049,8 +2292,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
unsigned DstReg = MRI.createVirtualRegister(VRC);
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
- .addOperand(Op);
+ BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg)
+ .addOperand(Op);
Op.setReg(DstReg);
Op.setIsKill();
@@ -2062,17 +2305,33 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
// Legalize INSERT_SUBREG
// src0 must have the same register class as dst
- if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
- unsigned Dst = MI->getOperand(0).getReg();
- unsigned Src0 = MI->getOperand(1).getReg();
+ if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned Src0 = MI.getOperand(1).getReg();
const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
if (DstRC != Src0RC) {
- MachineBasicBlock &MBB = *MI->getParent();
+ MachineBasicBlock &MBB = *MI.getParent();
unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
- .addReg(Src0);
- MI->getOperand(1).setReg(NewSrc0);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
+ .addReg(Src0);
+ MI.getOperand(1).setReg(NewSrc0);
+ }
+ return;
+ }
+
+ // Legalize MIMG
+ if (isMIMG(MI)) {
+ MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
+ if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
+ unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
+ SRsrc->setReg(SGPR);
+ }
+
+ MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
+ if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
+ unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
+ SSamp->setReg(SGPR);
}
return;
}
@@ -2081,11 +2340,11 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
// FIXME: If we start using the non-addr64 instructions for compute, we
// may need to legalize them here.
int SRsrcIdx =
- AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
if (SRsrcIdx != -1) {
// We have an MUBUF instruction
- MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
- unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
+ MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
+ unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
RI.getRegClass(SRsrcRC))) {
// The operands are legal.
@@ -2093,7 +2352,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
return;
}
- MachineBasicBlock &MBB = *MI->getParent();
+ MachineBasicBlock &MBB = *MI.getParent();
// Extract the ptr from the resource descriptor.
unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
@@ -2107,30 +2366,27 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
// Zero64 = 0
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
- Zero64)
- .addImm(0);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
+ .addImm(0);
// SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
- SRsrcFormatLo)
- .addImm(RsrcDataFormat & 0xFFFFFFFF);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
+ .addImm(RsrcDataFormat & 0xFFFFFFFF);
// SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
- SRsrcFormatHi)
- .addImm(RsrcDataFormat >> 32);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
+ .addImm(RsrcDataFormat >> 32);
// NewSRsrc = {Zero64, SRsrcFormat}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
- .addReg(Zero64)
- .addImm(AMDGPU::sub0_sub1)
- .addReg(SRsrcFormatLo)
- .addImm(AMDGPU::sub2)
- .addReg(SRsrcFormatHi)
- .addImm(AMDGPU::sub3);
-
- MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
+ .addReg(Zero64)
+ .addImm(AMDGPU::sub0_sub1)
+ .addReg(SRsrcFormatLo)
+ .addImm(AMDGPU::sub2)
+ .addReg(SRsrcFormatHi)
+ .addImm(AMDGPU::sub3);
+
+ MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
if (VAddr) {
// This is already an ADDR64 instruction so we need to add the pointer
@@ -2139,7 +2395,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
// NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
.addReg(SRsrcPtr, 0, AMDGPU::sub0)
.addReg(VAddr->getReg(), 0, AMDGPU::sub0);
@@ -2150,82 +2406,82 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
.addReg(VAddr->getReg(), 0, AMDGPU::sub1);
// NewVaddr = {NewVaddrHi, NewVaddrLo}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
- .addReg(NewVAddrLo)
- .addImm(AMDGPU::sub0)
- .addReg(NewVAddrHi)
- .addImm(AMDGPU::sub1);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+ .addReg(NewVAddrLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(NewVAddrHi)
+ .addImm(AMDGPU::sub1);
} else {
// This instruction is the _OFFSET variant, so we need to convert it to
// ADDR64.
- assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
- < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
+ < SISubtarget::VOLCANIC_ISLANDS &&
"FIXME: Need to emit flat atomics here");
- MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
- MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
- MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
- unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
+ MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
+ MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
+ MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
+ unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
// Atomics with return have an additional tied operand and are
// missing some of the special bits.
- MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
+ MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
MachineInstr *Addr64;
if (!VDataIn) {
// Regular buffer load / store.
- MachineInstrBuilder MIB
- = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
- .addOperand(*VData)
- .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
- // This will be replaced later
- // with the new value of vaddr.
- .addOperand(*SRsrc)
- .addOperand(*SOffset)
- .addOperand(*Offset);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
+ .addOperand(*VData)
+ .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+ // This will be replaced later
+ // with the new value of vaddr.
+ .addOperand(*SRsrc)
+ .addOperand(*SOffset)
+ .addOperand(*Offset);
// Atomics do not have this operand.
- if (const MachineOperand *GLC
- = getNamedOperand(*MI, AMDGPU::OpName::glc)) {
+ if (const MachineOperand *GLC =
+ getNamedOperand(MI, AMDGPU::OpName::glc)) {
MIB.addImm(GLC->getImm());
}
- MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc));
+ MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
- if (const MachineOperand *TFE
- = getNamedOperand(*MI, AMDGPU::OpName::tfe)) {
+ if (const MachineOperand *TFE =
+ getNamedOperand(MI, AMDGPU::OpName::tfe)) {
MIB.addImm(TFE->getImm());
}
- MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
Addr64 = MIB;
} else {
// Atomics with return.
- Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
- .addOperand(*VData)
- .addOperand(*VDataIn)
- .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
- // This will be replaced later
- // with the new value of vaddr.
- .addOperand(*SRsrc)
- .addOperand(*SOffset)
- .addOperand(*Offset)
- .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc))
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
+ .addOperand(*VData)
+ .addOperand(*VDataIn)
+ .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+ // This will be replaced later
+ // with the new value of vaddr.
+ .addOperand(*SRsrc)
+ .addOperand(*SOffset)
+ .addOperand(*Offset)
+ .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
+ .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
}
- MI->removeFromParent();
- MI = Addr64;
+ MI.removeFromParent();
// NewVaddr = {SRsrcPtr:sub0, SRsrcPtr:sub1}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
- .addReg(SRsrcPtr, 0, AMDGPU::sub0)
- .addImm(AMDGPU::sub0)
- .addReg(SRsrcPtr, 0, AMDGPU::sub1)
- .addImm(AMDGPU::sub1);
-
- VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
- SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
+ BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+ NewVAddr)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+ .addImm(AMDGPU::sub1);
+
+ VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
+ SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
}
// Update the instruction to use NewVaddr
@@ -2235,300 +2491,85 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
}
}
-void SIInstrInfo::splitSMRD(MachineInstr *MI,
- const TargetRegisterClass *HalfRC,
- unsigned HalfImmOp, unsigned HalfSGPROp,
- MachineInstr *&Lo, MachineInstr *&Hi) const {
-
- DebugLoc DL = MI->getDebugLoc();
- MachineBasicBlock *MBB = MI->getParent();
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned RegLo = MRI.createVirtualRegister(HalfRC);
- unsigned RegHi = MRI.createVirtualRegister(HalfRC);
- unsigned HalfSize = HalfRC->getSize();
- const MachineOperand *OffOp =
- getNamedOperand(*MI, AMDGPU::OpName::offset);
- const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
-
- // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes
- // on VI.
-
- bool IsKill = SBase->isKill();
- if (OffOp) {
- bool isVI =
- MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
- AMDGPUSubtarget::VOLCANIC_ISLANDS;
- unsigned OffScale = isVI ? 1 : 4;
- // Handle the _IMM variant
- unsigned LoOffset = OffOp->getImm() * OffScale;
- unsigned HiOffset = LoOffset + HalfSize;
- Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo)
- // Use addReg instead of addOperand
- // to make sure kill flag is cleared.
- .addReg(SBase->getReg(), 0, SBase->getSubReg())
- .addImm(LoOffset / OffScale);
-
- if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) {
- unsigned OffsetSGPR =
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR)
- .addImm(HiOffset); // The offset in register is in bytes.
- Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
- .addReg(SBase->getReg(), getKillRegState(IsKill),
- SBase->getSubReg())
- .addReg(OffsetSGPR);
- } else {
- Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
- .addReg(SBase->getReg(), getKillRegState(IsKill),
- SBase->getSubReg())
- .addImm(HiOffset / OffScale);
- }
- } else {
- // Handle the _SGPR variant
- MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff);
- Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo)
- .addReg(SBase->getReg(), 0, SBase->getSubReg())
- .addOperand(*SOff);
- unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
- .addReg(SOff->getReg(), 0, SOff->getSubReg())
- .addImm(HalfSize);
- Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
- .addReg(SBase->getReg(), getKillRegState(IsKill),
- SBase->getSubReg())
- .addReg(OffsetSGPR);
- }
-
- unsigned SubLo, SubHi;
- const TargetRegisterClass *NewDstRC;
- switch (HalfSize) {
- case 4:
- SubLo = AMDGPU::sub0;
- SubHi = AMDGPU::sub1;
- NewDstRC = &AMDGPU::VReg_64RegClass;
- break;
- case 8:
- SubLo = AMDGPU::sub0_sub1;
- SubHi = AMDGPU::sub2_sub3;
- NewDstRC = &AMDGPU::VReg_128RegClass;
- break;
- case 16:
- SubLo = AMDGPU::sub0_sub1_sub2_sub3;
- SubHi = AMDGPU::sub4_sub5_sub6_sub7;
- NewDstRC = &AMDGPU::VReg_256RegClass;
- break;
- case 32:
- SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
- SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
- NewDstRC = &AMDGPU::VReg_512RegClass;
- break;
- default:
- llvm_unreachable("Unhandled HalfSize");
- }
-
- unsigned OldDst = MI->getOperand(0).getReg();
- unsigned NewDst = MRI.createVirtualRegister(NewDstRC);
-
- MRI.replaceRegWith(OldDst, NewDst);
-
- BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst)
- .addReg(RegLo)
- .addImm(SubLo)
- .addReg(RegHi)
- .addImm(SubHi);
-}
-
-void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI,
- MachineRegisterInfo &MRI,
- SmallVectorImpl<MachineInstr *> &Worklist) const {
- MachineBasicBlock *MBB = MI->getParent();
- int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
- assert(DstIdx != -1);
- unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass;
- switch(RI.getRegClass(DstRCID)->getSize()) {
- case 4:
- case 8:
- case 16: {
- unsigned NewOpcode = getVALUOp(*MI);
- unsigned RegOffset;
- unsigned ImmOffset;
-
- if (MI->getOperand(2).isReg()) {
- RegOffset = MI->getOperand(2).getReg();
- ImmOffset = 0;
- } else {
- assert(MI->getOperand(2).isImm());
- // SMRD instructions take a dword offsets on SI and byte offset on VI
- // and MUBUF instructions always take a byte offset.
- ImmOffset = MI->getOperand(2).getImm();
- if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <=
- AMDGPUSubtarget::SEA_ISLANDS)
- ImmOffset <<= 2;
- RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-
- if (isUInt<12>(ImmOffset)) {
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
- RegOffset)
- .addImm(0);
- } else {
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
- RegOffset)
- .addImm(ImmOffset);
- ImmOffset = 0;
- }
- }
-
- unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
- unsigned DWord0 = RegOffset;
- unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
-
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
- .addImm(0);
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
- .addImm(RsrcDataFormat & 0xFFFFFFFF);
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
- .addImm(RsrcDataFormat >> 32);
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
- .addReg(DWord0)
- .addImm(AMDGPU::sub0)
- .addReg(DWord1)
- .addImm(AMDGPU::sub1)
- .addReg(DWord2)
- .addImm(AMDGPU::sub2)
- .addReg(DWord3)
- .addImm(AMDGPU::sub3);
-
- const MCInstrDesc &NewInstDesc = get(NewOpcode);
- const TargetRegisterClass *NewDstRC
- = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
- unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
- unsigned DstReg = MI->getOperand(0).getReg();
- MRI.replaceRegWith(DstReg, NewDstReg);
-
- MachineInstr *NewInst =
- BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg)
- .addOperand(MI->getOperand(1)) // sbase
- .addReg(SRsrc)
- .addImm(0)
- .addImm(ImmOffset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
- MI->eraseFromParent();
-
- legalizeOperands(NewInst);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- break;
- }
- case 32: {
- MachineInstr *Lo, *Hi;
- splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM,
- AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi);
- MI->eraseFromParent();
- moveSMRDToVALU(Lo, MRI, Worklist);
- moveSMRDToVALU(Hi, MRI, Worklist);
- break;
- }
-
- case 64: {
- MachineInstr *Lo, *Hi;
- splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM,
- AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi);
- MI->eraseFromParent();
- moveSMRDToVALU(Lo, MRI, Worklist);
- moveSMRDToVALU(Hi, MRI, Worklist);
- break;
- }
- }
-}
-
void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
SmallVector<MachineInstr *, 128> Worklist;
Worklist.push_back(&TopInst);
while (!Worklist.empty()) {
- MachineInstr *Inst = Worklist.pop_back_val();
- MachineBasicBlock *MBB = Inst->getParent();
+ MachineInstr &Inst = *Worklist.pop_back_val();
+ MachineBasicBlock *MBB = Inst.getParent();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned Opcode = Inst->getOpcode();
- unsigned NewOpcode = getVALUOp(*Inst);
+ unsigned Opcode = Inst.getOpcode();
+ unsigned NewOpcode = getVALUOp(Inst);
// Handle some special cases
switch (Opcode) {
default:
- if (isSMRD(*Inst)) {
- moveSMRDToVALU(Inst, MRI, Worklist);
- continue;
- }
break;
case AMDGPU::S_AND_B64:
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
continue;
case AMDGPU::S_OR_B64:
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
continue;
case AMDGPU::S_XOR_B64:
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
continue;
case AMDGPU::S_NOT_B64:
splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
continue;
case AMDGPU::S_BCNT1_I32_B64:
splitScalar64BitBCNT(Worklist, Inst);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
continue;
case AMDGPU::S_BFE_I64: {
splitScalar64BitBFE(Worklist, Inst);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
continue;
}
case AMDGPU::S_LSHL_B32:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I32:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B32:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHL_B64:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHLREV_B64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I64:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_ASHRREV_I64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B64:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHRREV_B64;
swapOperands(Inst);
}
@@ -2536,9 +2577,18 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
case AMDGPU::S_ABS_I32:
lowerScalarAbs(Worklist, Inst);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
continue;
+ case AMDGPU::S_CBRANCH_SCC0:
+ case AMDGPU::S_CBRANCH_SCC1:
+ // Clear unused bits of vcc
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
+ AMDGPU::VCC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::VCC);
+ break;
+
case AMDGPU::S_BFE_U64:
case AMDGPU::S_BFM_B64:
llvm_unreachable("Moving this op to VALU not implemented");
@@ -2553,34 +2603,36 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
// Use the new VALU Opcode.
const MCInstrDesc &NewDesc = get(NewOpcode);
- Inst->setDesc(NewDesc);
+ Inst.setDesc(NewDesc);
// Remove any references to SCC. Vector instructions can't read from it, and
// we're just about to add the implicit use / defs of VCC, and we don't want
// both.
- for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
- MachineOperand &Op = Inst->getOperand(i);
- if (Op.isReg() && Op.getReg() == AMDGPU::SCC)
- Inst->RemoveOperand(i);
+ for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
+ MachineOperand &Op = Inst.getOperand(i);
+ if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
+ Inst.RemoveOperand(i);
+ addSCCDefUsersToVALUWorklist(Inst, Worklist);
+ }
}
if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
// We are converting these to a BFE, so we need to add the missing
// operands for the size and offset.
unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
- Inst->addOperand(MachineOperand::CreateImm(0));
- Inst->addOperand(MachineOperand::CreateImm(Size));
+ Inst.addOperand(MachineOperand::CreateImm(0));
+ Inst.addOperand(MachineOperand::CreateImm(Size));
} else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
// The VALU version adds the second operand to the result, so insert an
// extra 0 operand.
- Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst.addOperand(MachineOperand::CreateImm(0));
}
- Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent());
+ Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
- const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
+ const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
// If we need to move this to VGPRs, we need to unpack the second operand
// back into the 2 separate ones for bit offset and width.
assert(OffsetWidthOp.isImm() &&
@@ -2589,50 +2641,41 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
- Inst->RemoveOperand(2); // Remove old immediate.
- Inst->addOperand(MachineOperand::CreateImm(Offset));
- Inst->addOperand(MachineOperand::CreateImm(BitWidth));
+ Inst.RemoveOperand(2); // Remove old immediate.
+ Inst.addOperand(MachineOperand::CreateImm(Offset));
+ Inst.addOperand(MachineOperand::CreateImm(BitWidth));
}
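A quick worked example of that unpacking, as a standalone sketch with a hypothetical packed immediate: 0x100008 encodes offset 8 in bits [5:0] and width 16 in bits [22:16].

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t Imm = 0x100008;                    // hypothetical operand
  const uint32_t Offset = Imm & 0x3f;               // bits [5:0]  -> 8
  const uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // bits [22:16] -> 16
  std::printf("offset=%u width=%u\n", Offset, BitWidth);
  return 0;
}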
- // Update the destination register class.
- const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst);
- if (!NewDstRC)
- continue;
+ bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
+ unsigned NewDstReg = AMDGPU::NoRegister;
+ if (HasDst) {
+ // Update the destination register class.
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+ if (!NewDstRC)
+ continue;
- unsigned DstReg = Inst->getOperand(0).getReg();
- unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
- MRI.replaceRegWith(DstReg, NewDstReg);
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ }
// Legalize the operands
legalizeOperands(Inst);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+ if (HasDst)
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
}
}
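The splitScalar64Bit* helpers dispatched above all share one shape; a minimal standalone model of the S_AND_B64 case, with plain integers standing in for the sub0/sub1 subregisters:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Src0 = UINT64_C(0x00000000FFFFFFFF);
  const uint64_t Src1 = UINT64_C(0x123456789ABCDEF0);
  const uint32_t Lo = (uint32_t)Src0 & (uint32_t)Src1;                 // V_AND_B32 on sub0
  const uint32_t Hi = (uint32_t)(Src0 >> 32) & (uint32_t)(Src1 >> 32); // V_AND_B32 on sub1
  const uint64_t Full = ((uint64_t)Hi << 32) | Lo;                     // REG_SEQUENCE
  std::printf("0x%016llx\n", (unsigned long long)Full);
  return 0;
}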
-//===----------------------------------------------------------------------===//
-// Indirect addressing callbacks
-//===----------------------------------------------------------------------===//
-
-unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
- unsigned Channel) const {
- assert(Channel == 0);
- return RegIndex;
-}
-
-const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
- return &AMDGPU::VGPR_32RegClass;
-}
-
void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst) const {
- MachineBasicBlock &MBB = *Inst->getParent();
+ MachineInstr &Inst) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineBasicBlock::iterator MII = Inst;
- DebugLoc DL = Inst->getDebugLoc();
+ DebugLoc DL = Inst.getDebugLoc();
- MachineOperand &Dest = Inst->getOperand(0);
- MachineOperand &Src = Inst->getOperand(1);
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src = Inst.getOperand(1);
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -2649,15 +2692,14 @@ void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
}
void SIInstrInfo::splitScalar64BitUnaryOp(
- SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst,
- unsigned Opcode) const {
- MachineBasicBlock &MBB = *Inst->getParent();
+ SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
+ unsigned Opcode) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- MachineOperand &Dest = Inst->getOperand(0);
- MachineOperand &Src0 = Inst->getOperand(1);
- DebugLoc DL = Inst->getDebugLoc();
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ DebugLoc DL = Inst.getDebugLoc();
MachineBasicBlock::iterator MII = Inst;
@@ -2703,16 +2745,15 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
}
void SIInstrInfo::splitScalar64BitBinaryOp(
- SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst,
- unsigned Opcode) const {
- MachineBasicBlock &MBB = *Inst->getParent();
+ SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
+ unsigned Opcode) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- MachineOperand &Dest = Inst->getOperand(0);
- MachineOperand &Src0 = Inst->getOperand(1);
- MachineOperand &Src1 = Inst->getOperand(2);
- DebugLoc DL = Inst->getDebugLoc();
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ DebugLoc DL = Inst.getDebugLoc();
MachineBasicBlock::iterator MII = Inst;
@@ -2738,9 +2779,9 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
- MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
- .addOperand(SrcReg0Sub0)
- .addOperand(SrcReg1Sub0);
+ MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
+ .addOperand(SrcReg0Sub0)
+ .addOperand(SrcReg1Sub0);
MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
AMDGPU::sub1, Src0SubRC);
@@ -2748,9 +2789,9 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
AMDGPU::sub1, Src1SubRC);
unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
- MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
- .addOperand(SrcReg0Sub1)
- .addOperand(SrcReg1Sub1);
+ MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
+ .addOperand(SrcReg0Sub1)
+ .addOperand(SrcReg1Sub1);
unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -2770,16 +2811,16 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst) const {
- MachineBasicBlock &MBB = *Inst->getParent();
+void SIInstrInfo::splitScalar64BitBCNT(
+ SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineBasicBlock::iterator MII = Inst;
- DebugLoc DL = Inst->getDebugLoc();
+ DebugLoc DL = Inst.getDebugLoc();
- MachineOperand &Dest = Inst->getOperand(0);
- MachineOperand &Src = Inst->getOperand(1);
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src = Inst.getOperand(1);
const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
const TargetRegisterClass *SrcRC = Src.isReg() ?
@@ -2812,24 +2853,22 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist
}
void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst) const {
- MachineBasicBlock &MBB = *Inst->getParent();
+ MachineInstr &Inst) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineBasicBlock::iterator MII = Inst;
- DebugLoc DL = Inst->getDebugLoc();
+ DebugLoc DL = Inst.getDebugLoc();
- MachineOperand &Dest = Inst->getOperand(0);
- uint32_t Imm = Inst->getOperand(2).getImm();
+ MachineOperand &Dest = Inst.getOperand(0);
+ uint32_t Imm = Inst.getOperand(2).getImm();
uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
(void) Offset;
// Only sext_inreg cases handled.
- assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
- BitWidth <= 32 &&
- Offset == 0 &&
- "Not implemented");
+ assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
+ Offset == 0 && "Not implemented");
if (BitWidth < 32) {
unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -2837,9 +2876,9 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
- .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
- .addImm(0)
- .addImm(BitWidth);
+ .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
+ .addImm(0)
+ .addImm(BitWidth);
BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
.addImm(31)
@@ -2856,7 +2895,7 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
return;
}
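A standalone model of this BitWidth < 32 path, with shifts standing in for V_BFE_I32 and V_ASHRREV_I32: the low dword becomes the sign-extended field and the high dword is the sign bit replicated 32 times.

#include <cstdint>
#include <cstdio>

int main() {
  const unsigned BitWidth = 8;  // assumed sext_inreg width
  const uint32_t SrcLo = 0x80;  // -128 as an 8-bit field
  const int32_t MidLo =
      (int32_t)(SrcLo << (32 - BitWidth)) >> (32 - BitWidth); // V_BFE_I32
  const int32_t MidHi = MidLo >> 31;                          // V_ASHRREV_I32 by 31
  std::printf("result = 0x%08x%08x\n", (uint32_t)MidHi, (uint32_t)MidLo);
  return 0;
}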
- MachineOperand &Src = Inst->getOperand(1);
+ MachineOperand &Src = Inst.getOperand(1);
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
@@ -2887,6 +2926,22 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
}
}
+void SIInstrInfo::addSCCDefUsersToVALUWorklist(
+ MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
+ // This assumes that all the users of SCC are in the same block
+ // as the SCC def.
+ for (MachineInstr &MI :
+ llvm::make_range(MachineBasicBlock::iterator(SCCDefInst),
+ SCCDefInst.getParent()->end())) {
+ // Exit if we find another SCC def.
+ if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
+ return;
+
+ if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
+ Worklist.push_back(&MI);
+ }
+}
+
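A toy model of that scan, deliberately independent of the LLVM API: walk forward from the instruction whose SCC def was just removed, queue every SCC reader, and stop at the next SCC definition.

#include <cstddef>
#include <vector>

struct Inst { bool DefsSCC; bool UsesSCC; };

static std::vector<const Inst *>
collectSCCUsers(const std::vector<Inst> &Block, std::size_t DefIdx) {
  std::vector<const Inst *> Worklist;
  for (std::size_t I = DefIdx + 1; I < Block.size(); ++I) {
    if (Block[I].DefsSCC)
      break; // a new def ends the live range we care about
    if (Block[I].UsesSCC)
      Worklist.push_back(&Block[I]);
  }
  return Worklist;
}

int main() {
  // The reader at index 1 is queued; the def at index 2 stops the scan.
  std::vector<Inst> B = {{true, false}, {false, true}, {true, false},
                         {false, true}};
  return (int)collectSCCUsers(B, 0).size(); // returns 1
}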
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
const MachineInstr &Inst) const {
const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
@@ -2912,9 +2967,9 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
}
// Find the one SGPR operand we are allowed to use.
-unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
+unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
int OpIndices[3]) const {
- const MCInstrDesc &Desc = MI->getDesc();
+ const MCInstrDesc &Desc = MI.getDesc();
// Find the one SGPR operand we are allowed to use.
//
@@ -2925,19 +2980,19 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
//
// If the operand's class is an SGPR, we can never move it.
- unsigned SGPRReg = findImplicitSGPRRead(*MI);
+ unsigned SGPRReg = findImplicitSGPRRead(MI);
if (SGPRReg != AMDGPU::NoRegister)
return SGPRReg;
unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
- const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
for (unsigned i = 0; i < 3; ++i) {
int Idx = OpIndices[i];
if (Idx == -1)
break;
- const MachineOperand &MO = MI->getOperand(Idx);
+ const MachineOperand &MO = MI.getOperand(Idx);
if (!MO.isReg())
continue;
@@ -2981,70 +3036,6 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
return SGPRReg;
}
-MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
- MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg,
- unsigned Address, unsigned OffsetReg) const {
- const DebugLoc &DL = MBB->findDebugLoc(I);
- unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
- getIndirectIndexBegin(*MBB->getParent()));
-
- return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1))
- .addReg(IndirectBaseReg, RegState::Define)
- .addOperand(I->getOperand(0))
- .addReg(IndirectBaseReg)
- .addReg(OffsetReg)
- .addImm(0)
- .addReg(ValueReg);
-}
-
-MachineInstrBuilder SIInstrInfo::buildIndirectRead(
- MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg,
- unsigned Address, unsigned OffsetReg) const {
- const DebugLoc &DL = MBB->findDebugLoc(I);
- unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
- getIndirectIndexBegin(*MBB->getParent()));
-
- return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC_V1))
- .addOperand(I->getOperand(0))
- .addOperand(I->getOperand(1))
- .addReg(IndirectBaseReg)
- .addReg(OffsetReg)
- .addImm(0);
-
-}
-
-void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
- const MachineFunction &MF) const {
- int End = getIndirectIndexEnd(MF);
- int Begin = getIndirectIndexBegin(MF);
-
- if (End == -1)
- return;
-
-
- for (int Index = Begin; Index <= End; ++Index)
- Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index));
-
- for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
- Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));
-
- for (int Index = std::max(0, Begin - 2); Index <= End; ++Index)
- Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index));
-
- for (int Index = std::max(0, Begin - 3); Index <= End; ++Index)
- Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index));
-
- for (int Index = std::max(0, Begin - 7); Index <= End; ++Index)
- Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index));
-
- for (int Index = std::max(0, Begin - 15); Index <= End; ++Index)
- Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
-}
-
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
unsigned OperandName) const {
int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
@@ -3059,9 +3050,9 @@ uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
if (ST.isAmdHsaOS()) {
RsrcDataFormat |= (1ULL << 56);
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- // Set MTYPE = 2
- RsrcDataFormat |= (2ULL << 59);
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ // Set MTYPE = 2
+ RsrcDataFormat |= (2ULL << 59);
}
return RsrcDataFormat;
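A worked example under two assumptions (amdhsa OS, VI-or-newer subtarget): starting from AMDGPU::RSRC_DATA_FORMAT, the function sets bit 56 and writes MTYPE = 2 into the field at bit 59.

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t RsrcDataFormat = UINT64_C(0xf00000000000); // AMDGPU::RSRC_DATA_FORMAT
  RsrcDataFormat |= (1ULL << 56);                     // amdhsa-only bit
  RsrcDataFormat |= (2ULL << 59);                     // MTYPE = 2 on VI and up
  std::printf("0x%016llx\n", (unsigned long long)RsrcDataFormat);
  return 0;
}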
@@ -3072,22 +3063,103 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
AMDGPU::RSRC_TID_ENABLE |
0xffffffff; // Size;
+ uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
+
+ Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) |
+ // IndexStride = 64
+ (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT);
+
// If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
// Clear them unless we want a huge stride.
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
return Rsrc23;
}
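Putting the fields together, a worked example assuming MaxPrivateElementSize is 4 bytes on a VI subtarget: Log2_32(4) - 1 == 1 is the hardware encoding for 4-byte elements, index stride 3 encodes a stride of 64, and the data-format bits are cleared because on VI they double as stride bits. Note the new UINT64_C(1) << (32 + 23) definition of RSRC_TID_ENABLE equals the old 1LL << 55.

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t RSRC_DATA_FORMAT = UINT64_C(0xf00000000000);
  const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23);
  const unsigned ELT_SIZE_SHIFT = 32 + 19;   // RSRC_ELEMENT_SIZE_SHIFT
  const unsigned IDX_STRIDE_SHIFT = 32 + 21; // RSRC_INDEX_STRIDE_SHIFT

  uint64_t Rsrc23 = RSRC_DATA_FORMAT | RSRC_TID_ENABLE | 0xffffffff; // Size
  const uint64_t EltSizeValue = 1; // Log2_32(4) - 1
  Rsrc23 |= (EltSizeValue << ELT_SIZE_SHIFT) |
            (UINT64_C(3) << IDX_STRIDE_SHIFT); // IndexStride = 64
  Rsrc23 &= ~RSRC_DATA_FORMAT;                 // VI: clear the stride bits
  std::printf("words 2-3 = 0x%016llx\n", (unsigned long long)Rsrc23);
  return 0;
}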
-bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const {
- unsigned Opc = MI->getOpcode();
+bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
+ unsigned Opc = MI.getOpcode();
return isSMRD(Opc);
}
-bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const {
- unsigned Opc = MI->getOpcode();
+bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
+ unsigned Opc = MI.getOpcode();
return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
}
+
+unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+ unsigned Opc = MI.getOpcode();
+ const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
+ unsigned DescSize = Desc.getSize();
+
+ // If we have a definitive size, we can use it. Otherwise we need to inspect
+ // the operands to know the size.
+ if (DescSize == 8 || DescSize == 4)
+ return DescSize;
+
+ assert(DescSize == 0);
+
+ // 4-byte instructions may have a 32-bit literal encoded after them. Check
+ // operands that could ever be literals.
+ if (isVALU(MI) || isSALU(MI)) {
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ if (Src0Idx == -1)
+ return 4; // No operands.
+
+ if (isLiteralConstant(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx)))
+ return 8;
+
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ if (Src1Idx == -1)
+ return 4;
+
+ if (isLiteralConstant(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx)))
+ return 8;
+
+ return 4;
+ }
+
+ switch (Opc) {
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::DBG_VALUE:
+ case TargetOpcode::BUNDLE:
+ case TargetOpcode::EH_LABEL:
+ return 0;
+ case TargetOpcode::INLINEASM: {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const char *AsmStr = MI.getOperand(0).getSymbolName();
+ return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+ }
+ default:
+ llvm_unreachable("unable to find instruction size");
+ }
+}
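A condensed model of the VALU/SALU arm of this rule, with a hypothetical helper standing in for the isLiteralConstant() checks above: the base encoding is one 32-bit word, and a source operand that must be materialized as a literal appends a second.

#include <cstdio>

// Hypothetical stand-in for the real operand inspection.
static unsigned instSizeInBytes(bool Src0IsLiteral, bool Src1IsLiteral) {
  return (Src0IsLiteral || Src1IsLiteral) ? 8 : 4;
}

int main() {
  std::printf("v_add_f32 v0, v1, v2         -> %u bytes\n",
              instSizeInBytes(false, false));
  std::printf("v_add_f32 v0, 0x12345678, v1 -> %u bytes\n",
              instSizeInBytes(true, false));
  return 0;
}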
+
+ArrayRef<std::pair<int, const char *>>
+SIInstrInfo::getSerializableTargetIndices() const {
+ static const std::pair<int, const char *> TargetIndices[] = {
+ {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
+ return makeArrayRef(TargetIndices);
+}
+
+/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
+/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAG *DAG) const {
+ return new GCNHazardRecognizer(DAG->MF);
+}
+
+/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
+/// pass.
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
+ return new GCNHazardRecognizer(MF);
+}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index cce1ae725611..227b817227c2 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -13,8 +13,8 @@
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H
-#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H
#include "AMDGPUInstrInfo.h"
#include "SIDefines.h"
@@ -22,9 +22,24 @@
namespace llvm {
-class SIInstrInfo : public AMDGPUInstrInfo {
+class SIInstrInfo final : public AMDGPUInstrInfo {
private:
const SIRegisterInfo RI;
+ const SISubtarget &ST;
+
+ // The inverse predicate should have the negative value.
+ enum BranchPredicate {
+ INVALID_BR = 0,
+ SCC_TRUE = 1,
+ SCC_FALSE = -1,
+ VCCNZ = 2,
+ VCCZ = -2,
+ EXECNZ = -3,
+ EXECZ = 3
+ };
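With that numbering, inverting a branch predicate is plain integer negation; a minimal sketch of the convention the comment describes (the helper name is hypothetical):

enum BranchPredicate { INVALID_BR = 0, SCC_TRUE = 1, SCC_FALSE = -1,
                       VCCNZ = 2, VCCZ = -2, EXECNZ = -3, EXECZ = 3 };

static BranchPredicate invertPredicate(BranchPredicate P) {
  return static_cast<BranchPredicate>(-static_cast<int>(P));
}

int main() {
  return invertPredicate(SCC_TRUE) == SCC_FALSE ? 0 : 1;
}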
+
+ static unsigned getBranchOpcode(BranchPredicate Cond);
+ static BranchPredicate getBranchPredicate(unsigned Opcode);
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
MachineRegisterInfo &MRI,
@@ -39,87 +54,89 @@ private:
unsigned SubIdx,
const TargetRegisterClass *SubRC) const;
- void swapOperands(MachineBasicBlock::iterator Inst) const;
+ void swapOperands(MachineInstr &Inst) const;
void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst) const;
+ MachineInstr &Inst) const;
void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst, unsigned Opcode) const;
+ MachineInstr &Inst, unsigned Opcode) const;
void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst, unsigned Opcode) const;
+ MachineInstr &Inst, unsigned Opcode) const;
void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst) const;
+ MachineInstr &Inst) const;
void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst) const;
+ MachineInstr &Inst) const;
void addUsersToMoveToVALUWorklist(
unsigned Reg, MachineRegisterInfo &MRI,
SmallVectorImpl<MachineInstr *> &Worklist) const;
+ void
+ addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst,
+ SmallVectorImpl<MachineInstr *> &Worklist) const;
+
const TargetRegisterClass *
getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
- bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
- MachineInstr *MIb) const;
+ bool checkInstOffsetsDoNotOverlap(MachineInstr &MIa, MachineInstr &MIb) const;
- unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const;
+ unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
protected:
- MachineInstr *commuteInstructionImpl(MachineInstr *MI,
- bool NewMI,
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx0,
unsigned OpIdx1) const override;
public:
- explicit SIInstrInfo(const AMDGPUSubtarget &st);
- const SIRegisterInfo &getRegisterInfo() const override {
+ enum TargetOperandFlags {
+ MO_NONE = 0,
+ MO_GOTPCREL = 1
+ };
+
+ explicit SIInstrInfo(const SISubtarget &);
+
+ const SIRegisterInfo &getRegisterInfo() const {
return RI;
}
- bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
AliasAnalysis *AA) const override;
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
int64_t &Offset1,
int64_t &Offset2) const override;
- bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
- unsigned &Offset,
+ bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset,
const TargetRegisterInfo *TRI) const final;
- bool shouldClusterLoads(MachineInstr *FirstLdSt,
- MachineInstr *SecondLdSt,
- unsigned NumLoads) const final;
+ bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
+ unsigned NumLoads) const final;
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
- unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- RegScavenger *RS,
- unsigned TmpReg,
- unsigned Offset,
- unsigned Size) const;
+ unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, MachineInstr &MI,
+ RegScavenger *RS, unsigned TmpReg,
+ unsigned Offset, unsigned Size) const;
void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ MachineBasicBlock::iterator MI, unsigned SrcReg,
+ bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC,
+ MachineBasicBlock::iterator MI, unsigned DestReg,
+ int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
// \brief Returns an opcode that can be used to move a value to a \p DstRC
// register. If there is no hardware instruction that can store to \p
@@ -129,28 +146,40 @@ public:
LLVM_READONLY
int commuteOpcode(const MachineInstr &MI) const;
- bool findCommutedOpIndices(MachineInstr *MI,
- unsigned &SrcOpIdx1,
+ bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
- bool areMemAccessesTriviallyDisjoint(
- MachineInstr *MIa, MachineInstr *MIb,
- AliasAnalysis *AA = nullptr) const override;
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+
+ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const override;
- MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned DstReg, unsigned SrcReg) const override;
- bool isMov(unsigned Opcode) const override;
+ bool ReverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const override;
- bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
- unsigned Reg, MachineRegisterInfo *MRI) const final;
+ bool
+ areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ AliasAnalysis *AA = nullptr) const override;
+
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ MachineRegisterInfo *MRI) const final;
unsigned getMachineCSELookAheadLimit() const override { return 500; }
MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB,
- MachineBasicBlock::iterator &MI,
+ MachineInstr &MI,
LiveVariables *LV) const override;
+ bool isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const override;
+
static bool isSALU(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::SALU;
}
@@ -167,6 +196,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::VALU;
}
+ static bool isVMEM(const MachineInstr &MI) {
+ return isMUBUF(MI) || isMTBUF(MI) || isMIMG(MI);
+ }
+
+ bool isVMEM(uint16_t Opcode) const {
+ return isMUBUF(Opcode) || isMTBUF(Opcode) || isMIMG(Opcode);
+ }
+
static bool isSOP1(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::SOP1;
}
@@ -279,6 +316,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::MIMG;
}
+ static bool isGather4(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::Gather4;
+ }
+
+ bool isGather4(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::Gather4;
+ }
+
static bool isFLAT(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::FLAT;
}
@@ -303,11 +348,35 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill;
}
+ static bool isDPP(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::DPP;
+ }
+
+ bool isDPP(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::DPP;
+ }
+
+ static bool isScalarUnit(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD);
+ }
+
+ static bool usesVM_CNT(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VM_CNT;
+ }
+
+ bool isVGPRCopy(const MachineInstr &MI) const {
+ assert(MI.isCopy());
+ unsigned Dest = MI.getOperand(0).getReg();
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return !RI.isSGPRReg(MRI, Dest);
+ }
+
bool isInlineConstant(const APInt &Imm) const;
bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
- bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
+ bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const;
/// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
@@ -326,7 +395,7 @@ public:
bool hasModifiersSet(const MachineInstr &MI,
unsigned OpName) const;
- bool verifyInstruction(const MachineInstr *MI,
+ bool verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const override;
static unsigned getVALUOp(const MachineInstr &MI);
@@ -374,11 +443,11 @@ public:
///
/// If the operand being legalized is a register, then a COPY will be used
/// instead of MOV.
- void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const;
+ void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const;
/// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand
/// for \p MI.
- bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
+ bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO = nullptr) const;
/// \brief Check if \p MO would be a valid operand for the given operand
@@ -396,52 +465,38 @@ public:
/// \brief Legalize operands in \p MI by either commuting it or inserting a
/// copy of src1.
- void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const;
+ void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
/// \brief Fix operands in \p MI to satisfy constant bus requirements.
- void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const;
+ void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const;
- /// \brief Legalize all operands in this instruction. This function may
- /// create new instruction and insert them before \p MI.
- void legalizeOperands(MachineInstr *MI) const;
+ /// Copy a value from a VGPR (\p SrcReg) to an SGPR. This function can only
+ /// be used when it is known that the value in \p SrcReg is the same across
+ /// threads in the wave.
+ /// \returns The SGPR register that \p SrcReg was copied to.
+ unsigned readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+ MachineRegisterInfo &MRI) const;
- /// \brief Split an SMRD instruction into two smaller loads of half the
- // size storing the results in \p Lo and \p Hi.
- void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC,
- unsigned HalfImmOp, unsigned HalfSGPROp,
- MachineInstr *&Lo, MachineInstr *&Hi) const;
+ void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const;
- void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI,
- SmallVectorImpl<MachineInstr *> &Worklist) const;
+ /// \brief Legalize all operands in this instruction. This function may
+ /// create new instructions and insert them before \p MI.
+ void legalizeOperands(MachineInstr &MI) const;
/// \brief Replace this instruction's opcode with the equivalent VALU
/// opcode. This function will also move the users of \p MI to the
/// VALU if necessary.
void moveToVALU(MachineInstr &MI) const;
- unsigned calculateIndirectAddress(unsigned RegIndex,
- unsigned Channel) const override;
-
- const TargetRegisterClass *getIndirectAddrRegClass() const override;
+ void insertWaitStates(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ int Count) const;
- MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg,
- unsigned Address,
- unsigned OffsetReg) const override;
+ void insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
- MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg,
- unsigned Address,
- unsigned OffsetReg) const override;
- void reserveIndirectRegisters(BitVector &Reserved,
- const MachineFunction &MF) const;
-
- void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
- unsigned SavReg, unsigned IndexReg) const;
-
- void insertWaitStates(MachineBasicBlock::iterator MI, int Count) const;
+ /// \brief Return the number of wait states that result from executing this
+ /// instruction.
+ unsigned getNumWaitStates(const MachineInstr &MI) const;
/// \brief Returns the operand named \p Op. If \p MI does not have an
/// operand named \c Op, this function returns nullptr.
@@ -463,8 +518,26 @@ public:
uint64_t getDefaultRsrcDataFormat() const;
uint64_t getScratchRsrcWords23() const;
- bool isLowLatencyInstruction(const MachineInstr *MI) const;
- bool isHighLatencyInstruction(const MachineInstr *MI) const;
+ bool isLowLatencyInstruction(const MachineInstr &MI) const;
+ bool isHighLatencyInstruction(const MachineInstr &MI) const;
+
+ /// \brief Return the descriptor of the target-specific machine instruction
+ /// that corresponds to the specified pseudo or native opcode.
+ const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const {
+ return get(pseudoToMCOpcode(Opcode));
+ }
+
+ unsigned getInstSizeInBytes(const MachineInstr &MI) const;
+
+ ArrayRef<std::pair<int, const char *>>
+ getSerializableTargetIndices() const override;
+
+ ScheduleHazardRecognizer *
+ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAG *DAG) const override;
+
+ ScheduleHazardRecognizer *
+ CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
};
namespace AMDGPU {
@@ -490,8 +563,9 @@ namespace AMDGPU {
int getAtomicNoRetOp(uint16_t Opcode);
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
- const uint64_t RSRC_TID_ENABLE = 1LL << 55;
-
+ const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
+ const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
+ const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23);
} // End namespace AMDGPU
namespace SI {
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 8735277149a6..253cc32b27e4 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
def isCI : Predicate<"Subtarget->getGeneration() "
- ">= AMDGPUSubtarget::SEA_ISLANDS">;
+ ">= SISubtarget::SEA_ISLANDS">;
def isCIOnly : Predicate<"Subtarget->getGeneration() =="
- "AMDGPUSubtarget::SEA_ISLANDS">,
+ "SISubtarget::SEA_ISLANDS">,
AssemblerPredicate <"FeatureSeaIslands">;
def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
@@ -69,6 +69,11 @@ class sopk <bits<5> si, bits<5> vi = si> {
field bits<5> VI = vi;
}
+class dsop <bits<8> si, bits<8> vi = si> {
+ field bits<8> SI = si;
+ field bits<8> VI = vi;
+}
+
// Specify an SMRD opcode for SI and SMEM opcode for VI
// FIXME: This should really be bits<5> si, Tablegen crashes if
@@ -78,9 +83,9 @@ class smrd<bits<8> si, bits<8> vi = si> {
field bits<8> VI = vi;
}
-// Execpt for the NONE field, this must be kept in sync with the SISubtarget enum
-// in AMDGPUInstrInfo.cpp
-def SISubtarget {
+// Except for the NONE field, this must be kept in sync with the
+// SIEncodingFamily enum in AMDGPUInstrInfo.cpp
+def SIEncodingFamily {
int NONE = -1;
int SI = 0;
int VI = 1;
@@ -95,6 +100,14 @@ def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
[SDNPMayLoad, SDNPMemOperand]
>;
+def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
SDTypeProfile<0, 13,
[SDTCisVT<0, v4i32>, // rsrc(SGPR)
@@ -120,7 +133,7 @@ def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
>;
class SDSample<string opcode> : SDNode <opcode,
- SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>,
+ SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>,
SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
>;
@@ -129,9 +142,8 @@ def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
-def SIconstdata_ptr : SDNode<
- "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, i64>,
- SDTCisVT<0, i64>]>
+def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
+ SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
>;
//===----------------------------------------------------------------------===//
@@ -140,12 +152,14 @@ def SIconstdata_ptr : SDNode<
class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr),
(ld node:$ptr), [{
- return isFlatLoad(dyn_cast<LoadSDNode>(N)) ||
- isGlobalLoad(dyn_cast<LoadSDNode>(N)) ||
- isConstantLoad(cast<LoadSDNode>(N), -1);
+ const MemSDNode *LD = cast<MemSDNode>(N);
+ return LD->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+ LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
}]>;
def flat_load : flat_ld <load>;
+def atomic_flat_load : flat_ld<atomic_load>;
def flat_az_extloadi8 : flat_ld <az_extloadi8>;
def flat_sextloadi8 : flat_ld <sextloadi8>;
def flat_az_extloadi16 : flat_ld <az_extloadi16>;
@@ -153,26 +167,50 @@ def flat_sextloadi16 : flat_ld <sextloadi16>;
class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr),
(st node:$val, node:$ptr), [{
- return isFlatStore(dyn_cast<StoreSDNode>(N)) ||
- isGlobalStore(dyn_cast<StoreSDNode>(N));
+ const MemSDNode *ST = cast<MemSDNode>(N);
+ return ST->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+ ST->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
def flat_store: flat_st <store>;
+def atomic_flat_store: flat_st <atomic_store>;
def flat_truncstorei8 : flat_st <truncstorei8>;
def flat_truncstorei16 : flat_st <truncstorei16>;
+class MubufLoad <SDPatternOperator op> : PatFrag <
+ (ops node:$ptr), (op node:$ptr), [{
-def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
- return isGlobalLoad(cast<LoadSDNode>(N)) ||
- isConstantLoad(cast<LoadSDNode>(N), -1);
+ const MemSDNode *LD = cast<MemSDNode>(N);
+ return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
}]>;
+def mubuf_load : MubufLoad <load>;
+def mubuf_az_extloadi8 : MubufLoad <az_extloadi8>;
+def mubuf_sextloadi8 : MubufLoad <sextloadi8>;
+def mubuf_az_extloadi16 : MubufLoad <az_extloadi16>;
+def mubuf_sextloadi16 : MubufLoad <sextloadi16>;
+
+def mubuf_load_atomic : MubufLoad <atomic_load>;
+
def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
- return isConstantLoad(cast<LoadSDNode>(N), -1) &&
- static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
+ auto Ld = cast<LoadSDNode>(N);
+ return Ld->getAlignment() >= 4 &&
+ Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
}]>;
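A distilled standalone version of what this predicate demands before a load may select to SMRD; the numeric value used for AMDGPUAS::CONSTANT_ADDRESS here is an assumption for illustration.

struct LoadInfo {
  unsigned Align;      // in bytes
  unsigned AddrSpace;
  bool UniformAcrossWave;
};

constexpr unsigned CONSTANT_ADDRESS = 2; // assumed AMDGPUAS value

static bool isSMRDCandidate(const LoadInfo &L) {
  return L.Align >= 4 && L.AddrSpace == CONSTANT_ADDRESS &&
         L.UniformAcrossWave;
}

int main() {
  const LoadInfo L = {4, CONSTANT_ADDRESS, true};
  return isSMRDCandidate(L) ? 0 : 1;
}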
//===----------------------------------------------------------------------===//
+// PatFrags for global memory operations
+//===----------------------------------------------------------------------===//
+
+def atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
+def atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
+
+def atomic_inc_flat : flat_binary_atomic_op<SIatomic_inc>;
+def atomic_dec_flat : flat_binary_atomic_op<SIatomic_dec>;
+
+//===----------------------------------------------------------------------===//
// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
// to be glued to the memory instructions.
//===----------------------------------------------------------------------===//
@@ -182,7 +220,7 @@ def SIld_local : SDNode <"ISD::LOAD", SDTLoad,
>;
def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{
- return isLocalLoad(cast<LoadSDNode>(N));
+ return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
@@ -219,7 +257,7 @@ def SIst_local : SDNode <"ISD::STORE", SDTStore,
def si_st_local : PatFrag <
(ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{
- return isLocalStore(cast<StoreSDNode>(N));
+ return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
def si_store_local : PatFrag <
@@ -247,9 +285,34 @@ def si_truncstore_local_i16 : PatFrag <
return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16;
}]>;
-multiclass SIAtomicM0Glue2 <string op_name> {
+def si_setcc_uniform : PatFrag <
+ (ops node:$lhs, node:$rhs, node:$cond),
+ (setcc node:$lhs, node:$rhs, node:$cond), [{
+ for (SDNode *Use : N->uses()) {
+ if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg)
+ return false;
+
+ unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
+ if (Reg != AMDGPU::SCC)
+ return false;
+ }
+ return true;
+}]>;
+
+def si_uniform_br : PatFrag <
+ (ops node:$cond, node:$bb), (brcond node:$cond, node:$bb), [{
+ return isUniformBr(N);
+}]>;
+
+def si_uniform_br_scc : PatFrag <
+ (ops node:$cond, node:$bb), (si_uniform_br node:$cond, node:$bb), [{
+ return isCBranchSCC(N);
+}]>;
+
+multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
- def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2,
+ def _glue : SDNode <
+ !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
@@ -257,11 +320,13 @@ multiclass SIAtomicM0Glue2 <string op_name> {
}
defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
+defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
+defm si_atomic_inc : SIAtomicM0Glue2 <"INC", 1>;
+defm si_atomic_dec : SIAtomicM0Glue2 <"DEC", 1>;
defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">;
-defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
@@ -347,6 +412,10 @@ def IMM16bit : PatLeaf <(imm),
[{return isUInt<16>(N->getZExtValue());}]
>;
+def SIMM16bit : PatLeaf <(imm),
+ [{return isInt<16>(N->getSExtValue());}]
+>;
+
def IMM20bit : PatLeaf <(imm),
[{return isUInt<20>(N->getZExtValue());}]
>;
@@ -369,7 +438,7 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
}]>;
class SGPRImm <dag frag> : PatLeaf<frag, [{
- if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) {
return false;
}
const SIRegisterInfo *SIRI =
@@ -402,188 +471,133 @@ def sopp_brtarget : Operand<OtherVT> {
let ParserMatchClass = SoppBrTarget;
}
-def const_ga : Operand<iPTR>;
-
-include "SIInstrFormats.td"
-include "VIInstrFormats.td"
+def si_ga : Operand<iPTR>;
-def MubufOffsetMatchClass : AsmOperandClass {
- let Name = "MubufOffset";
- let ParserMethod = "parseMubufOptionalOps";
- let RenderMethod = "addImmOperands";
+def InterpSlot : Operand<i32> {
+ let PrintMethod = "printInterpSlot";
}
-class DSOffsetBaseMatchClass <string parser> : AsmOperandClass {
- let Name = "DSOffset"#parser;
- let ParserMethod = parser;
+def SendMsgMatchClass : AsmOperandClass {
+ let Name = "SendMsg";
+ let PredicateMethod = "isSendMsg";
+ let ParserMethod = "parseSendMsgOp";
let RenderMethod = "addImmOperands";
- let PredicateMethod = "isDSOffset";
}
-def DSOffsetMatchClass : DSOffsetBaseMatchClass <"parseDSOptionalOps">;
-def DSOffsetGDSMatchClass : DSOffsetBaseMatchClass <"parseDSOffsetOptional">;
-
-def DSOffset01MatchClass : AsmOperandClass {
- let Name = "DSOffset1";
- let ParserMethod = "parseDSOff01OptionalOps";
- let RenderMethod = "addImmOperands";
- let PredicateMethod = "isDSOffset01";
+def SendMsgImm : Operand<i32> {
+ let PrintMethod = "printSendMsg";
+ let ParserMatchClass = SendMsgMatchClass;
}
-class GDSBaseMatchClass <string parser> : AsmOperandClass {
- let Name = "GDS"#parser;
- let PredicateMethod = "isImm";
- let ParserMethod = parser;
+def SWaitMatchClass : AsmOperandClass {
+ let Name = "SWaitCnt";
let RenderMethod = "addImmOperands";
+ let ParserMethod = "parseSWaitCntOps";
}
-def GDSMatchClass : GDSBaseMatchClass <"parseDSOptionalOps">;
-def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">;
-
-class GLCBaseMatchClass <string parser> : AsmOperandClass {
- let Name = "GLC"#parser;
- let PredicateMethod = "isImm";
- let ParserMethod = parser;
- let RenderMethod = "addImmOperands";
+def WAIT_FLAG : Operand <i32> {
+ let ParserMatchClass = SWaitMatchClass;
+ let PrintMethod = "printWaitFlag";
}
-def GLCMubufMatchClass : GLCBaseMatchClass <"parseMubufOptionalOps">;
-def GLCFlatMatchClass : GLCBaseMatchClass <"parseFlatOptionalOps">;
+include "SIInstrFormats.td"
+include "VIInstrFormats.td"
-class SLCBaseMatchClass <string parser> : AsmOperandClass {
- let Name = "SLC"#parser;
- let PredicateMethod = "isImm";
- let ParserMethod = parser;
+class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
+ let Name = "Imm"#CName;
+ let PredicateMethod = "is"#CName;
+ let ParserMethod = !if(Optional, "parseOptionalOperand", "parse"#CName);
let RenderMethod = "addImmOperands";
+ let IsOptional = Optional;
+ let DefaultMethod = !if(Optional, "default"#CName, ?);
}
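For a concrete expansion, NamedMatchClass<"GDS"> resolves its string concatenations to a set of assembler-parser hooks; the resulting record is effectively the sketch below (the named methods are expected to be provided by AMDGPUAsmParser):

// NamedMatchClass<"GDS"> yields, in effect:
//   Name            = "ImmGDS"
//   PredicateMethod = "isGDS"
//   ParserMethod    = "parseOptionalOperand"  (Optional defaults to 1)
//   DefaultMethod   = "defaultGDS"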
-def SLCMubufMatchClass : SLCBaseMatchClass <"parseMubufOptionalOps">;
-def SLCFlatMatchClass : SLCBaseMatchClass <"parseFlatOptionalOps">;
-def SLCFlatAtomicMatchClass : SLCBaseMatchClass <"parseFlatAtomicOptionalOps">;
-
-class TFEBaseMatchClass <string parser> : AsmOperandClass {
- let Name = "TFE"#parser;
- let PredicateMethod = "isImm";
- let ParserMethod = parser;
- let RenderMethod = "addImmOperands";
+class NamedOperandBit<string Name, AsmOperandClass MatchClass> : Operand<i1> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
}
-def TFEMubufMatchClass : TFEBaseMatchClass <"parseMubufOptionalOps">;
-def TFEFlatMatchClass : TFEBaseMatchClass <"parseFlatOptionalOps">;
-def TFEFlatAtomicMatchClass : TFEBaseMatchClass <"parseFlatAtomicOptionalOps">;
-
-def OModMatchClass : AsmOperandClass {
- let Name = "OMod";
- let PredicateMethod = "isImm";
- let ParserMethod = "parseVOP3OptionalOps";
- let RenderMethod = "addImmOperands";
+class NamedOperandU8<string Name, AsmOperandClass MatchClass> : Operand<i8> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
}
-def ClampMatchClass : AsmOperandClass {
- let Name = "Clamp";
- let PredicateMethod = "isImm";
- let ParserMethod = "parseVOP3OptionalOps";
- let RenderMethod = "addImmOperands";
+class NamedOperandU16<string Name, AsmOperandClass MatchClass> : Operand<i16> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
}
-class SMRDOffsetBaseMatchClass <string predicate> : AsmOperandClass {
- let Name = "SMRDOffset"#predicate;
- let PredicateMethod = predicate;
- let RenderMethod = "addImmOperands";
+class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
}
-def SMRDOffsetMatchClass : SMRDOffsetBaseMatchClass <"isSMRDOffset">;
-def SMRDLiteralOffsetMatchClass : SMRDOffsetBaseMatchClass <
- "isSMRDLiteralOffset"
->;
-
let OperandType = "OPERAND_IMMEDIATE" in {
-def offen : Operand<i1> {
- let PrintMethod = "printOffen";
-}
-def idxen : Operand<i1> {
- let PrintMethod = "printIdxen";
-}
-def addr64 : Operand<i1> {
- let PrintMethod = "printAddr64";
-}
-def mbuf_offset : Operand<i16> {
- let PrintMethod = "printMBUFOffset";
- let ParserMatchClass = MubufOffsetMatchClass;
-}
-class ds_offset_base <AsmOperandClass mc> : Operand<i16> {
- let PrintMethod = "printDSOffset";
- let ParserMatchClass = mc;
-}
-def ds_offset : ds_offset_base <DSOffsetMatchClass>;
-def ds_offset_gds : ds_offset_base <DSOffsetGDSMatchClass>;
+def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>;
+def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>;
+def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>;
-def ds_offset0 : Operand<i8> {
- let PrintMethod = "printDSOffset0";
- let ParserMatchClass = DSOffset01MatchClass;
-}
-def ds_offset1 : Operand<i8> {
- let PrintMethod = "printDSOffset1";
- let ParserMatchClass = DSOffset01MatchClass;
-}
-class gds_base <AsmOperandClass mc> : Operand <i1> {
- let PrintMethod = "printGDS";
- let ParserMatchClass = mc;
-}
-def gds : gds_base <GDSMatchClass>;
+def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>;
+def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>;
+def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>;
-def gds01 : gds_base <GDS01MatchClass>;
+def gds : NamedOperandBit<"GDS", NamedMatchClass<"GDS">>;
-class glc_base <AsmOperandClass mc> : Operand <i1> {
- let PrintMethod = "printGLC";
- let ParserMatchClass = mc;
-}
+def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>;
+def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>;
-def glc : glc_base <GLCMubufMatchClass>;
-def glc_flat : glc_base <GLCFlatMatchClass>;
+def smrd_offset : NamedOperandU32<"SMRDOffset", NamedMatchClass<"SMRDOffset">>;
+def smrd_literal_offset : NamedOperandU32<"SMRDLiteralOffset", NamedMatchClass<"SMRDLiteralOffset">>;
-class slc_base <AsmOperandClass mc> : Operand <i1> {
- let PrintMethod = "printSLC";
- let ParserMatchClass = mc;
-}
+def glc : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
+def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
+def tfe : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
+def unorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
+def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
+def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>;
+def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
-def slc : slc_base <SLCMubufMatchClass>;
-def slc_flat : slc_base <SLCFlatMatchClass>;
-def slc_flat_atomic : slc_base <SLCFlatAtomicMatchClass>;
+def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
-class tfe_base <AsmOperandClass mc> : Operand <i1> {
- let PrintMethod = "printTFE";
- let ParserMatchClass = mc;
-}
+def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
+def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>;
+def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>;
+def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>;
-def tfe : tfe_base <TFEMubufMatchClass>;
-def tfe_flat : tfe_base <TFEFlatMatchClass>;
-def tfe_flat_atomic : tfe_base <TFEFlatAtomicMatchClass>;
+def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>;
+def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>;
+def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>;
+def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>;
-def omod : Operand <i32> {
- let PrintMethod = "printOModSI";
- let ParserMatchClass = OModMatchClass;
-}
+def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
+
+} // End OperandType = "OPERAND_IMMEDIATE"
-def ClampMod : Operand <i1> {
- let PrintMethod = "printClampSI";
- let ParserMatchClass = ClampMatchClass;
-}
-def smrd_offset : Operand <i32> {
- let PrintMethod = "printU32ImmOperand";
- let ParserMatchClass = SMRDOffsetMatchClass;
+def VOPDstS64 : VOPDstOperand <SReg_64>;
+
+def FPInputModsMatchClass : AsmOperandClass {
+ let Name = "RegOrImmWithFPInputMods";
+ let ParserMethod = "parseRegOrImmWithFPInputMods";
+ let PredicateMethod = "isRegOrImmWithInputMods";
}
-def smrd_literal_offset : Operand <i32> {
- let PrintMethod = "printU32ImmOperand";
- let ParserMatchClass = SMRDLiteralOffsetMatchClass;
+def FPInputMods : Operand <i32> {
+ let PrintMethod = "printOperandAndFPInputMods";
+ let ParserMatchClass = FPInputModsMatchClass;
}
-} // End OperandType = "OPERAND_IMMEDIATE"
+def IntInputModsMatchClass : AsmOperandClass {
+ let Name = "RegOrImmWithIntInputMods";
+ let ParserMethod = "parseRegOrImmWithIntInputMods";
+ let PredicateMethod = "isRegOrImmWithInputMods";
+}
-def VOPDstS64 : VOPDstOperand <SReg_64>;
+def IntInputMods: Operand <i32> {
+ let PrintMethod = "printOperandAndIntInputMods";
+ let ParserMatchClass = IntInputModsMatchClass;
+}
//===----------------------------------------------------------------------===//
// Complex patterns
@@ -595,9 +609,13 @@ def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
+def FLATAtomic : ComplexPattern<i64, 3, "SelectFlat">;
def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
+def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
+def MUBUFIntrinsicOffset : ComplexPattern<i32, 2, "SelectMUBUFIntrinsicOffset">;
+def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset">;
def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">;
@@ -606,6 +624,8 @@ def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
+def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">;
+
def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
@@ -670,17 +690,24 @@ class EXPCommon : InstSI<
let EXP_CNT = 1;
let Uses = [EXEC];
+ let SchedRW = [WriteExport];
}
multiclass EXP_m {
let isPseudo = 1, isCodeGenOnly = 1 in {
- def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ;
+ def "" : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.NONE> ;
}
- def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe;
+ def _si : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.SI>, EXPe {
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+ }
- def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi;
+ def _vi : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.VI>, EXPe_vi {
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
+ }
}
//===----------------------------------------------------------------------===//
@@ -689,7 +716,7 @@ multiclass EXP_m {
class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
SOP1 <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -697,17 +724,21 @@ class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm> :
SOP1 <outs, ins, asm, []>,
SOP1e <op.SI>,
- SIMCInstr<opName, SISubtarget.SI> {
+ SIMCInstr<opName, SIEncodingFamily.SI> {
let isCodeGenOnly = 0;
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm> :
SOP1 <outs, ins, asm, []>,
SOP1e <op.VI>,
- SIMCInstr<opName, SISubtarget.VI> {
+ SIMCInstr<opName, SIEncodingFamily.VI> {
let isCodeGenOnly = 0;
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
multiclass SOP1_m <sop1 op, string opName, dag outs, dag ins, string asm,
@@ -722,27 +753,27 @@ multiclass SOP1_m <sop1 op, string opName, dag outs, dag ins, string asm,
}
multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
- op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0),
- opName#" $dst, $src0", pattern
+ op, opName, (outs SReg_32:$sdst), (ins SSrc_32:$src0),
+ opName#" $sdst, $src0", pattern
>;
multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
- op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0),
- opName#" $dst, $src0", pattern
+ op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0),
+ opName#" $sdst, $src0", pattern
>;
// No input, 64-bit output.
multiclass SOP1_64_0 <sop1 op, string opName, list<dag> pattern> {
- def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins), pattern>;
+ def "" : SOP1_Pseudo <opName, (outs SReg_64:$sdst), (ins), pattern>;
- def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins),
- opName#" $dst"> {
- let ssrc0 = 0;
+ def _si : SOP1_Real_si <op, opName, (outs SReg_64:$sdst), (ins),
+ opName#" $sdst"> {
+ let src0 = 0;
}
- def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins),
- opName#" $dst"> {
- let ssrc0 = 0;
+ def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$sdst), (ins),
+ opName#" $sdst"> {
+ let src0 = 0;
}
}
@@ -763,13 +794,19 @@ multiclass SOP1_1 <sop1 op, string opName, list<dag> pattern> {
// 64-bit input, 32-bit output.
multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
- op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0),
- opName#" $dst, $src0", pattern
+ op, opName, (outs SReg_32:$sdst), (ins SSrc_64:$src0),
+ opName#" $sdst, $src0", pattern
+>;
+
+// 32-bit input, 64-bit output.
+multiclass SOP1_64_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
+ op, opName, (outs SReg_64:$sdst), (ins SSrc_32:$src0),
+ opName#" $sdst, $src0", pattern
>;
class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> :
SOP2<outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let Size = 4;
@@ -784,15 +821,19 @@ class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> :
class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm> :
SOP2<outs, ins, asm, []>,
SOP2e<op.SI>,
- SIMCInstr<opName, SISubtarget.SI> {
+ SIMCInstr<opName, SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> :
SOP2<outs, ins, asm, []>,
SOP2e<op.VI>,
- SIMCInstr<opName, SISubtarget.VI> {
+ SIMCInstr<opName, SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm,
@@ -807,36 +848,49 @@ multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm,
}
multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
- op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
- opName#" $dst, $src0, $src1", pattern
+ op, opName, (outs SReg_32:$sdst), (ins SSrc_32:$src0, SSrc_32:$src1),
+ opName#" $sdst, $src0, $src1", pattern
>;
multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
- op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
- opName#" $dst, $src0, $src1", pattern
+ op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0, SSrc_64:$src1),
+ opName#" $sdst, $src0, $src1", pattern
>;
multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
- op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
- opName#" $dst, $src0, $src1", pattern
+ op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0, SSrc_32:$src1),
+ opName#" $sdst, $src0, $src1", pattern
>;
-class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
- string opName, PatLeaf cond> : SOPC <
- op, (outs), (ins rc:$src0, rc:$src1),
- opName#" $src0, $src1", []> {
+multiclass SOP2_64_32_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
+ op, opName, (outs SReg_64:$sdst), (ins SSrc_32:$src0, SSrc_32:$src1),
+ opName#" $sdst, $src0, $src1", pattern
+>;
+
+class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1,
+ string opName, list<dag> pattern = []> : SOPC <
+ op, (outs), (ins rc0:$src0, rc1:$src1),
+ opName#" $src0, $src1", pattern > {
let Defs = [SCC];
}
+class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
+ string opName, PatLeaf cond> : SOPC_Base <
+ op, rc, rc, opName,
+ [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > {
+}
-class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL>
+class SOPC_CMP_32<bits<7> op, string opName, PatLeaf cond = COND_NULL>
: SOPC_Helper<op, SSrc_32, i32, opName, cond>;
-class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL>
- : SOPC_Helper<op, SSrc_64, i64, opName, cond>;
+class SOPC_32<bits<7> op, string opName, list<dag> pattern = []>
+ : SOPC_Base<op, SSrc_32, SSrc_32, opName, pattern>;
+
+class SOPC_64_32<bits<7> op, string opName, list<dag> pattern = []>
+ : SOPC_Base<op, SSrc_64, SSrc_32, opName, pattern>;
class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
SOPK <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -844,16 +898,20 @@ class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm> :
SOPK <outs, ins, asm, []>,
SOPKe <op.SI>,
- SIMCInstr<opName, SISubtarget.SI> {
+ SIMCInstr<opName, SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
let isCodeGenOnly = 0;
}
class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm> :
SOPK <outs, ins, asm, []>,
SOPKe <op.VI>,
- SIMCInstr<opName, SISubtarget.VI> {
+ SIMCInstr<opName, SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
let isCodeGenOnly = 0;
}
@@ -868,14 +926,14 @@ multiclass SOPK_m <sopk op, string opName, dag outs, dag ins, string opAsm,
}
multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> {
- def "" : SOPK_Pseudo <opName, (outs SReg_32:$dst), (ins u16imm:$src0),
+ def "" : SOPK_Pseudo <opName, (outs SReg_32:$sdst), (ins u16imm:$simm16),
pattern>;
- def _si : SOPK_Real_si <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0),
- opName#" $dst, $src0">;
+ def _si : SOPK_Real_si <op, opName, (outs SReg_32:$sdst), (ins u16imm:$simm16),
+ opName#" $sdst, $simm16">;
- def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0),
- opName#" $dst, $src0">;
+ def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$sdst), (ins u16imm:$simm16),
+ opName#" $sdst, $simm16">;
}
multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> {
@@ -908,15 +966,19 @@ multiclass SOPK_IMM32 <sopk op, string opName, dag outs, dag ins,
def _si : SOPK <outs, ins, asm, []>,
SOPK64e <op.SI>,
- SIMCInstr<opName, SISubtarget.SI> {
+ SIMCInstr<opName, SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
let isCodeGenOnly = 0;
}
def _vi : SOPK <outs, ins, asm, []>,
SOPK64e <op.VI>,
- SIMCInstr<opName, SISubtarget.VI> {
+ SIMCInstr<opName, SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
let isCodeGenOnly = 0;
}
}
@@ -926,86 +988,145 @@ multiclass SOPK_IMM32 <sopk op, string opName, dag outs, dag ins,
class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
SMRD <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
-class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins,
- string asm> :
+class SMRD_IMM_Real_si <bits<5> op, string opName, dag outs, dag ins,
+ string asm> :
+ SMRD <outs, ins, asm, []>,
+ SMRD_IMMe <op>,
+ SIMCInstr<opName, SIEncodingFamily.SI> {
+ let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
+
+class SMRD_SOFF_Real_si <bits<5> op, string opName, dag outs, dag ins,
+ string asm> :
SMRD <outs, ins, asm, []>,
- SMRDe <op, imm>,
- SIMCInstr<opName, SISubtarget.SI> {
+ SMRD_SOFFe <op>,
+ SIMCInstr<opName, SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
+
+class SMRD_IMM_Real_vi <bits<8> op, string opName, dag outs, dag ins,
+ string asm, list<dag> pattern = []> :
+ SMRD <outs, ins, asm, pattern>,
+ SMEM_IMMe_vi <op>,
+ SIMCInstr<opName, SIEncodingFamily.VI> {
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
-class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins,
- string asm, list<dag> pattern = []> :
+class SMRD_SOFF_Real_vi <bits<8> op, string opName, dag outs, dag ins,
+ string asm, list<dag> pattern = []> :
SMRD <outs, ins, asm, pattern>,
- SMEMe_vi <op, imm>,
- SIMCInstr<opName, SISubtarget.VI> {
+ SMEM_SOFFe_vi <op>,
+ SIMCInstr<opName, SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
-multiclass SMRD_m <smrd op, string opName, bit imm, dag outs, dag ins,
+
+multiclass SMRD_IMM_m <smrd op, string opName, dag outs, dag ins,
string asm, list<dag> pattern> {
def "" : SMRD_Pseudo <opName, outs, ins, pattern>;
- def _si : SMRD_Real_si <op.SI, opName, imm, outs, ins, asm>;
+ def _si : SMRD_IMM_Real_si <op.SI, opName, outs, ins, asm>;
// glc is only applicable to scalar stores, which are not yet
// implemented.
let glc = 0 in {
- def _vi : SMRD_Real_vi <op.VI, opName, imm, outs, ins, asm>;
+ def _vi : SMRD_IMM_Real_vi <op.VI, opName, outs, ins, asm>;
}
}
-multiclass SMRD_Inval <smrd op, string opName,
- SDPatternOperator node> {
- let hasSideEffects = 1, mayStore = 1 in {
- def "" : SMRD_Pseudo <opName, (outs), (ins), [(node)]>;
+multiclass SMRD_SOFF_m <smrd op, string opName, dag outs, dag ins,
+ string asm, list<dag> pattern> {
- let sbase = 0, offset = 0 in {
- let sdst = 0 in {
- def _si : SMRD_Real_si <op.SI, opName, 0, (outs), (ins), opName>;
- }
+ def "" : SMRD_Pseudo <opName, outs, ins, pattern>;
+
+ def _si : SMRD_SOFF_Real_si <op.SI, opName, outs, ins, asm>;
+
+ // glc is only applicable to scalar stores, which are not yet
+ // implemented.
+ let glc = 0 in {
+ def _vi : SMRD_SOFF_Real_vi <op.VI, opName, outs, ins, asm>;
+ }
+}
+
+multiclass SMRD_Special <smrd op, string opName, dag outs,
+ int sdst_ = ?,
+ string opStr = "",
+ list<dag> pattern = []> {
+ let hasSideEffects = 1 in {
+ def "" : SMRD_Pseudo <opName, outs, (ins), pattern>;
+
+ let sbase = 0, soff = 0, sdst = sdst_ in {
+ def _si : SMRD_SOFF_Real_si <op.SI, opName, outs, (ins), opName#opStr>;
- let glc = 0, sdata = 0 in {
- def _vi : SMRD_Real_vi <op.VI, opName, 0, (outs), (ins), opName>;
+ let glc = 0 in {
+ def _vi : SMRD_SOFF_Real_vi <op.VI, opName, outs, (ins), opName#opStr>;
}
}
}
}
+multiclass SMRD_Inval <smrd op, string opName,
+ SDPatternOperator node> {
+ let mayStore = 1 in {
+ defm : SMRD_Special<op, opName, (outs), 0, "", [(node)]>;
+ }
+}
+
class SMEM_Inval <bits<8> op, string opName, SDPatternOperator node> :
- SMRD_Real_vi<op, opName, 0, (outs), (ins), opName, [(node)]> {
+ SMRD_SOFF_Real_vi<op, opName, (outs), (ins), opName, [(node)]> {
let hasSideEffects = 1;
let mayStore = 1;
let sbase = 0;
- let sdata = 0;
+ let sdst = 0;
+ let glc = 0;
+ let soff = 0;
+}
+
+class SMEM_Ret <bits<8> op, string opName, SDPatternOperator node> :
+ SMRD_SOFF_Real_vi<op, opName, (outs SReg_64:$sdst), (ins),
+ opName#" $sdst", [(set i64:$sdst, (node))]> {
+ let hasSideEffects = 1;
+ let mayStore = ?;
+ let mayLoad = ?;
+ let sbase = 0;
let glc = 0;
- let offset = 0;
+ let soff = 0;
}
multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass,
RegisterClass dstClass> {
- defm _IMM : SMRD_m <
- op, opName#"_IMM", 1, (outs dstClass:$dst),
+ defm _IMM : SMRD_IMM_m <
+ op, opName#"_IMM", (outs dstClass:$sdst),
(ins baseClass:$sbase, smrd_offset:$offset),
- opName#" $dst, $sbase, $offset", []
+ opName#" $sdst, $sbase, $offset", []
>;
def _IMM_ci : SMRD <
- (outs dstClass:$dst), (ins baseClass:$sbase, smrd_literal_offset:$offset),
- opName#" $dst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> {
+ (outs dstClass:$sdst), (ins baseClass:$sbase, smrd_literal_offset:$offset),
+ opName#" $sdst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> {
let AssemblerPredicates = [isCIOnly];
+ let DecoderNamespace = "CI";
}
- defm _SGPR : SMRD_m <
- op, opName#"_SGPR", 0, (outs dstClass:$dst),
+ defm _SGPR : SMRD_SOFF_m <
+ op, opName#"_SGPR", (outs dstClass:$sdst),
(ins baseClass:$sbase, SReg_32:$soff),
- opName#" $dst, $sbase, $soff", []
+ opName#" $sdst, $sbase, $soff", []
>;
}
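As a representative use of this helper (hypothetical here, the actual defm lines live with the instruction definitions), a scalar load instantiates the immediate-offset and SGPR-offset forms, plus the CI literal-offset variant:

// Hypothetical instantiation, for illustration only:
defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SReg_32>;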
@@ -1013,20 +1134,6 @@ multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass,
// Vector ALU classes
//===----------------------------------------------------------------------===//
-// This must always be right before the operand being input modified.
-def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> {
- let PrintMethod = "printOperandAndMods";
-}
-
-def InputModsMatchClass : AsmOperandClass {
- let Name = "RegWithInputMods";
-}
-
-def InputModsNoDefault : Operand <i32> {
- let PrintMethod = "printOperandAndMods";
- let ParserMatchClass = InputModsMatchClass;
-}
-
class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> {
int ret =
!if (!eq(Src0.Value, untyped.Value), 0,
@@ -1050,12 +1157,12 @@ class getVOPSrc0ForVT<ValueType VT> {
RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32);
}
-// Returns the register class to use for source 1 of VOP[12C] for the
-// given VT.
-class getVOPSrc1ForVT<ValueType VT> {
+// Returns the vreg register class to use for source operand given VT
+class getVregSrcForVT<ValueType VT> {
RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32);
}
+
// Returns the register class to use for sources of VOP3 instructions for the
// given VT.
class getVOP3SrcForVT<ValueType VT> {
@@ -1072,8 +1179,10 @@ class getVOP3SrcForVT<ValueType VT> {
// Returns 1 if the source arguments have modifiers, 0 if they do not.
// XXX - do f16 instructions have source modifiers?
class hasModifiers<ValueType SrcVT> {
- bit ret = !if(!eq(SrcVT.Value, f32.Value), 1,
- !if(!eq(SrcVT.Value, f64.Value), 1, 0));
+ bit ret =
+ !if(!eq(SrcVT.Value, f32.Value), 1,
+ !if(!eq(SrcVT.Value, f64.Value), 1,
+ 0));
}
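A quick illustration of how this evaluates (the records below are throwaway examples, not part of the file): floating-point source types report modifiers, integer types do not.

// Illustrative only:
def HasModsF32 { bit ret = hasModifiers<f32>.ret; } // 1
def HasModsF64 { bit ret = hasModifiers<f64>.ret; } // 1
def HasModsI32 { bit ret = hasModifiers<i32>.ret; } // 0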
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
@@ -1089,11 +1198,15 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
bit HasModifiers> {
dag ret =
+ !if (!eq(NumSrcArgs, 0),
+ // VOP1 without input operands (V_NOP, V_CLREXCP)
+ (ins),
+ /* else */
!if (!eq(NumSrcArgs, 1),
!if (!eq(HasModifiers, 1),
// VOP1 with modifiers
- (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0,
- ClampMod:$clamp, omod:$omod)
+ (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
+ clampmod:$clamp, omod:$omod)
/* else */,
// VOP1 without modifiers
(ins Src0RC:$src0)
@@ -1101,9 +1214,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
!if (!eq(NumSrcArgs, 2),
!if (!eq(HasModifiers, 1),
// VOP 2 with modifiers
- (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0,
- InputModsNoDefault:$src1_modifiers, Src1RC:$src1,
- ClampMod:$clamp, omod:$omod)
+ (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
+ FPInputMods:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp, omod:$omod)
/* else */,
// VOP2 without modifiers
(ins Src0RC:$src0, Src1RC:$src1)
@@ -1111,21 +1224,109 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
/* NumSrcArgs == 3 */,
!if (!eq(HasModifiers, 1),
// VOP3 with modifiers
- (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0,
- InputModsNoDefault:$src1_modifiers, Src1RC:$src1,
- InputModsNoDefault:$src2_modifiers, Src2RC:$src2,
- ClampMod:$clamp, omod:$omod)
+ (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
+ FPInputMods:$src1_modifiers, Src1RC:$src1,
+ FPInputMods:$src2_modifiers, Src2RC:$src2,
+ clampmod:$clamp, omod:$omod)
/* else */,
// VOP3 without modifiers
(ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2)
- /* endif */ )));
+ /* endif */ ))));
+}
+
+class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
+ bit HasModifiers> {
+
+ dag ret = !if (!eq(NumSrcArgs, 0),
+ // VOP1 without input operands (V_NOP)
+ (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl),
+ !if (!eq(NumSrcArgs, 1),
+ !if (!eq(HasModifiers, 1),
+ // VOP1_DPP with modifiers
+ (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+ /* else */,
+ // VOP1_DPP without modifiers
+ (ins Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+ /* endif */)
+ /* NumSrcArgs == 2 */,
+ !if (!eq(HasModifiers, 1),
+ // VOP2_DPP with modifiers
+ (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
+ FPInputMods:$src1_modifiers, Src1RC:$src1,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+ /* else */,
+ // VOP2_DPP without modifiers
+ (ins Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl,
+ row_mask:$row_mask, bank_mask:$bank_mask,
+ bound_ctrl:$bound_ctrl)
+ /* endif */)));
+}
+
+class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
+ bit HasFloatModifiers, ValueType DstVT> {
+
+ dag ret = !if(!eq(NumSrcArgs, 0),
+ // VOP1 without input operands (V_NOP)
+ (ins),
+ !if(!eq(NumSrcArgs, 1),
+ !if(HasFloatModifiers,
+ // VOP1_SDWA with float modifiers
+ (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel)
+ /* else */,
+ // VOP1_SDWA with sext modifier
+ (ins IntInputMods:$src0_imodifiers, Src0RC:$src0,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel)
+ /* endif */)
+ /* NumSrcArgs == 2 */,
+ !if(HasFloatModifiers,
+ !if(!eq(DstVT.Size, 1),
+ // VOPC_SDWA with float modifiers
+ (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0,
+ FPInputMods:$src1_fmodifiers, Src1RC:$src1,
+ clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
+ // VOP2_SDWA with float modifiers (the VOPC case is handled above)
+ (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0,
+ FPInputMods:$src1_fmodifiers, Src1RC:$src1,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel)
+ ),
+ /* else */
+ !if(!eq(DstVT.Size, 1),
+ // VOPC_SDWA with sext modifiers
+ (ins IntInputMods:$src0_imodifiers, Src0RC:$src0,
+ IntInputMods:$src1_imodifiers, Src1RC:$src1,
+ clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
+ // VOP2_SDWA with sext modifier (the VOPC case is handled above)
+ (ins IntInputMods:$src0_imodifiers, Src0RC:$src0,
+ IntInputMods:$src1_imodifiers, Src1RC:$src1,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel)
+ )
+ /* endif */)));
+}
+
+// Outs for DPP and SDWA
+class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCDPP> {
+ dag ret = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+ (outs), // no dst for VOPC; SDWA VOPC instructions use the "vcc" token as dst
+ (outs DstRCDPP:$vdst)),
+ (outs)); // V_NOP
}
// Returns the assembly string for the inputs and outputs of a VOP[12C]
// instruction. This does not add the _e32 suffix, so it can be reused
// by getAsm64.
-class getAsm32 <bit HasDst, int NumSrcArgs> {
- string dst = "$dst";
+class getAsm32 <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
+ string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC
string src0 = ", $src0";
string src1 = ", $src1";
string src2 = ", $src2";
@@ -1137,7 +1338,8 @@ class getAsm32 <bit HasDst, int NumSrcArgs> {
// Returns the assembly string for the inputs and outputs of a VOP3
// instruction.
-class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers> {
+class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+ string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC
string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
string src1 = !if(!eq(NumSrcArgs, 1), "",
!if(!eq(NumSrcArgs, 2), " $src1_modifiers",
@@ -1145,8 +1347,71 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers> {
string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
string ret =
!if(!eq(HasModifiers, 0),
- getAsm32<HasDst, NumSrcArgs>.ret,
- "$dst, "#src0#src1#src2#"$clamp"#"$omod");
+ getAsm32<HasDst, NumSrcArgs, DstVT>.ret,
+ dst#", "#src0#src1#src2#"$clamp"#"$omod");
+}
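Traced by hand from the definitions above (so treat the exact strings as illustrative), these helpers produce assembly skeletons like:

// getAsm32<1, 2, f32>.ret    -> "$vdst, $src0, $src1"
// getAsm64<1, 1, 1, f32>.ret -> "$vdst, $src0_modifiers$clamp$omod"
// getAsm64<1, 2, 1, f32>.ret -> "$vdst, $src0_modifiers, $src1_modifiers$clamp$omod"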
+
+class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+ string dst = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+ "$sdst",
+ "$vdst"),
+ ""); // use $sdst for VOPC
+ string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
+ string src1 = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
+ " $src1_modifiers,"));
+ string args = !if(!eq(HasModifiers, 0),
+ getAsm32<0, NumSrcArgs, DstVT>.ret,
+ ", "#src0#src1);
+ string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+}
+
+class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers,
+ ValueType DstVT = i32> {
+ string dst = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+ " vcc", // use vcc token as dst for VOPC instructioins
+ "$vdst"),
+ "");
+ string src0 = !if(HasFloatModifiers, "$src0_fmodifiers", "$src0_imodifiers");
+ string src1 = !if(HasFloatModifiers, "$src1_fmodifiers", "$src1_imodifiers");
+ string args = !if(!eq(NumSrcArgs, 0),
+ "",
+ !if(!eq(NumSrcArgs, 1),
+ ", "#src0#"$clamp",
+ ", "#src0#", "#src1#"$clamp"
+ )
+ );
+ string sdwa = !if(!eq(NumSrcArgs, 0),
+ "",
+ !if(!eq(NumSrcArgs, 1),
+ " $dst_sel $dst_unused $src0_sel",
+ !if(!eq(DstVT.Size, 1),
+ " $src0_sel $src1_sel", // No dst_sel and dst_unused for VOPC
+ " $dst_sel $dst_unused $src0_sel $src1_sel"
+ )
+ )
+ );
+ string ret = dst#args#sdwa;
+}
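The DPP and SDWA variants follow the same shape, appending their control operands; hand-traced examples (again illustrative):

// getAsmDPP<1, 2, 1, f32>.ret ->
//   "$vdst, $src0_modifiers, $src1_modifiers $dpp_ctrl$row_mask$bank_mask$bound_ctrl"
// getAsmSDWA<1, 2, 1, f32>.ret ->
//   "$vdst, $src0_fmodifiers, $src1_fmodifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"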
+
+// Checks whether an instruction supports the DPP and SDWA extensions.
+class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+ ValueType Src1VT = i32> {
+ bit ret = !if(!eq(NumSrcArgs, 3),
+ 0, // NumSrcArgs == 3 - No DPP or SDWA for VOP3
+ !if(!eq(DstVT.Size, 64),
+ 0, // 64-bit dst - No DPP or SDWA for 64-bit operands
+ !if(!eq(Src0VT.Size, 64),
+ 0, // 64-bit src0
+ !if(!eq(Src1VT.Size, 64),
+ 0, // 64-bit src1
+ 1
+ )
+ )
+ )
+ );
}
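Worked through for a few common profiles (assuming the usual profile definitions in this file), this evaluates to:

// VOP_F32_F32          (1 source, 32-bit)     -> HasExt = 1
// VOP_F64_F64          (64-bit dst and src0)  -> HasExt = 0
// VOP_F32_F32_F32_F32  (3 sources, VOP3 only) -> HasExt = 0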
class VOPProfile <list<ValueType> _ArgVT> {
@@ -1158,30 +1423,48 @@ class VOPProfile <list<ValueType> _ArgVT> {
field ValueType Src1VT = ArgVT[2];
field ValueType Src2VT = ArgVT[3];
field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret;
+ field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret;
+ field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret;
field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;
- field RegisterClass Src1RC32 = getVOPSrc1ForVT<Src1VT>.ret;
+ field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret;
field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret;
field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
+ field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret;
+ field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
+ field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret;
+ field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret;
field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
field bit HasDst32 = HasDst;
field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret;
field bit HasModifiers = hasModifiers<Src0VT>.ret;
- field dag Outs = !if(HasDst,(outs DstRC:$dst),(outs));
+ field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+
+ field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs));
// VOP3b instructions are a special case with a second explicit
// output. This is manually overridden for them.
field dag Outs32 = Outs;
field dag Outs64 = Outs;
+ field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
+ field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
HasModifiers>.ret;
+ field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, HasModifiers>.ret;
+ field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, HasModifiers, DstVT>.ret;
+
+ field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
+ field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+ field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+ field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+}
- field string Asm32 = getAsm32<HasDst, NumSrcArgs>.ret;
- field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers>.ret;
+class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
+ let HasExt = 0;
}
// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order
@@ -1194,6 +1477,9 @@ def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>;
+def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
+
def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>;
def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>;
@@ -1216,10 +1502,10 @@ def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
// Write out to vcc or arbitrary SGPR.
def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
- let Asm32 = "$dst, vcc, $src0, $src1";
- let Asm64 = "$dst, $sdst, $src0, $src1";
- let Outs32 = (outs DstRC:$dst);
- let Outs64 = (outs DstRC:$dst, SReg_64:$sdst);
+ let Asm32 = "$vdst, vcc, $src0, $src1";
+ let Asm64 = "$vdst, $sdst, $src0, $src1";
+ let Outs32 = (outs DstRC:$vdst);
+ let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
}
// Write out to vcc or arbitrary SGPR and read in from vcc or
@@ -1231,10 +1517,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
// restriction. SGPRs are still allowed because it should
// technically be possible to use VCC again as src0.
let Src0RC32 = VCSrc_32;
- let Asm32 = "$dst, vcc, $src0, $src1, vcc";
- let Asm64 = "$dst, $sdst, $src0, $src1, $src2";
- let Outs32 = (outs DstRC:$dst);
- let Outs64 = (outs DstRC:$dst, SReg_64:$sdst);
+ let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
+ let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
+ let Outs32 = (outs DstRC:$vdst);
+ let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+
+ // Suppress src2 implied by type since the 32-bit encoding uses an
+ // implicit VCC use.
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
+}
+
+// Read in from vcc or arbitrary SGPR
+def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
+ let Src0RC32 = VCSrc_32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above.
+ let Asm32 = "$vdst, $src0, $src1, vcc";
+ let Asm64 = "$vdst, $src0, $src1, $src2";
+ let Outs32 = (outs DstRC:$vdst);
+ let Outs64 = (outs DstRC:$vdst);
// Suppress src2 implied by type since the 32-bit encoding uses an
// implicit VCC use.
@@ -1263,11 +1562,17 @@ class VOPC_Profile<ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, v
let Asm32 = "vcc, $src0, $src1";
// The destination for 32-bit encoding is implicit.
let HasDst32 = 0;
+ let Outs64 = (outs DstRC:$sdst);
}
class VOPC_Class_Profile<ValueType vt> : VOPC_Profile<vt, i32> {
- let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
- let Asm64 = "$dst, $src0_modifiers, $src1";
+ let Ins64 = (ins FPInputMods:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
+ let Asm64 = "$sdst, $src0_modifiers, $src1";
+ let InsSDWA = (ins FPInputMods:$src0_fmodifiers, Src0RC64:$src0,
+ IntInputMods:$src1_imodifiers, Src1RC64:$src1,
+ clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let AsmSDWA = " vcc, $src0_fmodifiers, $src1_imodifiers$clamp $src0_sel $src1_sel";
+
}
def VOPC_I1_F32_F32 : VOPC_Profile<f32>;
@@ -1281,28 +1586,42 @@ def VOPC_I1_F64_I32 : VOPC_Class_Profile<f64>;
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>;
-def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> {
- let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
- let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2);
- let Asm64 = "$dst, $src0, $src1, $src2";
-}
def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>;
-def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> {
- field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2);
- field string Asm = "$dst, $src0, $vsrc1, $src2";
+def VOP_MADAK : VOPProfile <[f32, f32, f32, f32]> {
+ field dag Ins32 = (ins VCSrc_32:$src0, VGPR_32:$src1, u32kimm:$imm);
+ field string Asm32 = "$vdst, $src0, $src1, $imm";
+ field bit HasExt = 0;
+}
+def VOP_MADMK : VOPProfile <[f32, f32, f32, f32]> {
+ field dag Ins32 = (ins VCSrc_32:$src0, u32kimm:$imm, VGPR_32:$src1);
+ field string Asm32 = "$vdst, $src0, $imm, $src1";
+ field bit HasExt = 0;
}
def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
HasModifiers>.ret;
- let Asm32 = getAsm32<1, 2>.ret;
- let Asm64 = getAsm64<1, 2, HasModifiers>.ret;
+ let InsDPP = (ins FPInputMods:$src0_modifiers, Src0RC32:$src0,
+ FPInputMods:$src1_modifiers, Src1RC32:$src1,
+ VGPR_32:$src2, // stub argument
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsSDWA = (ins FPInputMods:$src0_fmodifiers, Src0RC32:$src0,
+ FPInputMods:$src1_fmodifiers, Src1RC32:$src1,
+ VGPR_32:$src2, // stub argument
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let Asm32 = getAsm32<1, 2, f32>.ret;
+ let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret;
+ let AsmDPP = getAsmDPP<1, 2, HasModifiers, f32>.ret;
+ let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, f32>.ret;
}
def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
+// This class is used only with VOPC instructions. Use $sdst for the output operand.
class SIInstAlias <string asm, Instruction inst, VOPProfile p> :
InstAlias <asm, (inst)>, PredicateControl {
@@ -1313,13 +1632,13 @@ class SIInstAlias <string asm, Instruction inst, VOPProfile p> :
!if (p.HasDst32,
!if (!eq(p.NumSrcArgs, 0),
// 1 dst, 0 src
- (inst p.DstRC:$dst),
+ (inst p.DstRC:$sdst),
!if (!eq(p.NumSrcArgs, 1),
// 1 dst, 1 src
- (inst p.DstRC:$dst, p.Src0RC32:$src0),
+ (inst p.DstRC:$sdst, p.Src0RC32:$src0),
!if (!eq(p.NumSrcArgs, 2),
// 1 dst, 2 src
- (inst p.DstRC:$dst, p.Src0RC32:$src0, p.Src1RC32:$src1),
+ (inst p.DstRC:$sdst, p.Src0RC32:$src0, p.Src1RC32:$src1),
// else - unreachable
(inst)))),
// else
@@ -1368,7 +1687,7 @@ class AtomicNoRet <string noRetOp, bit isRet> {
class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
VOP1Common <outs, ins, "", pattern>,
VOP <opName>,
- SIMCInstr <opName#"_e32", SISubtarget.NONE>,
+ SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>,
MnemonicAlias<opName#"_e32", opName> {
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -1379,14 +1698,18 @@ class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
class VOP1_Real_si <string opName, vop1 op, dag outs, dag ins, string asm> :
VOP1<op.SI, outs, ins, asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.SI> {
+ SIMCInstr <opName#"_e32", SIEncodingFamily.SI> {
let AssemblerPredicate = SIAssemblerPredicate;
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> :
VOP1<op.VI, outs, ins, asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.VI> {
+ SIMCInstr <opName#"_e32", SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern,
@@ -1399,6 +1722,49 @@ multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern,
}
+class VOP1_DPP <vop1 op, string opName, VOPProfile p> :
+ VOP1_DPPe <op.VI>,
+ VOP_DPP <p.OutsDPP, p.InsDPP, opName#p.AsmDPP, [], p.HasModifiers> {
+ let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
+ let DecoderNamespace = "DPP";
+ let DisableDecoder = DisableVIDecoder;
+ let src0_modifiers = !if(p.HasModifiers, ?, 0);
+ let src1_modifiers = 0;
+}
+
+class SDWADisableFields <VOPProfile p> {
+ bits<8> src0 = !if(!eq(p.NumSrcArgs, 0), 0, ?);
+ bits<3> src0_sel = !if(!eq(p.NumSrcArgs, 0), 6, ?);
+ bits<2> src0_fmodifiers = !if(!eq(p.NumSrcArgs, 0),
+ 0,
+ !if(p.HasModifiers, ?, 0));
+ bits<1> src0_imodifiers = !if(!eq(p.NumSrcArgs, 0),
+ 0,
+ !if(p.HasModifiers, 0, ?));
+ bits<3> src1_sel = !if(!eq(p.NumSrcArgs, 0), 6,
+ !if(!eq(p.NumSrcArgs, 1), 6,
+ ?));
+ bits<2> src1_fmodifiers = !if(!eq(p.NumSrcArgs, 0), 0,
+ !if(!eq(p.NumSrcArgs, 1), 0,
+ !if(p.HasModifiers, ?, 0)));
+ bits<1> src1_imodifiers = !if(!eq(p.NumSrcArgs, 0), 0,
+ !if(!eq(p.NumSrcArgs, 1), 0,
+ !if(p.HasModifiers, 0, ?)));
+ bits<3> dst_sel = !if(p.HasDst, ?, 6);
+ bits<2> dst_unused = !if(p.HasDst, ?, 2);
+ bits<1> clamp = !if(!eq(p.NumSrcArgs, 0), 0, ?);
+}
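Here `?` is TableGen's uninitialized value: a field left as `?` stays free to be filled from the operand list, while a literal constant pins the encoding bits for operands a given profile lacks. A sketch of the net effect for a one-source profile with float modifiers (reading sel value 6 as the full-dword selection is my interpretation of the SDWA encoding):

// NumSrcArgs = 1, HasModifiers = 1:
//   src1_sel        = 6  // no src1: pinned to the full-dword selection
//   src1_fmodifiers = 0  // no src1 modifiers to encode
//   src0_sel        = ?  // left to the $src0_sel operand
//   clamp           = ?  // left to the $clamp operand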
+
+class VOP1_SDWA <vop1 op, string opName, VOPProfile p> :
+ VOP1_SDWAe <op.VI>,
+ VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>,
+ SDWADisableFields <p> {
+ let AsmMatchConverter = "cvtSdwaVOP1";
+ let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
+ let DecoderNamespace = "SDWA";
+ let DisableDecoder = DisableVIDecoder;
+}
+
multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern,
string asm = opName#p.Asm32> {
@@ -1410,7 +1776,7 @@ multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern,
class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
VOP2Common <outs, ins, "", pattern>,
VOP <opName>,
- SIMCInstr<opName#"_e32", SISubtarget.NONE>,
+ SIMCInstr<opName#"_e32", SIEncodingFamily.NONE>,
MnemonicAlias<opName#"_e32", opName> {
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -1418,14 +1784,18 @@ class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
class VOP2_Real_si <string opName, vop2 op, dag outs, dag ins, string asm> :
VOP2 <op.SI, outs, ins, opName#asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.SI> {
+ SIMCInstr <opName#"_e32", SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> :
VOP2 <op.VI, outs, ins, opName#asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.VI> {
+ SIMCInstr <opName#"_e32", SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
multiclass VOP2SI_m <vop2 op, string opName, VOPProfile p, list<dag> pattern,
@@ -1449,6 +1819,26 @@ multiclass VOP2_m <vop2 op, string opName, VOPProfile p, list <dag> pattern,
}
+class VOP2_DPP <vop2 op, string opName, VOPProfile p> :
+ VOP2_DPPe <op.VI>,
+ VOP_DPP <p.OutsDPP, p.InsDPP, opName#p.AsmDPP, [], p.HasModifiers> {
+ let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
+ let DecoderNamespace = "DPP";
+ let DisableDecoder = DisableVIDecoder;
+ let src0_modifiers = !if(p.HasModifiers, ?, 0);
+ let src1_modifiers = !if(p.HasModifiers, ?, 0);
+}
+
+class VOP2_SDWA <vop2 op, string opName, VOPProfile p> :
+ VOP2_SDWAe <op.VI>,
+ VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>,
+ SDWADisableFields <p> {
+ let AsmMatchConverter = "cvtSdwaVOP2";
+ let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
+ let DecoderNamespace = "SDWA";
+ let DisableDecoder = DisableVIDecoder;
+}
+
class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> {
bits<2> src0_modifiers = !if(HasModifiers, ?, 0);
@@ -1471,10 +1861,11 @@ class VOP3DisableModFields <bit HasSrc0Mods,
bits<1> clamp = !if(HasOutputMods, ?, 0);
}
-class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
- VOP3Common <outs, ins, "", pattern>,
+class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, "", pattern, HasMods, VOP3Only>,
VOP <opName>,
- SIMCInstr<opName#"_e64", SISubtarget.NONE>,
+ SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
MnemonicAlias<opName#"_e64", opName> {
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -1483,44 +1874,96 @@ class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
field bit src0;
}
-class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> :
- VOP3Common <outs, ins, asm, []>,
+class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
VOP3e <op>,
- SIMCInstr<opName#"_e64", SISubtarget.SI> {
+ SIMCInstr<opName#"_e64", SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
-class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> :
- VOP3Common <outs, ins, asm, []>,
+class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
VOP3e_vi <op>,
- SIMCInstr <opName#"_e64", SISubtarget.VI> {
+ SIMCInstr <opName#"_e64", SIEncodingFamily.VI> {
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
+}
+
+class VOP3_C_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
+ VOP3ce <op>,
+ SIMCInstr<opName#"_e64", SIEncodingFamily.SI> {
+ let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
+
+class VOP3_C_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
+ VOP3ce_vi <op>,
+ SIMCInstr <opName#"_e64", SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
-class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> :
- VOP3Common <outs, ins, asm, []>,
+class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
VOP3be <op>,
- SIMCInstr<opName#"_e64", SISubtarget.SI> {
+ SIMCInstr<opName#"_e64", SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
-class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> :
- VOP3Common <outs, ins, asm, []>,
+class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
VOP3be_vi <op>,
- SIMCInstr <opName#"_e64", SISubtarget.VI> {
+ SIMCInstr <opName#"_e64", SIEncodingFamily.VI> {
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
+}
+
+class VOP3e_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
+ VOP3e <op>,
+ SIMCInstr<opName#"_e64", SIEncodingFamily.SI> {
+ let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
+
+class VOP3e_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
+ VOP3e_vi <op>,
+ SIMCInstr <opName#"_e64", SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName, int NumSrcArgs, bit HasMods = 1> {
+ string opName, int NumSrcArgs, bit HasMods = 1, bit VOP3Only = 0> {
def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>,
VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
!if(!eq(NumSrcArgs, 2), 0, 1),
HasMods>;
- def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>,
VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
!if(!eq(NumSrcArgs, 2), 0, 1),
HasMods>;
@@ -1529,21 +1972,21 @@ multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern,
multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, bit HasMods = 1> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>;
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<0, 0, HasMods>;
- def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<0, 0, HasMods>;
}
multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, bit HasMods = 1> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>;
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<0, 0, HasMods>;
// No VI instruction. This class is for SI only.
}
@@ -1552,13 +1995,13 @@ multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, string revOp,
bit HasMods = 1> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>,
VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<1, 0, HasMods>;
- def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<1, 0, HasMods>;
}
@@ -1566,10 +2009,10 @@ multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, string revOp,
bit HasMods = 1> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>,
VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<1, 0, HasMods>;
// No VI instruction. This class is for SI only.
@@ -1579,13 +2022,26 @@ multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm,
// instead of an implicit VCC as in the VOP2b format.
multiclass VOP3b_2_3_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, string revOp,
- bit HasMods = 1, bit useSrc2Input = 0> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+ bit HasMods = 1, bit useSrc2Input = 0, bit VOP3Only = 0> {
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods, VOP3Only>;
+
+ def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>,
+ VOP3DisableFields<1, useSrc2Input, HasMods>;
+
+ def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>,
+ VOP3DisableFields<1, useSrc2Input, HasMods>;
+}
- def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>,
+// Same as VOP3b_2_3_m but no 2nd destination (sdst), e.g. v_cndmask_b32.
+multiclass VOP3e_2_3_m <vop op, dag outs, dag ins, string asm,
+ list<dag> pattern, string opName, string revOp,
+ bit HasMods = 1, bit useSrc2Input = 0, bit VOP3Only = 0> {
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods, VOP3Only>;
+
+ def _si : VOP3e_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>,
VOP3DisableFields<1, useSrc2Input, HasMods>;
- def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>,
+ def _vi : VOP3e_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>,
VOP3DisableFields<1, useSrc2Input, HasMods>;
}
@@ -1594,19 +2050,19 @@ multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm,
bit HasMods, bit defExec,
string revOp, list<SchedReadWrite> sched> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>,
VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
let Defs = !if(defExec, [EXEC], []);
let SchedRW = sched;
}
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ def _si : VOP3_C_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<1, 0, HasMods> {
let Defs = !if(defExec, [EXEC], []);
let SchedRW = sched;
}
- def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ def _vi : VOP3_C_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<1, 0, HasMods> {
let Defs = !if(defExec, [EXEC], []);
let SchedRW = sched;
@@ -1618,19 +2074,23 @@ multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins,
string asm, list<dag> pattern = []> {
let isPseudo = 1, isCodeGenOnly = 1 in {
def "" : VOPAnyCommon <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE>;
+ SIMCInstr<opName, SIEncodingFamily.NONE>;
}
def _si : VOP2 <op.SI3{5-0}, outs, ins, asm, []>,
- SIMCInstr <opName, SISubtarget.SI> {
+ SIMCInstr <opName, SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
def _vi : VOP3Common <outs, ins, asm, []>,
VOP3e_vi <op.VI3>,
VOP3DisableFields <1, 0, 0>,
- SIMCInstr <opName, SISubtarget.VI> {
+ SIMCInstr <opName, SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
}
@@ -1641,15 +2101,19 @@ multiclass VOP1_Helper <vop1 op, string opName, VOPProfile p, list<dag> pat32,
defm _e64 : VOP3_1_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName,
p.HasModifiers>;
+
+ def _dpp : VOP1_DPP <op, opName, p>;
+
+ def _sdwa : VOP1_SDWA <op, opName, p>;
}
multiclass VOP1Inst <vop1 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag> : VOP1_Helper <
op, opName, P, [],
!if(P.HasModifiers,
- [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+ [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0))])
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0))])
>;
multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P,
@@ -1659,9 +2123,9 @@ multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P,
defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64,
!if(P.HasModifiers,
- [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+ [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0))]),
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]),
opName, P.HasModifiers>;
}
@@ -1672,6 +2136,10 @@ multiclass VOP2_Helper <vop2 op, string opName, VOPProfile p, list<dag> pat32,
defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName,
revOp, p.HasModifiers>;
+
+ def _dpp : VOP2_DPP <op, opName, p>;
+
+ def _sdwa : VOP2_SDWA <op, opName, p>;
}
multiclass VOP2Inst <vop2 op, string opName, VOPProfile P,
@@ -1679,11 +2147,11 @@ multiclass VOP2Inst <vop2 op, string opName, VOPProfile P,
string revOp = opName> : VOP2_Helper <
op, opName, P, [],
!if(P.HasModifiers,
- [(set P.DstVT:$dst,
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
revOp
>;
@@ -1695,14 +2163,41 @@ multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P,
defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64,
!if(P.HasModifiers,
- [(set P.DstVT:$dst,
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
opName, revOp, P.HasModifiers>;
}
+multiclass VOP2e_Helper <vop2 op, string opName, VOPProfile p,
+ list<dag> pat32, list<dag> pat64,
+ string revOp, bit useSGPRInput> {
+
+ let SchedRW = [Write32Bit] in {
+ let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in {
+ defm _e32 : VOP2_m <op, opName, p, pat32, revOp>;
+ }
+
+ defm _e64 : VOP3e_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64,
+ opName, revOp, p.HasModifiers, useSGPRInput>;
+ }
+}
+
+multiclass VOP2eInst <vop2 op, string opName, VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOp = opName> : VOP2e_Helper <
+ op, opName, P, [],
+ !if(P.HasModifiers,
+ [(set P.DstVT:$vdst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ revOp, !eq(P.NumSrcArgs, 3)
+>;
+
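+// A hypothetical instantiation (the profile name below is illustrative):
+//   defm V_FOO_B32 : VOP2eInst <vop2<0x00, 0x00>, "v_foo_b32", VOP2e_Profile>;
+// The _e32 form reads the condition from the implicit VCC, while the _e64
+// form (via VOP3e_2_3_m) takes it as an explicit SGPR operand.
+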
multiclass VOP2b_Helper <vop2 op, string opName, VOPProfile p,
list<dag> pat32, list<dag> pat64,
string revOp, bit useSGPRInput> {
@@ -1722,11 +2217,11 @@ multiclass VOP2bInst <vop2 op, string opName, VOPProfile P,
string revOp = opName> : VOP2b_Helper <
op, opName, P, [],
!if(P.HasModifiers,
- [(set P.DstVT:$dst,
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
revOp, !eq(P.NumSrcArgs, 3)
>;
@@ -1746,31 +2241,35 @@ multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P,
: VOP2_VI3_Helper <
op, opName, P, [],
!if(P.HasModifiers,
- [(set P.DstVT:$dst,
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
revOp
>;
-multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> {
+multiclass VOP2MADK <vop2 op, string opName, VOPProfile P, list<dag> pattern = []> {
- def "" : VOP2_Pseudo <VOP_MADK.Outs, VOP_MADK.Ins, pattern, opName>;
+ def "" : VOP2_Pseudo <P.Outs, P.Ins32, pattern, opName>;
let isCodeGenOnly = 0 in {
- def _si : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins,
- !strconcat(opName, VOP_MADK.Asm), []>,
- SIMCInstr <opName#"_e32", SISubtarget.SI>,
+ def _si : VOP2Common <P.Outs, P.Ins32,
+ !strconcat(opName, P.Asm32), []>,
+ SIMCInstr <opName#"_e32", SIEncodingFamily.SI>,
VOP2_MADKe <op.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
- def _vi : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins,
- !strconcat(opName, VOP_MADK.Asm), []>,
- SIMCInstr <opName#"_e32", SISubtarget.VI>,
+ def _vi : VOP2Common <P.Outs, P.Ins32,
+ !strconcat(opName, P.Asm32), []>,
+ SIMCInstr <opName#"_e32", SIEncodingFamily.VI>,
VOP2_MADKe <op.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
} // End isCodeGenOnly = 0
}
@@ -1778,37 +2277,55 @@ let isCodeGenOnly = 0 in {
class VOPC_Pseudo <dag ins, list<dag> pattern, string opName> :
VOPCCommon <ins, "", pattern>,
VOP <opName>,
- SIMCInstr<opName#"_e32", SISubtarget.NONE> {
+ SIMCInstr<opName#"_e32", SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
+class VOPC_SDWA <vopc op, string opName, bit DefExec, VOPProfile p> :
+ VOPC_SDWAe <op.VI>,
+ VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>,
+ SDWADisableFields <p> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let hasSideEffects = DefExec;
+ let AsmMatchConverter = "cvtSdwaVOPC";
+ let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
+ let DecoderNamespace = "SDWA";
+ let DisableDecoder = DisableVIDecoder;
+}
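+
+// SDWA (sub-dword addressing, VI and later) adds per-operand byte/word
+// select fields to the encoding; VOPC_Helper below instantiates this class
+// as the _sdwa variant of each comparison.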
+
multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern,
string opName, bit DefExec, VOPProfile p,
list<SchedReadWrite> sched,
string revOpName = "", string asm = opName#"_e32 "#op_asm,
string alias_asm = opName#" "#op_asm> {
- def "" : VOPC_Pseudo <ins, pattern, opName> {
+ def "" : VOPC_Pseudo <ins, pattern, opName>,
+ VOP2_REV<revOpName#"_e32", !eq(revOpName, opName)> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
let SchedRW = sched;
+ let isConvergent = DefExec;
}
let AssemblerPredicates = [isSICI] in {
def _si : VOPC<op.SI, ins, asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.SI> {
+ SIMCInstr <opName#"_e32", SIEncodingFamily.SI> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
- let hasSideEffects = DefExec;
+ let isConvergent = DefExec;
let SchedRW = sched;
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
} // End AssemblerPredicates = [isSICI]
let AssemblerPredicates = [isVI] in {
def _vi : VOPC<op.VI, ins, asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.VI> {
+ SIMCInstr <opName#"_e32", SIEncodingFamily.VI> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
- let hasSideEffects = DefExec;
+ let isConvergent = DefExec;
let SchedRW = sched;
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
} // End AssemblerPredicates = [isVI]
@@ -1819,10 +2336,13 @@ multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern,
multiclass VOPC_Helper <vopc op, string opName, list<dag> pat32,
list<dag> pat64, bit DefExec, string revOp,
VOPProfile p, list<SchedReadWrite> sched> {
- defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>;
+ defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched,
+ revOp>;
- defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64,
+ defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$sdst), p.Ins64, opName#p.Asm64, pat64,
opName, p.HasModifiers, DefExec, revOp, sched>;
+
+ def _sdwa : VOPC_SDWA <op, opName, DefExec, p>;
}
// Special case for class instructions which only have modifiers on
@@ -1832,9 +2352,14 @@ multiclass VOPC_Class_Helper <vopc op, string opName, list<dag> pat32,
VOPProfile p, list<SchedReadWrite> sched> {
defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>;
- defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64,
+ defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$sdst), p.Ins64, opName#p.Asm64, pat64,
opName, p.HasModifiers, DefExec, revOp, sched>,
VOP3DisableModFields<1, 0, 0>;
+
+ def _sdwa : VOPC_SDWA <op, opName, DefExec, p> {
+ let src1_fmodifiers = 0;
+ let src1_imodifiers = ?;
+ }
}
multiclass VOPCInst <vopc op, string opName,
@@ -1845,12 +2370,12 @@ multiclass VOPCInst <vopc op, string opName,
VOPC_Helper <
op, opName, [],
!if(P.HasModifiers,
- [(set i1:$dst,
+ [(set i1:$sdst,
(setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
cond))],
- [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]),
+ [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]),
DefExec, revOp, P, sched
>;
@@ -1859,9 +2384,9 @@ multiclass VOPCClassInst <vopc op, string opName, VOPProfile P,
list<SchedReadWrite> sched> : VOPC_Class_Helper <
op, opName, [],
!if(P.HasModifiers,
- [(set i1:$dst,
+ [(set i1:$sdst,
(AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))],
- [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]),
+ [(set i1:$sdst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]),
DefExec, opName, P, sched
>;
@@ -1897,10 +2422,6 @@ multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> :
multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> :
VOPCX <op, opName, VOPC_I1_I64_I64, COND_NULL, [Write64Bit], revOp>;
-multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm,
- list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m <
- op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods
->;
multiclass VOPC_CLASS_F32 <vopc op, string opName> :
VOPCClassInst <op, opName, VOPC_I1_F32_I32, 0, [Write32Bit]>;
@@ -1914,32 +2435,40 @@ multiclass VOPC_CLASS_F64 <vopc op, string opName> :
multiclass VOPCX_CLASS_F64 <vopc op, string opName> :
VOPCClassInst <op, opName, VOPC_I1_F64_I32, 1, [WriteDoubleAdd]>;
+
+multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm,
+ list<dag> pat, int NumSrcArgs, bit HasMods,
+ bit VOP3Only = 0> : VOP3_m <
+ op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods, VOP3Only
+>;
+
multiclass VOP3Inst <vop3 op, string opName, VOPProfile P,
- SDPatternOperator node = null_frag> : VOP3_Helper <
- op, opName, (outs P.DstRC.RegClass:$dst), P.Ins64, P.Asm64,
+ SDPatternOperator node = null_frag, bit VOP3Only = 0> :
+ VOP3_Helper <
+ op, opName, (outs P.DstRC.RegClass:$vdst), P.Ins64, P.Asm64,
!if(!eq(P.NumSrcArgs, 3),
!if(P.HasModifiers,
- [(set P.DstVT:$dst,
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
(P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1,
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1,
P.Src2VT:$src2))]),
!if(!eq(P.NumSrcArgs, 2),
!if(P.HasModifiers,
- [(set P.DstVT:$dst,
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))])
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))])
/* P.NumSrcArgs == 1 */,
!if(P.HasModifiers,
- [(set P.DstVT:$dst,
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))),
- P.NumSrcArgs, P.HasModifiers
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]))),
+ P.NumSrcArgs, P.HasModifiers, VOP3Only
>;
// Special case for v_div_fmas_{f32|f64}, since it seems to be the
@@ -1948,14 +2477,14 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName,
VOPProfile P,
SDPatternOperator node = null_frag> : VOP3_Helper <
op, opName,
- (outs P.DstRC.RegClass:$dst),
- (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0,
- InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1,
- InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2,
- ClampMod:$clamp,
+ (outs P.DstRC.RegClass:$vdst),
+ (ins FPInputMods:$src0_modifiers, P.Src0RC64:$src0,
+ FPInputMods:$src1_modifiers, P.Src1RC64:$src1,
+ FPInputMods:$src2_modifiers, P.Src2RC64:$src2,
+ clampmod:$clamp,
omod:$omod),
- "$dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod",
- [(set P.DstVT:$dst,
+ "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod",
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
@@ -1964,11 +2493,11 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName,
3, 1
>;
-multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = []> :
+multiclass VOP3bInst <vop op, string opName, VOPProfile P,
+                      list<dag> pattern = [], bit VOP3Only = 0> :
VOP3b_2_3_m <
op, P.Outs64, P.Ins64,
opName#" "#P.Asm64, pattern,
- opName, "", 1, 1
+ opName, "", 1, 1, VOP3Only
>;
class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat<
@@ -1987,7 +2516,7 @@ class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat<
class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
VINTRPCommon <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -1996,13 +2525,21 @@ class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins,
string asm> :
VINTRPCommon <outs, ins, asm, []>,
VINTRPe <op>,
- SIMCInstr<opName, SISubtarget.SI>;
+ SIMCInstr<opName, SIEncodingFamily.SI> {
+ let AssemblerPredicate = SIAssemblerPredicate;
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins,
string asm> :
VINTRPCommon <outs, ins, asm, []>,
VINTRPe_vi <op>,
- SIMCInstr<opName, SISubtarget.VI>;
+ SIMCInstr<opName, SIEncodingFamily.VI> {
+ let AssemblerPredicate = VIAssemblerPredicate;
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
+}
multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm,
list<dag> pattern = []> {
@@ -2019,7 +2556,7 @@ multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm,
class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
DS <outs, ins, "", pattern>,
- SIMCInstr <opName, SISubtarget.NONE> {
+ SIMCInstr <opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -2027,14 +2564,22 @@ class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
class DS_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
DS <outs, ins, asm, []>,
DSe <op>,
- SIMCInstr <opName, SISubtarget.SI> {
+ SIMCInstr <opName, SIEncodingFamily.SI> {
let isCodeGenOnly = 0;
+ let AssemblerPredicates = [isSICI];
+ let DecoderNamespace="SICI";
+ let DisableDecoder = DisableSIDecoder;
}
class DS_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
DS <outs, ins, asm, []>,
DSe_vi <op>,
- SIMCInstr <opName, SISubtarget.VI>;
+ SIMCInstr <opName, SIEncodingFamily.VI> {
+ let isCodeGenOnly = 0;
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace="VI";
+ let DisableDecoder = DisableVIDecoder;
+}
class DS_Off16_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
DS_Real_si <op,opName, outs, ins, asm> {
@@ -2043,7 +2588,6 @@ class DS_Off16_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm
bits<16> offset;
let offset0 = offset{7-0};
let offset1 = offset{15-8};
- let isCodeGenOnly = 0;
}
class DS_Off16_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
@@ -2055,9 +2599,24 @@ class DS_Off16_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm
let offset1 = offset{15-8};
}
+multiclass DS_1A_RET_ <dsop op, string opName, RegisterClass rc,
+ dag outs = (outs rc:$vdst),
+ dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds),
+ string asm = opName#" $vdst, $addr"#"$offset$gds"> {
+
+ def "" : DS_Pseudo <opName, outs, ins, []>;
+
+ let data0 = 0, data1 = 0 in {
+ def _si : DS_Off16_Real_si <op.SI, opName, outs, ins, asm>;
+ def _vi : DS_Off16_Real_vi <op.VI, opName, outs, ins, asm>;
+ }
+}
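+
+// A usage sketch (the opcode value and name are illustrative; dsop carries
+// per-generation encodings like the other <si, vi> opcode wrappers):
+//   defm DS_FOO_B32 : DS_1A_RET_ <dsop<0x36>, "ds_foo_b32", VGPR_32>;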
+
+// TODO: DS_1A_RET can be inherited from DS_1A_RET_, but it's not working
+// for some reason. In fact, we can remove this class if we use dsop everywhere.
multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc,
dag outs = (outs rc:$vdst),
- dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds),
+ dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds),
string asm = opName#" $vdst, $addr"#"$offset$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>;
@@ -2070,8 +2629,8 @@ multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc,
multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc,
dag outs = (outs rc:$vdst),
- dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1,
- gds01:$gds),
+ dag ins = (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1,
+ gds:$gds),
string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>;
@@ -2084,7 +2643,7 @@ multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc,
multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc,
dag outs = (outs),
- dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
+ dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
string asm = opName#" $addr, $data0"#"$offset$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>,
@@ -2096,11 +2655,25 @@ multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc,
}
}
-multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc,
+multiclass DS_1A_Off8_NORET <bits<8> op, string opName,
+ dag outs = (outs),
+ dag ins = (ins VGPR_32:$addr,
+ offset0:$offset0, offset1:$offset1, gds:$gds),
+ string asm = opName#" $addr $offset0"#"$offset1$gds"> {
+
+ def "" : DS_Pseudo <opName, outs, ins, []>;
+
+ let data0 = 0, data1 = 0, vdst = 0, AsmMatchConverter = "cvtDSOffset01" in {
+ def _si : DS_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
+ }
+}
+
+multiclass DS_1A2D_Off8_NORET <bits<8> op, string opName, RegisterClass rc,
dag outs = (outs),
dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
- ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds),
- string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> {
+ offset0:$offset0, offset1:$offset1, gds:$gds),
+ string asm = opName#" $addr, $data0, $data1$offset0$offset1$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>;
@@ -2113,7 +2686,7 @@ multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc,
multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc,
string noRetOp = "",
dag outs = (outs rc:$vdst),
- dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
+ dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> {
let hasPostISelHook = 1 in {
@@ -2127,6 +2700,23 @@ multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc,
}
}
+multiclass DS_1A1D_PERMUTE <bits<8> op, string opName, RegisterClass rc,
+ SDPatternOperator node = null_frag,
+ dag outs = (outs rc:$vdst),
+ dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset),
+ string asm = opName#" $vdst, $addr, $data0"#"$offset"> {
+
+ let mayLoad = 0, mayStore = 0, isConvergent = 1 in {
+ def "" : DS_Pseudo <opName, outs, ins,
+ [(set i32:$vdst,
+ (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))]>;
+
+ let data1 = 0, gds = 0 in {
+ def "_vi" : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+ }
+ }
+}
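+
+// An illustrative instantiation (VI-only, matching the single _vi real def;
+// the opcode/intrinsic pairing here is an assumption, not taken from this
+// patch):
+//   defm DS_PERMUTE_B32 : DS_1A1D_PERMUTE <0x3e, "ds_permute_b32", VGPR_32,
+//                                          int_amdgcn_ds_permute>;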
+
multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc,
string noRetOp = "", dag ins,
dag outs = (outs rc:$vdst),
@@ -2145,14 +2735,14 @@ multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc,
string noRetOp = "", RegisterClass src = rc> :
DS_1A2D_RET_m <op, asm, rc, noRetOp,
(ins VGPR_32:$addr, src:$data0, src:$data1,
- ds_offset:$offset, gds:$gds)
+ offset:$offset, gds:$gds)
>;
multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc,
string noRetOp = opName,
dag outs = (outs),
dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
- ds_offset:$offset, gds:$gds),
+ offset:$offset, gds:$gds),
string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>,
@@ -2166,7 +2756,7 @@ multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc,
multiclass DS_0A_RET <bits<8> op, string opName,
dag outs = (outs VGPR_32:$vdst),
- dag ins = (ins ds_offset:$offset, gds:$gds),
+ dag ins = (ins offset:$offset, gds:$gds),
string asm = opName#" $vdst"#"$offset"#"$gds"> {
let mayLoad = 1, mayStore = 1 in {
@@ -2181,7 +2771,7 @@ multiclass DS_0A_RET <bits<8> op, string opName,
multiclass DS_1A_RET_GDS <bits<8> op, string opName,
dag outs = (outs VGPR_32:$vdst),
- dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset),
+ dag ins = (ins VGPR_32:$addr, offset:$offset),
string asm = opName#" $vdst, $addr"#"$offset gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>;
@@ -2207,7 +2797,7 @@ multiclass DS_1A_GDS <bits<8> op, string opName,
multiclass DS_1A <bits<8> op, string opName,
dag outs = (outs),
- dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds),
+ dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds),
string asm = opName#" $addr"#"$offset"#"$gds"> {
let mayLoad = 1, mayStore = 1 in {
@@ -2226,7 +2816,7 @@ multiclass DS_1A <bits<8> op, string opName,
class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
MTBUF <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -2235,12 +2825,18 @@ class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins,
string asm> :
MTBUF <outs, ins, asm, []>,
MTBUFe <op>,
- SIMCInstr<opName, SISubtarget.SI>;
+ SIMCInstr<opName, SIEncodingFamily.SI> {
+ let DecoderNamespace="SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
class MTBUF_Real_vi <bits<4> op, string opName, dag outs, dag ins, string asm> :
MTBUF <outs, ins, asm, []>,
MTBUFe_vi <op>,
- SIMCInstr <opName, SISubtarget.VI>;
+ SIMCInstr <opName, SIEncodingFamily.VI> {
+ let DecoderNamespace="VI";
+ let DisableDecoder = DisableVIDecoder;
+}
multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm,
list<dag> pattern> {
@@ -2311,7 +2907,7 @@ class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
class MUBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
MUBUF <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -2329,16 +2925,22 @@ class MUBUF_Real_si <mubuf op, string opName, dag outs, dag ins,
string asm> :
MUBUF <outs, ins, asm, []>,
MUBUFe <op.SI>,
- SIMCInstr<opName, SISubtarget.SI> {
+ SIMCInstr<opName, SIEncodingFamily.SI> {
let lds = 0;
+ let AssemblerPredicate = SIAssemblerPredicate;
+ let DecoderNamespace="SICI";
+ let DisableDecoder = DisableSIDecoder;
}
class MUBUF_Real_vi <mubuf op, string opName, dag outs, dag ins,
string asm> :
MUBUF <outs, ins, asm, []>,
MUBUFe_vi <op.VI>,
- SIMCInstr<opName, SISubtarget.VI> {
+ SIMCInstr<opName, SIEncodingFamily.VI> {
let lds = 0;
+ let AssemblerPredicate = VIAssemblerPredicate;
+ let DecoderNamespace="VI";
+ let DisableDecoder = DisableVIDecoder;
}
multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm,
@@ -2399,38 +3001,82 @@ multiclass MUBUFAtomicAddr64_m <mubuf op, string opName, dag outs, dag ins,
// for VI appropriately.
}
+multiclass MUBUFAtomicOther_m <mubuf op, string opName, dag outs, dag ins,
+ string asm, list<dag> pattern, bit is_return> {
+
+ def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
+ AtomicNoRet<opName, is_return>;
+
+ let tfe = 0 in {
+ let addr64 = 0 in {
+ def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
+ }
+
+ def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>;
+ }
+}
+
multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
ValueType vt, SDPatternOperator atomic> {
- let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in {
+ let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in {
// No return variants
- let glc = 0 in {
+ let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in {
defm _ADDR64 : MUBUFAtomicAddr64_m <
op, name#"_addr64", (outs),
(ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc),
- name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0
+ SCSrc_32:$soffset, offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset$slc", [], 0
>;
defm _OFFSET : MUBUFAtomicOffset_m <
op, name#"_offset", (outs),
- (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset,
+ (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, offset:$offset,
slc:$slc),
- name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0
+ name#" $vdata, off, $srsrc, $soffset$offset$slc", [], 0
>;
+
+ let offen = 1, idxen = 0 in {
+ defm _OFFEN : MUBUFAtomicOther_m <
+ op, name#"_offen", (outs),
+ (ins rc:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+ offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset offen$offset$slc", [], 0
+ >;
+ }
+
+ let offen = 0, idxen = 1 in {
+ defm _IDXEN : MUBUFAtomicOther_m <
+ op, name#"_idxen", (outs),
+ (ins rc:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+ offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$slc", [], 0
+ >;
+ }
+
+ let offen = 1, idxen = 1 in {
+ defm _BOTHEN : MUBUFAtomicOther_m <
+ op, name#"_bothen", (outs),
+ (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+ offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$slc",
+ [], 0
+ >;
+ }
} // glc = 0
  // Variants that return values
let glc = 1, Constraints = "$vdata = $vdata_in",
+ AsmMatchConverter = "cvtMubufAtomicReturn",
DisableEncoding = "$vdata_in" in {
defm _RTN_ADDR64 : MUBUFAtomicAddr64_m <
op, name#"_rtn_addr64", (outs rc:$vdata),
(ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc),
- name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc",
+ SCSrc_32:$soffset, offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset glc$slc",
[(set vt:$vdata,
(atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$slc), vt:$vdata_in))], 1
@@ -2439,13 +3085,42 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
defm _RTN_OFFSET : MUBUFAtomicOffset_m <
op, name#"_rtn_offset", (outs rc:$vdata),
(ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset,
- mbuf_offset:$offset, slc:$slc),
- name#" $vdata, $srsrc, $soffset"#"$offset"#" glc$slc",
+ offset:$offset, slc:$slc),
+ name#" $vdata, off, $srsrc, $soffset$offset glc$slc",
[(set vt:$vdata,
(atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
i1:$slc), vt:$vdata_in))], 1
>;
+ let offen = 1, idxen = 0 in {
+ defm _RTN_OFFEN : MUBUFAtomicOther_m <
+ op, name#"_rtn_offen", (outs rc:$vdata),
+ (ins rc:$vdata_in, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+ offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset offen$offset glc$slc",
+ [], 1
+ >;
+ }
+
+ let offen = 0, idxen = 1 in {
+ defm _RTN_IDXEN : MUBUFAtomicOther_m <
+ op, name#"_rtn_idxen", (outs rc:$vdata),
+ (ins rc:$vdata_in, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+ offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset glc$slc",
+ [], 1
+ >;
+ }
+
+ let offen = 1, idxen = 1 in {
+ defm _RTN_BOTHEN : MUBUFAtomicOther_m <
+ op, name#"_rtn_bothen", (outs rc:$vdata),
+ (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+ offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset glc$slc",
+ [], 1
+ >;
+ }
} // glc = 1
} // mayStore = 1, mayLoad = 1, hasPostISelHook = 1
@@ -2461,8 +3136,8 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass,
let offen = 0, idxen = 0, vaddr = 0 in {
defm _OFFSET : MUBUF_m <op, name#"_offset", (outs regClass:$vdata),
(ins SReg_128:$srsrc, SCSrc_32:$soffset,
- mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
+ offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+ name#" $vdata, off, $srsrc, $soffset$offset$glc$slc$tfe",
[(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc,
i32:$soffset, i16:$offset,
i1:$glc, i1:$slc, i1:$tfe)))]>;
@@ -2471,33 +3146,32 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass,
let offen = 1, idxen = 0 in {
defm _OFFEN : MUBUF_m <op, name#"_offen", (outs regClass:$vdata),
(ins VGPR_32:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc,
+ SCSrc_32:$soffset, offset:$offset, glc:$glc, slc:$slc,
tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+ name#" $vdata, $vaddr, $srsrc, $soffset offen$offset$glc$slc$tfe", []>;
}
let offen = 0, idxen = 1 in {
defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs regClass:$vdata),
(ins VGPR_32:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc,
+ SCSrc_32:$soffset, offset:$offset, glc:$glc,
slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$glc$slc$tfe", []>;
}
let offen = 1, idxen = 1 in {
defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs regClass:$vdata),
(ins VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
- mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+ offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$glc$slc$tfe", []>;
}
let offen = 0, idxen = 0 in {
defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs regClass:$vdata),
(ins VReg_64:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, mbuf_offset:$offset,
+ SCSrc_32:$soffset, offset:$offset,
glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#
- "$glc"#"$slc"#"$tfe",
+ name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset$glc$slc$tfe",
[(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc,
i64:$vaddr, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc,
@@ -2509,18 +3183,11 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass,
multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass,
ValueType store_vt = i32, SDPatternOperator st = null_frag> {
let mayLoad = 0, mayStore = 1 in {
- defm : MUBUF_m <op, name, (outs),
- (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
- mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc,
- tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"#
- "$glc"#"$slc"#"$tfe", []>;
-
let offen = 0, idxen = 0, vaddr = 0 in {
defm _OFFSET : MUBUF_m <op, name#"_offset",(outs),
(ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset,
- mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
+ offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+ name#" $vdata, off, $srsrc, $soffset$offset$glc$slc$tfe",
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>;
} // offen = 0, idxen = 0, vaddr = 0
@@ -2528,35 +3195,35 @@ multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass,
let offen = 1, idxen = 0 in {
defm _OFFEN : MUBUF_m <op, name#"_offen", (outs),
(ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc,
+ SCSrc_32:$soffset, offset:$offset, glc:$glc,
slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#
- "$glc"#"$slc"#"$tfe", []>;
+ name#" $vdata, $vaddr, $srsrc, $soffset offen"#
+ "$offset$glc$slc$tfe", []>;
} // end offen = 1, idxen = 0
let offen = 0, idxen = 1 in {
defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs),
(ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc,
+ SCSrc_32:$soffset, offset:$offset, glc:$glc,
slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$glc$slc$tfe", []>;
}
let offen = 1, idxen = 1 in {
defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs),
(ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
- mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+ offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$glc$slc$tfe", []>;
}
let offen = 0, idxen = 0 in {
defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs),
(ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc,
SCSrc_32:$soffset,
- mbuf_offset:$offset, glc:$glc, slc:$slc,
+ offset:$offset, glc:$glc, slc:$slc,
tfe:$tfe),
name#" $vdata, $vaddr, $srsrc, $soffset addr64"#
- "$offset"#"$glc"#"$slc"#"$tfe",
+ "$offset$glc$slc$tfe",
[(st store_vt:$vdata,
(MUBUFAddr64 v4i32:$srsrc, i64:$vaddr,
i32:$soffset, i16:$offset,
@@ -2593,21 +3260,24 @@ class flat <bits<7> ci, bits<7> vi = ci> {
class FLAT_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
FLAT <0, outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
class FLAT_Real_ci <bits<7> op, string opName, dag outs, dag ins, string asm> :
FLAT <op, outs, ins, asm, []>,
- SIMCInstr<opName, SISubtarget.SI> {
+ SIMCInstr<opName, SIEncodingFamily.SI> {
let AssemblerPredicate = isCIOnly;
+ let DecoderNamespace="CI";
}
class FLAT_Real_vi <bits<7> op, string opName, dag outs, dag ins, string asm> :
FLAT <op, outs, ins, asm, []>,
- SIMCInstr<opName, SISubtarget.VI> {
+ SIMCInstr<opName, SIEncodingFamily.VI> {
let AssemblerPredicate = VIAssemblerPredicate;
+ let DecoderNamespace="VI";
+ let DisableDecoder = DisableVIDecoder;
}
multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm,
@@ -2623,8 +3293,8 @@ multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm,
multiclass FLAT_Load_Helper <flat op, string asm_name,
RegisterClass regClass,
dag outs = (outs regClass:$vdst),
- dag ins = (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe),
- string asm = asm_name#" $vdst, $addr"#"$glc"#"$slc"#"$tfe"> {
+ dag ins = (ins VReg_64:$addr, glc:$glc, slc:$slc, tfe:$tfe),
+ string asm = asm_name#" $vdst, $addr$glc$slc$tfe"> {
let data = 0, mayLoad = 1 in {
@@ -2639,9 +3309,9 @@ multiclass FLAT_Load_Helper <flat op, string asm_name,
multiclass FLAT_Store_Helper <flat op, string asm_name,
RegisterClass vdataClass,
dag outs = (outs),
- dag ins = (ins vdataClass:$data, VReg_64:$addr, glc_flat:$glc,
- slc_flat:$slc, tfe_flat:$tfe),
- string asm = asm_name#" $data, $addr"#"$glc"#"$slc"#"$tfe"> {
+ dag ins = (ins VReg_64:$addr, vdataClass:$data, glc:$glc,
+ slc:$slc, tfe:$tfe),
+ string asm = asm_name#" $addr, $data$glc$slc$tfe"> {
let mayLoad = 0, mayStore = 1, vdst = 0 in {
@@ -2654,32 +3324,36 @@ multiclass FLAT_Store_Helper <flat op, string asm_name,
}
multiclass FLAT_ATOMIC <flat op, string asm_name, RegisterClass vdst_rc,
+ ValueType vt, SDPatternOperator atomic = null_frag,
+ ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- dag outs_noret = (outs),
string asm_noret = asm_name#" $addr, $data"#"$slc"#"$tfe"> {
let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in {
- def "" : FLAT_Pseudo <NAME, outs_noret,
+ def "" : FLAT_Pseudo <NAME, (outs),
(ins VReg_64:$addr, data_rc:$data,
- slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), []>,
+ slc:$slc, tfe:$tfe), []>,
AtomicNoRet <NAME, 0>;
- def _ci : FLAT_Real_ci <op.CI, NAME, outs_noret,
+ def _ci : FLAT_Real_ci <op.CI, NAME, (outs),
(ins VReg_64:$addr, data_rc:$data,
- slc_flat_atomic:$slc, tfe_flat_atomic:$tfe),
+ slc:$slc, tfe:$tfe),
asm_noret>;
- def _vi : FLAT_Real_vi <op.VI, NAME, outs_noret,
+ def _vi : FLAT_Real_vi <op.VI, NAME, (outs),
(ins VReg_64:$addr, data_rc:$data,
- slc_flat_atomic:$slc, tfe_flat_atomic:$tfe),
+ slc:$slc, tfe:$tfe),
asm_noret>;
}
let glc = 1, hasPostISelHook = 1 in {
- defm _RTN : FLAT_AtomicRet_m <op, (outs vdst_rc:$vdst),
- (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc,
- tfe_flat_atomic:$tfe),
- asm_name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>;
+ defm _RTN : FLAT_AtomicRet_m <
+ op, (outs vdst_rc:$vdst),
+ (ins VReg_64:$addr, data_rc:$data, slc:$slc, tfe:$tfe),
+ asm_name#" $vdst, $addr, $data glc$slc$tfe",
+ [(set vt:$vdst,
+ (atomic (FLATAtomic i64:$addr, i1:$slc, i1:$tfe), data_vt:$data))]
+ >;
}
}
@@ -2688,27 +3362,39 @@ class MIMG_Mask <string op, int channels> {
int Channels = channels;
}
+class mimg <bits<7> si, bits<7> vi = si> {
+ field bits<7> SI = si;
+ field bits<7> VI = vi;
+}
+
+class MIMG_Helper <dag outs, dag ins, string asm,
+ string dns=""> : MIMG<outs, ins, asm,[]> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasPostISelHook = 1;
+ let DecoderNamespace = dns;
+ let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
+ let AsmMatchConverter = "cvtMIMG";
+}
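+// Only variants that pass a DecoderNamespace (dns) take part in disassembly;
+// the rest are isAsmParserOnly so the shared MIMG opcode is not entered into
+// the decoder tables more than once.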
+
class MIMG_NoSampler_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
- RegisterClass src_rc> : MIMG <
- op,
+ RegisterClass addr_rc,
+ string dns=""> : MIMG_Helper <
(outs dst_rc:$vdata),
- (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
- i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
- SReg_256:$srsrc),
- asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
- #" $tfe, $lwe, $slc, $vaddr, $srsrc",
- []> {
+ (ins addr_rc:$vaddr, SReg_256:$srsrc,
+ dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+ dns>, MIMGe<op> {
let ssamp = 0;
- let mayLoad = 1;
- let mayStore = 0;
- let hasPostISelHook = 1;
}
multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
int channels> {
- def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32>,
+ def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
+ !if(!eq(channels, 1), "AMDGPU", "")>,
MIMG_Mask<asm#"_V1", channels>;
def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
MIMG_Mask<asm#"_V2", channels>;
@@ -2723,27 +3409,116 @@ multiclass MIMG_NoSampler <bits<7> op, string asm> {
defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
}
+class MIMG_Store_Helper <bits<7> op, string asm,
+ RegisterClass data_rc,
+ RegisterClass addr_rc> : MIMG_Helper <
+ (outs),
+ (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+ dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ >, MIMGe<op> {
+ let ssamp = 0;
+ let mayLoad = 1; // TableGen requires this for matching with the intrinsics
+ let mayStore = 1;
+ let hasSideEffects = 1;
+ let hasPostISelHook = 0;
+}
+
+multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
+ RegisterClass data_rc,
+ int channels> {
+ def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32>,
+ MIMG_Mask<asm#"_V1", channels>;
+ def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>,
+ MIMG_Mask<asm#"_V2", channels>;
+ def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>,
+ MIMG_Mask<asm#"_V4", channels>;
+}
+
+multiclass MIMG_Store <bits<7> op, string asm> {
+ defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>;
+ defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 2>;
+ defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 3>;
+ defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 4>;
+}
+
+class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
+ RegisterClass addr_rc> : MIMG_Helper <
+ (outs data_rc:$vdst),
+ (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+ dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ > {
+ let mayStore = 1;
+ let hasSideEffects = 1;
+ let hasPostISelHook = 0;
+ let Constraints = "$vdst = $vdata";
+ let AsmMatchConverter = "cvtMIMGAtomic";
+}
+
+class MIMG_Atomic_Real_si<mimg op, string name, string asm,
+ RegisterClass data_rc, RegisterClass addr_rc> :
+ MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
+ SIMCInstr<name, SIEncodingFamily.SI>,
+ MIMGe<op.SI> {
+ let isCodeGenOnly = 0;
+ let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
+
+class MIMG_Atomic_Real_vi<mimg op, string name, string asm,
+ RegisterClass data_rc, RegisterClass addr_rc> :
+ MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
+ SIMCInstr<name, SIEncodingFamily.VI>,
+ MIMGe<op.VI> {
+ let isCodeGenOnly = 0;
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
+}
+
+multiclass MIMG_Atomic_Helper_m <mimg op, string name, string asm,
+ RegisterClass data_rc, RegisterClass addr_rc> {
+ let isPseudo = 1, isCodeGenOnly = 1 in {
+ def "" : MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
+ SIMCInstr<name, SIEncodingFamily.NONE>;
+ }
+
+ let ssamp = 0 in {
+ def _si : MIMG_Atomic_Real_si<op, name, asm, data_rc, addr_rc>;
+
+ def _vi : MIMG_Atomic_Real_vi<op, name, asm, data_rc, addr_rc>;
+ }
+}
+
+multiclass MIMG_Atomic <mimg op, string asm, RegisterClass data_rc = VGPR_32> {
+ defm _V1 : MIMG_Atomic_Helper_m <op, asm # "_V1", asm, data_rc, VGPR_32>;
+ defm _V2 : MIMG_Atomic_Helper_m <op, asm # "_V2", asm, data_rc, VReg_64>;
+ defm _V4 : MIMG_Atomic_Helper_m <op, asm # "_V4", asm, data_rc, VReg_128>;
+}
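+
+// Hypothetical usage (the opcode pair is illustrative; SI and VI place MIMG
+// atomics at different opcodes, hence the mimg<si, vi> wrapper):
+//   defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">;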
+
class MIMG_Sampler_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
- RegisterClass src_rc, int wqm> : MIMG <
- op,
+ RegisterClass src_rc,
+ int wqm,
+ string dns=""> : MIMG_Helper <
(outs dst_rc:$vdata),
- (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
- i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
- SReg_256:$srsrc, SReg_128:$ssamp),
- asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
- #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
- []> {
- let mayLoad = 1;
- let mayStore = 0;
- let hasPostISelHook = 1;
+ (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+ dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+ dns>, MIMGe<op> {
let WQM = wqm;
}
multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
int channels, int wqm> {
- def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm>,
+ def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm,
+ !if(!eq(channels, 1), "AMDGPU", "")>,
MIMG_Mask<asm#"_V1", channels>;
def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>,
MIMG_Mask<asm#"_V2", channels>;
@@ -2755,31 +3530,24 @@ multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
MIMG_Mask<asm#"_V16", channels>;
}
-multiclass MIMG_Sampler <bits<7> op, string asm> {
- defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 0>;
- defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 0>;
- defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 0>;
- defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 0>;
+multiclass MIMG_Sampler <bits<7> op, string asm, int wqm=0> {
+ defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, wqm>;
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, wqm>;
+ defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, wqm>;
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, wqm>;
}
-multiclass MIMG_Sampler_WQM <bits<7> op, string asm> {
- defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 1>;
- defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 1>;
- defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 1>;
- defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 1>;
-}
+multiclass MIMG_Sampler_WQM <bits<7> op, string asm> : MIMG_Sampler<op, asm, 1>;
class MIMG_Gather_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
RegisterClass src_rc, int wqm> : MIMG <
- op,
(outs dst_rc:$vdata),
- (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
- i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
- SReg_256:$srsrc, SReg_128:$ssamp),
- asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
- #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
- []> {
+ (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+ dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+ []>, MIMGe<op> {
let mayLoad = 1;
let mayStore = 0;
@@ -2789,10 +3557,12 @@ class MIMG_Gather_Helper <bits<7> op, string asm,
// 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
// (red,red,red,red) etc.) The ISA document doesn't mention
// this.
- // Therefore, disable all code which updates DMASK by setting these two:
- let MIMG = 0;
+ // Therefore, disable all code which updates DMASK by setting this:
+ let Gather4 = 1;
let hasPostISelHook = 0;
let WQM = wqm;
+
+ let isAsmParserOnly = 1; // TBD: fix it later
}
multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
@@ -2810,19 +3580,14 @@ multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
MIMG_Mask<asm#"_V16", channels>;
}
-multiclass MIMG_Gather <bits<7> op, string asm> {
- defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 0>;
- defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 0>;
- defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 0>;
- defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 0>;
+multiclass MIMG_Gather <bits<7> op, string asm, int wqm=0> {
+ defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, wqm>;
+ defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, wqm>;
+ defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, wqm>;
+ defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, wqm>;
}
-multiclass MIMG_Gather_WQM <bits<7> op, string asm> {
- defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 1>;
- defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 1>;
- defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 1>;
- defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 1>;
-}
+multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>;
//===----------------------------------------------------------------------===//
// Vector instruction mappings
@@ -2894,8 +3659,9 @@ def getMCOpcodeGen : InstrMapping {
let FilterClass = "SIMCInstr";
let RowFields = ["PseudoInstr"];
let ColFields = ["Subtarget"];
- let KeyCol = [!cast<string>(SISubtarget.NONE)];
- let ValueCols = [[!cast<string>(SISubtarget.SI)],[!cast<string>(SISubtarget.VI)]];
+ let KeyCol = [!cast<string>(SIEncodingFamily.NONE)];
+ let ValueCols = [[!cast<string>(SIEncodingFamily.SI)],
+ [!cast<string>(SIEncodingFamily.VI)]];
}
def getAddr64Inst : InstrMapping {
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 89692ab71f4d..6427db87cd6f 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -18,35 +18,17 @@ int P20 = 1;
}
def INTERP : InterpSlots;
-def InterpSlot : Operand<i32> {
- let PrintMethod = "printInterpSlot";
-}
-
-def SendMsgImm : Operand<i32> {
- let PrintMethod = "printSendMsg";
-}
-
def isGCN : Predicate<"Subtarget->getGeneration() "
- ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">,
+ ">= SISubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureGCN">;
def isSI : Predicate<"Subtarget->getGeneration() "
- "== AMDGPUSubtarget::SOUTHERN_ISLANDS">,
+ "== SISubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureSouthernIslands">;
def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
-def SWaitMatchClass : AsmOperandClass {
- let Name = "SWaitCnt";
- let RenderMethod = "addImmOperands";
- let ParserMethod = "parseSWaitCntOps";
-}
-
-def WAIT_FLAG : InstFlag<"printWaitFlag"> {
- let ParserMatchClass = SWaitMatchClass;
-}
-
let SubtargetPredicate = isGCN in {
//===----------------------------------------------------------------------===//
@@ -59,17 +41,17 @@ defm EXP : EXP_m;
// SMRD Instructions
//===----------------------------------------------------------------------===//
-// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
-// SMRD instructions, because the SGPR_32 register class does not include M0
+// We are using the SReg_32_XM0 and not the SReg_32 register class for 32-bit
+// SMRD instructions, because the SReg_32_XM0 register class does not include M0
// and writing to M0 from an SMRD instruction will hang the GPU.
-defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SGPR_32>;
+defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SReg_32_XM0>;
defm S_LOAD_DWORDX2 : SMRD_Helper <smrd<0x01>, "s_load_dwordx2", SReg_64, SReg_64>;
defm S_LOAD_DWORDX4 : SMRD_Helper <smrd<0x02>, "s_load_dwordx4", SReg_64, SReg_128>;
defm S_LOAD_DWORDX8 : SMRD_Helper <smrd<0x03>, "s_load_dwordx8", SReg_64, SReg_256>;
defm S_LOAD_DWORDX16 : SMRD_Helper <smrd<0x04>, "s_load_dwordx16", SReg_64, SReg_512>;
defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
- smrd<0x08>, "s_buffer_load_dword", SReg_128, SGPR_32
+ smrd<0x08>, "s_buffer_load_dword", SReg_128, SReg_32_XM0
>;
defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
@@ -88,7 +70,15 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512
>;
-//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>;
+let mayStore = ? in {
+// FIXME: mayStore = ? is a workaround for a TableGen bug: the mayStore
+// flags inferred for the instruction pattern and for the standalone Pat
+// differ, and each considers the other contradictory.
+
+defm S_MEMTIME : SMRD_Special <smrd<0x1e, 0x24>, "s_memtime",
+ (outs SReg_64:$sdst), ?, " $sdst", [(set i64:$sdst, (int_amdgcn_s_memtime))]
+>;
+}
defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 0x20>, "s_dcache_inv",
int_amdgcn_s_dcache_inv>;
@@ -101,7 +91,7 @@ let isMoveImm = 1 in {
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
defm S_MOV_B32 : SOP1_32 <sop1<0x03, 0x00>, "s_mov_b32", []>;
defm S_MOV_B64 : SOP1_64 <sop1<0x04, 0x01>, "s_mov_b64", []>;
- } // let isRematerializeable = 1
+ } // End isReMaterializable = 1
let Uses = [SCC] in {
defm S_CMOV_B32 : SOP1_32 <sop1<0x05, 0x02>, "s_cmov_b32", []>;
@@ -111,11 +101,11 @@ let isMoveImm = 1 in {
let Defs = [SCC] in {
defm S_NOT_B32 : SOP1_32 <sop1<0x07, 0x04>, "s_not_b32",
- [(set i32:$dst, (not i32:$src0))]
+ [(set i32:$sdst, (not i32:$src0))]
>;
defm S_NOT_B64 : SOP1_64 <sop1<0x08, 0x05>, "s_not_b64",
- [(set i64:$dst, (not i64:$src0))]
+ [(set i64:$sdst, (not i64:$src0))]
>;
defm S_WQM_B32 : SOP1_32 <sop1<0x09, 0x06>, "s_wqm_b32", []>;
defm S_WQM_B64 : SOP1_64 <sop1<0x0a, 0x07>, "s_wqm_b64", []>;
@@ -123,7 +113,7 @@ let Defs = [SCC] in {
defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32",
- [(set i32:$dst, (bitreverse i32:$src0))]
+ [(set i32:$sdst, (bitreverse i32:$src0))]
>;
defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>;
@@ -131,7 +121,7 @@ let Defs = [SCC] in {
defm S_BCNT0_I32_B32 : SOP1_32 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>;
defm S_BCNT0_I32_B64 : SOP1_32_64 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>;
defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32",
- [(set i32:$dst, (ctpop i32:$src0))]
+ [(set i32:$sdst, (ctpop i32:$src0))]
>;
defm S_BCNT1_I32_B64 : SOP1_32_64 <sop1<0x10, 0x0d>, "s_bcnt1_i32_b64", []>;
} // End Defs = [SCC]
@@ -139,34 +129,34 @@ let Defs = [SCC] in {
defm S_FF0_I32_B32 : SOP1_32 <sop1<0x11, 0x0e>, "s_ff0_i32_b32", []>;
defm S_FF0_I32_B64 : SOP1_32_64 <sop1<0x12, 0x0f>, "s_ff0_i32_b64", []>;
defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32",
- [(set i32:$dst, (cttz_zero_undef i32:$src0))]
+ [(set i32:$sdst, (cttz_zero_undef i32:$src0))]
>;
defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>;
defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32",
- [(set i32:$dst, (AMDGPUffbh_u32 i32:$src0))]
+ [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))]
>;
defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>;
defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32",
- [(set i32:$dst, (int_AMDGPU_flbit_i32 i32:$src0))]
+ [(set i32:$sdst, (int_AMDGPU_flbit_i32 i32:$src0))]
>;
defm S_FLBIT_I32_I64 : SOP1_32_64 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>;
defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8",
- [(set i32:$dst, (sext_inreg i32:$src0, i8))]
+ [(set i32:$sdst, (sext_inreg i32:$src0, i8))]
>;
defm S_SEXT_I32_I16 : SOP1_32 <sop1<0x1a, 0x17>, "s_sext_i32_i16",
- [(set i32:$dst, (sext_inreg i32:$src0, i16))]
+ [(set i32:$sdst, (sext_inreg i32:$src0, i16))]
>;
defm S_BITSET0_B32 : SOP1_32 <sop1<0x1b, 0x18>, "s_bitset0_b32", []>;
-defm S_BITSET0_B64 : SOP1_64 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>;
+defm S_BITSET0_B64 : SOP1_64_32 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>;
defm S_BITSET1_B32 : SOP1_32 <sop1<0x1d, 0x1a>, "s_bitset1_b32", []>;
-defm S_BITSET1_B64 : SOP1_64 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>;
+defm S_BITSET1_B64 : SOP1_64_32 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>;
defm S_GETPC_B64 : SOP1_64_0 <sop1<0x1f, 0x1c>, "s_getpc_b64", []>;
-defm S_SETPC_B64 : SOP1_64 <sop1<0x20, 0x1d>, "s_setpc_b64", []>;
+defm S_SETPC_B64 : SOP1_1 <sop1<0x20, 0x1d>, "s_setpc_b64", []>;
defm S_SWAPPC_B64 : SOP1_64 <sop1<0x21, 0x1e>, "s_swappc_b64", []>;
-defm S_RFE_B64 : SOP1_64 <sop1<0x22, 0x1f>, "s_rfe_b64", []>;
+defm S_RFE_B64 : SOP1_1 <sop1<0x22, 0x1f>, "s_rfe_b64", []>;
let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in {
@@ -206,36 +196,36 @@ let Defs = [SCC] in { // Carry out goes to SCC
let isCommutable = 1 in {
defm S_ADD_U32 : SOP2_32 <sop2<0x00>, "s_add_u32", []>;
defm S_ADD_I32 : SOP2_32 <sop2<0x02>, "s_add_i32",
- [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))]
+ [(set i32:$sdst, (add SSrc_32:$src0, SSrc_32:$src1))]
>;
} // End isCommutable = 1
defm S_SUB_U32 : SOP2_32 <sop2<0x01>, "s_sub_u32", []>;
defm S_SUB_I32 : SOP2_32 <sop2<0x03>, "s_sub_i32",
- [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))]
+ [(set i32:$sdst, (sub SSrc_32:$src0, SSrc_32:$src1))]
>;
let Uses = [SCC] in { // Carry in comes from SCC
let isCommutable = 1 in {
defm S_ADDC_U32 : SOP2_32 <sop2<0x04>, "s_addc_u32",
- [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+ [(set i32:$sdst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
} // End isCommutable = 1
defm S_SUBB_U32 : SOP2_32 <sop2<0x05>, "s_subb_u32",
- [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+ [(set i32:$sdst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
} // End Uses = [SCC]
defm S_MIN_I32 : SOP2_32 <sop2<0x06>, "s_min_i32",
- [(set i32:$dst, (smin i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (smin i32:$src0, i32:$src1))]
>;
defm S_MIN_U32 : SOP2_32 <sop2<0x07>, "s_min_u32",
- [(set i32:$dst, (umin i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (umin i32:$src0, i32:$src1))]
>;
defm S_MAX_I32 : SOP2_32 <sop2<0x08>, "s_max_i32",
- [(set i32:$dst, (smax i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (smax i32:$src0, i32:$src1))]
>;
defm S_MAX_U32 : SOP2_32 <sop2<0x09>, "s_max_u32",
- [(set i32:$dst, (umax i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (umax i32:$src0, i32:$src1))]
>;
} // End Defs = [SCC]
@@ -247,27 +237,27 @@ let Uses = [SCC] in {
let Defs = [SCC] in {
defm S_AND_B32 : SOP2_32 <sop2<0x0e, 0x0c>, "s_and_b32",
- [(set i32:$dst, (and i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (and i32:$src0, i32:$src1))]
>;
defm S_AND_B64 : SOP2_64 <sop2<0x0f, 0x0d>, "s_and_b64",
- [(set i64:$dst, (and i64:$src0, i64:$src1))]
+ [(set i64:$sdst, (and i64:$src0, i64:$src1))]
>;
defm S_OR_B32 : SOP2_32 <sop2<0x10, 0x0e>, "s_or_b32",
- [(set i32:$dst, (or i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (or i32:$src0, i32:$src1))]
>;
defm S_OR_B64 : SOP2_64 <sop2<0x11, 0x0f>, "s_or_b64",
- [(set i64:$dst, (or i64:$src0, i64:$src1))]
+ [(set i64:$sdst, (or i64:$src0, i64:$src1))]
>;
defm S_XOR_B32 : SOP2_32 <sop2<0x12, 0x10>, "s_xor_b32",
- [(set i32:$dst, (xor i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (xor i32:$src0, i32:$src1))]
>;
defm S_XOR_B64 : SOP2_64 <sop2<0x13, 0x11>, "s_xor_b64",
- [(set i64:$dst, (xor i64:$src0, i64:$src1))]
+ [(set i64:$sdst, (xor i64:$src0, i64:$src1))]
>;
defm S_ANDN2_B32 : SOP2_32 <sop2<0x14, 0x12>, "s_andn2_b32", []>;
defm S_ANDN2_B64 : SOP2_64 <sop2<0x15, 0x13>, "s_andn2_b64", []>;
@@ -286,30 +276,30 @@ let AddedComplexity = 1 in {
let Defs = [SCC] in {
defm S_LSHL_B32 : SOP2_32 <sop2<0x1e, 0x1c>, "s_lshl_b32",
- [(set i32:$dst, (shl i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (shl i32:$src0, i32:$src1))]
>;
defm S_LSHL_B64 : SOP2_64_32 <sop2<0x1f, 0x1d>, "s_lshl_b64",
- [(set i64:$dst, (shl i64:$src0, i32:$src1))]
+ [(set i64:$sdst, (shl i64:$src0, i32:$src1))]
>;
defm S_LSHR_B32 : SOP2_32 <sop2<0x20, 0x1e>, "s_lshr_b32",
- [(set i32:$dst, (srl i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (srl i32:$src0, i32:$src1))]
>;
defm S_LSHR_B64 : SOP2_64_32 <sop2<0x21, 0x1f>, "s_lshr_b64",
- [(set i64:$dst, (srl i64:$src0, i32:$src1))]
+ [(set i64:$sdst, (srl i64:$src0, i32:$src1))]
>;
defm S_ASHR_I32 : SOP2_32 <sop2<0x22, 0x20>, "s_ashr_i32",
- [(set i32:$dst, (sra i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (sra i32:$src0, i32:$src1))]
>;
defm S_ASHR_I64 : SOP2_64_32 <sop2<0x23, 0x21>, "s_ashr_i64",
- [(set i64:$dst, (sra i64:$src0, i32:$src1))]
+ [(set i64:$sdst, (sra i64:$src0, i32:$src1))]
>;
} // End Defs = [SCC]
defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32",
- [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
-defm S_BFM_B64 : SOP2_64 <sop2<0x25, 0x23>, "s_bfm_b64", []>;
+ [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
+defm S_BFM_B64 : SOP2_64_32_32 <sop2<0x25, 0x23>, "s_bfm_b64", []>;
defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32",
- [(set i32:$dst, (mul i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (mul i32:$src0, i32:$src1))]
>;
} // End AddedComplexity = 1
@@ -317,7 +307,7 @@ defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32",
let Defs = [SCC] in {
defm S_BFE_U32 : SOP2_32 <sop2<0x27, 0x25>, "s_bfe_u32", []>;
defm S_BFE_I32 : SOP2_32 <sop2<0x28, 0x26>, "s_bfe_i32", []>;
-defm S_BFE_U64 : SOP2_64 <sop2<0x29, 0x27>, "s_bfe_u64", []>;
+defm S_BFE_U64 : SOP2_64_32 <sop2<0x29, 0x27>, "s_bfe_u64", []>;
defm S_BFE_I64 : SOP2_64_32 <sop2<0x2a, 0x28>, "s_bfe_i64", []>;
} // End Defs = [SCC]
@@ -336,23 +326,23 @@ defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>;
// SOPC Instructions
//===----------------------------------------------------------------------===//
-def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">;
-def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">;
-def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">;
-def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">;
-def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">;
-def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">;
-def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">;
-def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">;
-def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">;
-def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">;
-def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">;
-def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">;
-////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>;
-////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>;
-////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>;
-////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>;
-//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>;
+def S_CMP_EQ_I32 : SOPC_CMP_32 <0x00000000, "s_cmp_eq_i32", COND_EQ>;
+def S_CMP_LG_I32 : SOPC_CMP_32 <0x00000001, "s_cmp_lg_i32", COND_NE>;
+def S_CMP_GT_I32 : SOPC_CMP_32 <0x00000002, "s_cmp_gt_i32", COND_SGT>;
+def S_CMP_GE_I32 : SOPC_CMP_32 <0x00000003, "s_cmp_ge_i32", COND_SGE>;
+def S_CMP_LT_I32 : SOPC_CMP_32 <0x00000004, "s_cmp_lt_i32", COND_SLT>;
+def S_CMP_LE_I32 : SOPC_CMP_32 <0x00000005, "s_cmp_le_i32", COND_SLE>;
+def S_CMP_EQ_U32 : SOPC_CMP_32 <0x00000006, "s_cmp_eq_u32", COND_EQ>;
+def S_CMP_LG_U32 : SOPC_CMP_32 <0x00000007, "s_cmp_lg_u32", COND_NE>;
+def S_CMP_GT_U32 : SOPC_CMP_32 <0x00000008, "s_cmp_gt_u32", COND_UGT>;
+def S_CMP_GE_U32 : SOPC_CMP_32 <0x00000009, "s_cmp_ge_u32", COND_UGE>;
+def S_CMP_LT_U32 : SOPC_CMP_32 <0x0000000a, "s_cmp_lt_u32", COND_ULT>;
+def S_CMP_LE_U32 : SOPC_CMP_32 <0x0000000b, "s_cmp_le_u32", COND_ULE>;
+def S_BITCMP0_B32 : SOPC_32 <0x0000000c, "s_bitcmp0_b32">;
+def S_BITCMP1_B32 : SOPC_32 <0x0000000d, "s_bitcmp1_b32">;
+def S_BITCMP0_B64 : SOPC_64_32 <0x0000000e, "s_bitcmp0_b64">;
+def S_BITCMP1_B64 : SOPC_64_32 <0x0000000f, "s_bitcmp1_b64">;
+def S_SETVSKIP : SOPC_32 <0x00000010, "s_setvskip">;
//===----------------------------------------------------------------------===//
// SOPK Instructions
@@ -408,16 +398,23 @@ defm S_CBRANCH_I_FORK : SOPK_m <
sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs),
(ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16"
>;
-defm S_GETREG_B32 : SOPK_32 <sopk<0x12, 0x11>, "s_getreg_b32", []>;
+
+let mayLoad = 1 in {
+defm S_GETREG_B32 : SOPK_m <
+ sopk<0x12, 0x11>, "s_getreg_b32", (outs SReg_32:$sdst),
+ (ins hwreg:$simm16), " $sdst, $simm16"
+>;
+}
+
defm S_SETREG_B32 : SOPK_m <
sopk<0x13, 0x12>, "s_setreg_b32", (outs),
- (ins SReg_32:$sdst, u16imm:$simm16), " $sdst, $simm16"
+ (ins SReg_32:$sdst, hwreg:$simm16), " $simm16, $sdst"
>;
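+
+// With the hwreg operand these accept assembler syntax along the lines of
+// (the particular field name here is illustrative, not exhaustive):
+//   s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 32)
+//   s_setreg_b32 hwreg(HW_REG_MODE, 0, 32), s0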
// FIXME: Not on SI?
//defm S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32", []>;
defm S_SETREG_IMM32_B32 : SOPK_IMM32 <
sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs),
- (ins i32imm:$imm, u16imm:$simm16), " $imm, $simm16"
+ (ins i32imm:$imm, hwreg:$simm16), " $simm16, $imm"
>;
//===----------------------------------------------------------------------===//
@@ -429,10 +426,11 @@ def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">;
let isTerminator = 1 in {
def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
- [(IL_retflag)]> {
+ [(AMDGPUendpgm)]> {
let simm16 = 0;
let isBarrier = 1;
let hasCtrlDep = 1;
+ let hasSideEffects = 1;
}
let isBranch = 1 in {
@@ -449,7 +447,8 @@ def S_CBRANCH_SCC0 : SOPP <
>;
def S_CBRANCH_SCC1 : SOPP <
0x00000005, (ins sopp_brtarget:$simm16),
- "s_cbranch_scc1 $simm16"
+ "s_cbranch_scc1 $simm16",
+ [(si_uniform_br_scc SCC, bb:$simm16)]
>;
} // End Uses = [SCC]
@@ -481,7 +480,7 @@ def S_CBRANCH_EXECNZ : SOPP <
let hasSideEffects = 1 in {
def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
- [(int_AMDGPU_barrier_local)]
+ [(int_amdgcn_s_barrier)]
> {
let SchedRW = [WriteBarrier];
let simm16 = 0;
@@ -490,18 +489,31 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
let isConvergent = 1;
}
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
-def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">;
-def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$sim16), "s_setprio $sim16">;
+
+// On SI the documentation says sleep for approximately 64 * low 2
+// bits, consistent with the reported maximum of 448. On VI the
+// maximum reported is 960 cycles, so 960 / 64 = 15; is the maximum
+// really 15 on VI?
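+// (Note: 448 / 64 = 7, which corresponds to the low 3 bits rather than
+// 2, so "low 2 bits" above may be a documentation typo.)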
+def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16),
+ "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> {
+ let hasSideEffects = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
let Uses = [EXEC, M0] in {
+ // FIXME: Should this be mayLoad+mayStore?
def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16",
[(AMDGPUsendmsg (i32 imm:$simm16))]
>;
} // End Uses = [EXEC, M0]
-def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">;
+def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16">;
def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">;
def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
let simm16 = 0;
@@ -770,8 +782,8 @@ defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>;
defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>;
let mayLoad = 0 in {
defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>;
-defm DS_WRITE2_B32 : DS_1A1D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>;
-defm DS_WRITE2ST64_B32 : DS_1A1D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>;
+defm DS_WRITE2_B32 : DS_1A2D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>;
+defm DS_WRITE2ST64_B32 : DS_1A2D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>;
}
defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>;
defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>;
@@ -811,7 +823,11 @@ defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmps
defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
-defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>;
+
+let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
+defm DS_SWIZZLE_B32 : DS_1A_RET_ <dsop<0x35, 0x3d>, "ds_swizzle_b32", VGPR_32>;
+}
+
let mayStore = 0 in {
defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>;
defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>;
@@ -839,8 +855,8 @@ defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>;
defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>;
let mayLoad = 0 in {
defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>;
-defm DS_WRITE2_B64 : DS_1A1D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>;
-defm DS_WRITE2ST64_B64 : DS_1A1D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>;
+defm DS_WRITE2_B64 : DS_1A2D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>;
+defm DS_WRITE2ST64_B64 : DS_1A2D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>;
}
defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>;
defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>;
@@ -886,7 +902,7 @@ defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">;
defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src2_b32">;
defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">;
defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">;
-defm DS_WRITE_SRC2_B32 : DS_1A <0x8c, "ds_write_src2_b32">;
+defm DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET <0x8d, "ds_write_src2_b32">;
defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">;
defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">;
@@ -903,7 +919,7 @@ defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">;
defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">;
defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">;
defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">;
-defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">;
+defm DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET <0xcd, "ds_write_src2_b64">;
defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">;
defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">;
@@ -937,16 +953,16 @@ defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper <
mubuf<0x07>, "buffer_store_format_xyzw", VReg_128
>;
defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <
- mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global
+ mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8
>;
defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <
- mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global
+ mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8
>;
defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <
- mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global
+ mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16
>;
defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
- mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global
+ mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16
>;
defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load
@@ -981,7 +997,9 @@ defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic <
mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
>;
-//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", []>;
+defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Atomic <
+ mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag
+>;
defm BUFFER_ATOMIC_ADD : MUBUF_Atomic <
mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global
>;
@@ -1010,30 +1028,61 @@ defm BUFFER_ATOMIC_OR : MUBUF_Atomic <
defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
>;
-//def BUFFER_ATOMIC_INC : MUBUF_ <mubuf<0x3c, 0x4b>, "buffer_atomic_inc", []>;
-//def BUFFER_ATOMIC_DEC : MUBUF_ <mubuf<0x3d, 0x4c>, "buffer_atomic_dec", []>;
-//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI
-//def BUFFER_ATOMIC_FMIN : MUBUF_ <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI
-//def BUFFER_ATOMIC_FMAX : MUBUF_ <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI
-//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", []>;
-//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", []>;
-//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <mubuf<0x52, 0x62>, "buffer_atomic_add_x2", []>;
-//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", []>;
-//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
-//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", []>;
-//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", []>;
-//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", []>;
-//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", []>;
-//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <mubuf<0x59, 0x68>, "buffer_atomic_and_x2", []>;
-//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", []>;
-//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", []>;
-//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", []>;
-//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", []>;
+defm BUFFER_ATOMIC_INC : MUBUF_Atomic <
+ mubuf<0x3c, 0x4b>, "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global
+>;
+defm BUFFER_ATOMIC_DEC : MUBUF_Atomic <
+ mubuf<0x3d, 0x4c>, "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global
+>;
+
+//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_Atomic <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMIN : MUBUF_Atomic <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMAX : MUBUF_Atomic <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI
+defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Atomic <
+ mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global
+>;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Atomic <
+ mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag
+>;
+defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Atomic <
+ mubuf<0x52, 0x62>, "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global
+>;
+defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Atomic <
+ mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global
+>;
+//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Atomic <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
+defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Atomic <
+ mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global
+>;
+defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Atomic <
+ mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global
+>;
+defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Atomic <
+ mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global
+>;
+defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Atomic <
+ mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global
+>;
+defm BUFFER_ATOMIC_AND_X2 : MUBUF_Atomic <
+ mubuf<0x59, 0x68>, "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global
+>;
+defm BUFFER_ATOMIC_OR_X2 : MUBUF_Atomic <
+ mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global
+>;
+defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Atomic <
+ mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global
+>;
+defm BUFFER_ATOMIC_INC_X2 : MUBUF_Atomic <
+ mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global
+>;
+defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Atomic <
+ mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
+>;
//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI
//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI
//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI
-let SubtargetPredicate = isSI in {
+let SubtargetPredicate = isSI, DisableVIDecoder = 1 in {
defm BUFFER_WBINVL1_SC : MUBUF_Invalidate <mubuf<0x70>, "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI
}
@@ -1062,28 +1111,28 @@ defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">;
//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>;
//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>;
//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>;
-//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>;
-//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>;
+defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">;
+defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">;
//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>;
//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>;
defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
-//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>;
-//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>;
-//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>;
-//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>;
-//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>;
-//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 0x00000014>;
-//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>;
-//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>;
-//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>;
-//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>;
-//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>;
-//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>;
-//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>;
-//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"image_atomic_dec", 0x0000001c>;
-//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>;
-//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>;
-//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>;
+defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">;
+defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>;
+defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">;
+defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">;
+//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI
+defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimg<0x14>, "image_atomic_smin">;
+defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimg<0x15>, "image_atomic_umin">;
+defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimg<0x16>, "image_atomic_smax">;
+defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimg<0x17>, "image_atomic_umax">;
+defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimg<0x18>, "image_atomic_and">;
+defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">;
+defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">;
+defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">;
+defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
+//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI
+//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
+//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">;
defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">;
defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">;
@@ -1171,10 +1220,12 @@ let Uses = [EXEC] in {
def V_READFIRSTLANE_B32 : VOP1 <
0x00000002,
(outs SReg_32:$vdst),
- (ins VGPR_32:$src0),
+ (ins VS_32:$src0),
"v_readfirstlane_b32 $vdst, $src0",
[]
->;
+> {
+ let isConvergent = 1;
+}
}
@@ -1234,7 +1285,7 @@ defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32",
VOP_F64_I32, uint_to_fp
>;
-} // let SchedRW = [WriteQuarterRate32]
+} // End SchedRW = [WriteQuarterRate32]
defm V_FRACT_F32 : VOP1Inst <vop1<0x20, 0x1b>, "v_fract_f32",
VOP_F32_F32, AMDGPUfract
@@ -1270,7 +1321,7 @@ defm V_RSQ_F32 : VOP1Inst <vop1<0x2e, 0x24>, "v_rsq_f32",
VOP_F32_F32, AMDGPUrsq
>;
-} //let SchedRW = [WriteQuarterRate32]
+} // End SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteDouble] in {
@@ -1281,7 +1332,7 @@ defm V_RSQ_F64 : VOP1Inst <vop1<0x31, 0x26>, "v_rsq_f64",
VOP_F64_F64, AMDGPUrsq
>;
-} // let SchedRW = [WriteDouble];
+} // End SchedRW = [WriteDouble]
defm V_SQRT_F32 : VOP1Inst <vop1<0x33, 0x27>, "v_sqrt_f32",
VOP_F32_F32, fsqrt
@@ -1312,34 +1363,34 @@ defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>;
defm V_FFBL_B32 : VOP1Inst <vop1<0x3a, 0x2e>, "v_ffbl_b32", VOP_I32_I32>;
defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>;
defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64",
- VOP_I32_F64
+ VOP_I32_F64, int_amdgcn_frexp_exp
>;
let SchedRW = [WriteDoubleAdd] in {
defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64",
- VOP_F64_F64
+ VOP_F64_F64, int_amdgcn_frexp_mant
>;
defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64",
- VOP_F64_F64
+ VOP_F64_F64, AMDGPUfract
>;
} // End SchedRW = [WriteDoubleAdd]
defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32",
- VOP_I32_F32
+ VOP_I32_F32, int_amdgcn_frexp_exp
>;
defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32",
- VOP_F32_F32
+ VOP_F32_F32, int_amdgcn_frexp_mant
>;
let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in {
-defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NONE>;
+defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
}
let Uses = [M0, EXEC] in {
-defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>;
-defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>;
-defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>;
+defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_NO_EXT<VOP_I32_I32>>;
+defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_NO_EXT<VOP_I32_I32>>;
+defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
} // End Uses = [M0, EXEC]
// These instructions only exist on SI and CI
@@ -1348,11 +1399,12 @@ let SubtargetPredicate = isSICI in {
let SchedRW = [WriteQuarterRate32] in {
defm V_MOV_FED_B32 : VOP1InstSI <vop1<0x9>, "v_mov_fed_b32", VOP_I32_I32>;
-defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>;
+defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32",
+ VOP_F32_F32, int_amdgcn_log_clamp>;
defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>;
defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>;
defm V_RSQ_CLAMP_F32 : VOP1InstSI <vop1<0x2c>, "v_rsq_clamp_f32",
- VOP_F32_F32, AMDGPUrsq_clamped
+ VOP_F32_F32, AMDGPUrsq_clamp
>;
defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32",
VOP_F32_F32, AMDGPUrsq_legacy
@@ -1364,7 +1416,7 @@ let SchedRW = [WriteDouble] in {
defm V_RCP_CLAMP_F64 : VOP1InstSI <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>;
defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64",
- VOP_F64_F64, AMDGPUrsq_clamped
+ VOP_F64_F64, AMDGPUrsq_clamp
>;
} // End SchedRW = [WriteDouble]
@@ -1394,11 +1446,11 @@ defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;
} // End OtherPredicates = [has32BankLDS]
-let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" in {
+let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly = 1 in {
defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
-} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst"
+} // End OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly = 1
let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in {
@@ -1426,15 +1478,9 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
// VOP2 Instructions
//===----------------------------------------------------------------------===//
-multiclass V_CNDMASK <vop2 op, string name> {
- defm _e32 : VOP2_m <op, name, VOP_CNDMASK, [], name>;
-
- defm _e64 : VOP3_m <
- op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64,
- name#!cast<string>(VOP_CNDMASK.Asm64), [], name, 3>;
-}
-
-defm V_CNDMASK_B32 : V_CNDMASK<vop2<0x0>, "v_cndmask_b32">;
+defm V_CNDMASK_B32 : VOP2eInst <vop2<0x0, 0x0>, "v_cndmask_b32",
+ VOP2e_I32_I32_I32_I1
+>;
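+// In the e32 encoding the trailing i1 select operand is implicitly VCC.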
let isCommutable = 1 in {
defm V_ADD_F32 : VOP2Inst <vop2<0x3, 0x1>, "v_add_f32",
@@ -1450,7 +1496,7 @@ defm V_SUBREV_F32 : VOP2Inst <vop2<0x5, 0x3>, "v_subrev_f32",
let isCommutable = 1 in {
defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7, 0x4>, "v_mul_legacy_f32",
- VOP_F32_F32_F32, int_AMDGPU_mul
+ VOP_F32_F32_F32
>;
defm V_MUL_F32 : VOP2Inst <vop2<0x8, 0x5>, "v_mul_f32",
@@ -1501,16 +1547,16 @@ defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>;
defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>;
defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>;
-let Constraints = "$dst = $src2", DisableEncoding="$src2",
+let Constraints = "$vdst = $src2", DisableEncoding="$src2",
isConvertibleToThreeAddress = 1 in {
defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_MAC>;
}
} // End isCommutable = 1
-defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">;
+defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32", VOP_MADMK>;
let isCommutable = 1 in {
-defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">;
+defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32", VOP_MADAK>;
} // End isCommutable = 1
let isCommutable = 1 in {
@@ -1540,11 +1586,14 @@ defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32",
} // End isCommutable = 1
+// These are special and do not read the exec mask.
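+// (The explicit Uses = []<Register> below overrides the implicit EXEC
+// use they would otherwise have.)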
+let isConvergent = 1, Uses = []<Register> in {
+
defm V_READLANE_B32 : VOP2SI_3VI_m <
vop3 <0x001, 0x289>,
"v_readlane_b32",
(outs SReg_32:$vdst),
- (ins VGPR_32:$src0, SCSrc_32:$src1),
+ (ins VS_32:$src0, SCSrc_32:$src1),
"v_readlane_b32 $vdst, $src0, $src1"
>;
@@ -1556,6 +1605,8 @@ defm V_WRITELANE_B32 : VOP2SI_3VI_m <
"v_writelane_b32 $vdst, $src0, $src1"
>;
+} // End isConvergent = 1, Uses = []<Register>
+
// These instructions only exist on SI and CI
let SubtargetPredicate = isSICI in {
@@ -1636,16 +1687,16 @@ defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143, 0x1c3>, "v_mad_u32_u24",
} // End isCommutable = 1
defm V_CUBEID_F32 : VOP3Inst <vop3<0x144, 0x1c4>, "v_cubeid_f32",
- VOP_F32_F32_F32_F32
+ VOP_F32_F32_F32_F32, int_amdgcn_cubeid
>;
defm V_CUBESC_F32 : VOP3Inst <vop3<0x145, 0x1c5>, "v_cubesc_f32",
- VOP_F32_F32_F32_F32
+ VOP_F32_F32_F32_F32, int_amdgcn_cubesc
>;
defm V_CUBETC_F32 : VOP3Inst <vop3<0x146, 0x1c6>, "v_cubetc_f32",
- VOP_F32_F32_F32_F32
+ VOP_F32_F32_F32_F32, int_amdgcn_cubetc
>;
defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147, 0x1c7>, "v_cubema_f32",
- VOP_F32_F32_F32_F32
+ VOP_F32_F32_F32_F32, int_amdgcn_cubema
>;
defm V_BFE_U32 : VOP3Inst <vop3<0x148, 0x1c8>, "v_bfe_u32",
@@ -1666,6 +1717,10 @@ defm V_FMA_F32 : VOP3Inst <vop3<0x14b, 0x1cb>, "v_fma_f32",
defm V_FMA_F64 : VOP3Inst <vop3<0x14c, 0x1cc>, "v_fma_f64",
VOP_F64_F64_F64_F64, fma
>;
+
+defm V_LERP_U8 : VOP3Inst <vop3<0x14d, 0x1cd>, "v_lerp_u8",
+ VOP_I32_I32_I32_I32, int_amdgcn_lerp
+>;
} // End isCommutable = 1
//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>;
@@ -1695,13 +1750,13 @@ defm V_MAX3_U32 : VOP3Inst <vop3<0x156, 0x1d5>, "v_max3_u32",
VOP_I32_I32_I32_I32, AMDGPUumax3
>;
defm V_MED3_F32 : VOP3Inst <vop3<0x157, 0x1d6>, "v_med3_f32",
- VOP_F32_F32_F32_F32
+ VOP_F32_F32_F32_F32, AMDGPUfmed3
>;
defm V_MED3_I32 : VOP3Inst <vop3<0x158, 0x1d7>, "v_med3_i32",
- VOP_I32_I32_I32_I32
+ VOP_I32_I32_I32_I32, AMDGPUsmed3
>;
defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32",
- VOP_I32_I32_I32_I32
+ VOP_I32_I32_I32_I32, AMDGPUumed3
>;
//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>;
@@ -1710,7 +1765,7 @@ defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32",
defm V_SAD_U32 : VOP3Inst <vop3<0x15d, 0x1dc>, "v_sad_u32",
VOP_I32_I32_I32_I32
>;
-////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>;
+//def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>;
defm V_DIV_FIXUP_F32 : VOP3Inst <
vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup
>;
@@ -1727,26 +1782,26 @@ let SchedRW = [WriteDoubleAdd] in {
let isCommutable = 1 in {
defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64",
- VOP_F64_F64_F64, fadd
+ VOP_F64_F64_F64, fadd, 1
>;
defm V_MUL_F64 : VOP3Inst <vop3<0x165, 0x281>, "v_mul_f64",
- VOP_F64_F64_F64, fmul
+ VOP_F64_F64_F64, fmul, 1
>;
defm V_MIN_F64 : VOP3Inst <vop3<0x166, 0x282>, "v_min_f64",
- VOP_F64_F64_F64, fminnum
+ VOP_F64_F64_F64, fminnum, 1
>;
defm V_MAX_F64 : VOP3Inst <vop3<0x167, 0x283>, "v_max_f64",
- VOP_F64_F64_F64, fmaxnum
+ VOP_F64_F64_F64, fmaxnum, 1
>;
-} // isCommutable = 1
+} // End isCommutable = 1
defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64",
- VOP_F64_F64_I32, AMDGPUldexp
+ VOP_F64_F64_I32, AMDGPUldexp, 1
>;
-} // let SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd]
let isCommutable = 1, SchedRW = [WriteQuarterRate32] in {
@@ -1754,30 +1809,33 @@ defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169, 0x285>, "v_mul_lo_u32",
VOP_I32_I32_I32
>;
defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a, 0x286>, "v_mul_hi_u32",
- VOP_I32_I32_I32
+ VOP_I32_I32_I32, mulhu
>;
+let DisableVIDecoder = 1 in { // Removed from VI as it is identical to V_MUL_LO_U32.
defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b, 0x285>, "v_mul_lo_i32",
VOP_I32_I32_I32
>;
+}
+
defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32",
- VOP_I32_I32_I32
+ VOP_I32_I32_I32, mulhs
>;
-} // isCommutable = 1, SchedRW = [WriteQuarterRate32]
+} // End isCommutable = 1, SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteFloatFMA, WriteSALU] in {
defm V_DIV_SCALE_F32 : VOP3bInst <vop3<0x16d, 0x1e0>, "v_div_scale_f32",
- VOP3b_F32_I1_F32_F32_F32
+ VOP3b_F32_I1_F32_F32_F32, [], 1
>;
}
let SchedRW = [WriteDouble, WriteSALU] in {
// Double precision division pre-scale.
defm V_DIV_SCALE_F64 : VOP3bInst <vop3<0x16e, 0x1e1>, "v_div_scale_f64",
- VOP3b_F64_I1_F64_F64_F64
+ VOP3b_F64_I1_F64_F64_F64, [], 1
>;
-} // let SchedRW = [WriteDouble]
+} // End SchedRW = [WriteDouble]
let isCommutable = 1, Uses = [VCC, EXEC] in {
@@ -1814,7 +1872,7 @@ defm V_TRIG_PREOP_F64 : VOP3Inst <
vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
>;
-} // let SchedRW = [WriteDouble]
+} // End SchedRW = [WriteDouble]
// These instructions only exist on SI and CI
let SubtargetPredicate = isSICI in {
@@ -1828,7 +1886,7 @@ defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32",
} // End SubtargetPredicate = isSICI
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isVI, DisableSIDecoder = 1 in {
defm V_LSHLREV_B64 : VOP3Inst <vop3<0, 0x28f>, "v_lshlrev_b64",
VOP_I64_I32_I64
@@ -1845,113 +1903,145 @@ defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64",
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
-let isCodeGenOnly = 1, isPseudo = 1 in {
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
// For use in patterns
-def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst),
- (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", []
->;
+def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
+ (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", []> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
// pass to enable folding of inline immediates.
-def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>;
-} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0
-
-let hasSideEffects = 1, SALU = 1 in {
-def SGPR_USE : InstSI <(outs),(ins), "", []>;
+def V_MOV_B64_PSEUDO : PseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_64:$src0)> {
+ let VALU = 1;
}
+} // End hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
+
+let usesCustomInserter = 1, SALU = 1 in {
+def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
+ [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
+} // End usesCustomInserter = 1, SALU = 1
// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
-let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
-let Uses = [EXEC], Defs = [EXEC] in {
+let hasSideEffects = 1 in {
+
+// Dummy terminator instruction to use after control flow instructions
+// have been replaced with exec mask operations.
+def SI_MASK_BRANCH : PseudoInstSI <
+ (outs), (ins brtarget:$target, SReg_64:$dst)> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let SALU = 1;
+}
+
+let Uses = [EXEC], Defs = [EXEC, SCC] in {
let isBranch = 1, isTerminator = 1 in {
-def SI_IF: InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$vcc, brtarget:$target),
- "",
- [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))]
->;
+def SI_IF: PseudoInstSI <
+ (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
+ [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))]> {
+ let Constraints = "";
+}
-def SI_ELSE : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src, brtarget:$target),
- "",
- [(set i64:$dst, (int_SI_else i64:$src, bb:$target))]
-> {
+def SI_ELSE : PseudoInstSI <
+ (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target),
+ [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]> {
let Constraints = "$src = $dst";
}
-def SI_LOOP : InstSI <
- (outs),
- (ins SReg_64:$saved, brtarget:$target),
- "si_loop $saved, $target",
- [(int_SI_loop i64:$saved, bb:$target)]
+def SI_LOOP : PseudoInstSI <
+ (outs), (ins SReg_64:$saved, brtarget:$target),
+ [(int_amdgcn_loop i64:$saved, bb:$target)]
>;
-} // end isBranch = 1, isTerminator = 1
+} // End isBranch = 1, isTerminator = 1
-def SI_BREAK : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src),
- "si_else $dst, $src",
- [(set i64:$dst, (int_SI_break i64:$src))]
+
+def SI_BREAK : PseudoInstSI <
+ (outs SReg_64:$dst), (ins SReg_64:$src),
+ [(set i64:$dst, (int_amdgcn_break i64:$src))]
>;
-def SI_IF_BREAK : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$vcc, SReg_64:$src),
- "si_if_break $dst, $vcc, $src",
- [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))]
+def SI_IF_BREAK : PseudoInstSI <
+ (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
+ [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]
>;
-def SI_ELSE_BREAK : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src0, SReg_64:$src1),
- "si_else_break $dst, $src0, $src1",
- [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))]
+def SI_ELSE_BREAK : PseudoInstSI <
+ (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
+ [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]
>;
-def SI_END_CF : InstSI <
- (outs),
- (ins SReg_64:$saved),
- "si_end_cf $saved",
- [(int_SI_end_cf i64:$saved)]
+def SI_END_CF : PseudoInstSI <
+ (outs), (ins SReg_64:$saved),
+ [(int_amdgcn_end_cf i64:$saved)]
>;
-} // End Uses = [EXEC], Defs = [EXEC]
+} // End Uses = [EXEC], Defs = [EXEC, SCC]
let Uses = [EXEC], Defs = [EXEC,VCC] in {
-def SI_KILL : InstSI <
- (outs),
- (ins VSrc_32:$src),
- "si_kill $src",
- [(int_AMDGPU_kill f32:$src)]
->;
+def SI_KILL : PseudoInstSI <
+ (outs), (ins VSrc_32:$src),
+ [(int_AMDGPU_kill f32:$src)]> {
+ let isConvergent = 1;
+ let usesCustomInserter = 1;
+}
+
+def SI_KILL_TERMINATOR : PseudoInstSI <
+ (outs), (ins VSrc_32:$src)> {
+ let isTerminator = 1;
+}
+
} // End Uses = [EXEC], Defs = [EXEC,VCC]
-} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
+} // End hasSideEffects = 1
-let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
+def SI_PS_LIVE : PseudoInstSI <
+ (outs SReg_64:$dst), (ins),
+ [(set i1:$dst, (int_amdgcn_ps_live))]> {
+ let SALU = 1;
+}
-class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
- (outs VGPR_32:$dst, SReg_64:$temp),
- (ins rc:$src, VSrc_32:$idx, i32imm:$off),
- "si_indirect_src $dst, $temp, $src, $idx, $off",
- []
->;
+// Used as an isel pseudo to directly emit initialization with an
+// s_mov_b32 rather than a copy of another initialized
+// register. MachineCSE skips copies, and we don't want to have to
+// fold operands before it runs.
+def SI_INIT_M0 : PseudoInstSI <(outs), (ins SSrc_32:$src)> {
+ let Defs = [M0];
+ let usesCustomInserter = 1;
+ let isAsCheapAsAMove = 1;
+ let SALU = 1;
+ let isReMaterializable = 1;
+}
-class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
- (outs rc:$dst, SReg_64:$temp),
- (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val),
- "si_indirect_dst $dst, $temp, $src, $idx, $off, $val",
- []
-> {
- let Constraints = "$src = $dst";
+def SI_RETURN : PseudoInstSI <
+ (outs), (ins variable_ops), [(AMDGPUreturn)]> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+ let hasSideEffects = 1;
+ let SALU = 1;
+ let hasNoSchedulingInfo = 1;
+}
+
+let Uses = [EXEC], Defs = [EXEC, VCC, M0],
+ UseNamedOperandTable = 1 in {
+
+class SI_INDIRECT_SRC<RegisterClass rc> : PseudoInstSI <
+ (outs VGPR_32:$vdst, SReg_64:$sdst),
+ (ins rc:$src, VS_32:$idx, i32imm:$offset)>;
+
+class SI_INDIRECT_DST<RegisterClass rc> : PseudoInstSI <
+ (outs rc:$vdst, SReg_64:$sdst),
+ (ins unknown:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
+ let Constraints = "$src = $vdst";
}
// TODO: We can support indirect SGPR access.
@@ -1967,25 +2057,20 @@ def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
-} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0]
+} // End Uses = [EXEC], Defs = [EXEC, VCC, M0], UseNamedOperandTable = 1
multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
-
let UseNamedOperandTable = 1, Uses = [EXEC] in {
- def _SAVE : InstSI <
+ def _SAVE : PseudoInstSI <
(outs),
- (ins sgpr_class:$src, i32imm:$frame_idx),
- "", []
- > {
+ (ins sgpr_class:$src, i32imm:$frame_idx)> {
let mayStore = 1;
let mayLoad = 0;
}
- def _RESTORE : InstSI <
+ def _RESTORE : PseudoInstSI <
(outs sgpr_class:$dst),
- (ins i32imm:$frame_idx),
- "", []
- > {
+ (ins i32imm:$frame_idx)> {
let mayStore = 0;
let mayLoad = 1;
}
@@ -1993,9 +2078,9 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
}
// It's unclear whether you can use M0 as the output of v_readlane_b32
-// instructions, so use SGPR_32 register class for spills to prevent
+// instructions, so use SReg_32_XM0 register class for spills to prevent
// this from happening.
-defm SI_SPILL_S32 : SI_SPILL_SGPR <SGPR_32>;
+defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32_XM0>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
@@ -2003,21 +2088,18 @@ defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in {
- def _SAVE : InstSI <
+ def _SAVE : PseudoInstSI <
(outs),
(ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
- SReg_32:$scratch_offset),
- "", []
- > {
+ SReg_32:$scratch_offset, i32imm:$offset)> {
let mayStore = 1;
let mayLoad = 0;
}
- def _RESTORE : InstSI <
+ def _RESTORE : PseudoInstSI <
(outs vgpr_class:$dst),
- (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
- "", []
- > {
+ (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset,
+ i32imm:$offset)> {
let mayStore = 0;
let mayLoad = 1;
}
@@ -2033,29 +2115,19 @@ defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
let Defs = [SCC] in {
-def SI_CONSTDATA_PTR : InstSI <
+def SI_PC_ADD_REL_OFFSET : PseudoInstSI <
(outs SReg_64:$dst),
- (ins const_ga:$ptr),
- "", [(set SReg_64:$dst, (i64 (SIconstdata_ptr (tglobaladdr:$ptr))))]
-> {
+ (ins si_ga:$ptr),
+ [(set SReg_64:$dst, (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr))))]> {
let SALU = 1;
}
} // End Defs = [SCC]
-} // end IsCodeGenOnly, isPseudo
-
-} // end SubtargetPredicate = isGCN
+} // End SubtargetPredicate = isGCN
let Predicates = [isGCN] in {
-def : Pat<
- (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2),
- (V_CNDMASK_B32_e64 $src2, $src1,
- (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0,
- DSTCLAMP.NONE, DSTOMOD.NONE))
->;
-
def : Pat <
(int_AMDGPU_kilp),
(SI_KILL 0xbf800000)
@@ -2067,7 +2139,6 @@ def : Pat<
(BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0)
>;
-/* int_SI_export */
def : Pat <
(int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
f32:$src0, f32:$src1, f32:$src2, f32:$src3),
@@ -2076,6 +2147,217 @@ def : Pat <
>;
//===----------------------------------------------------------------------===//
+// buffer_load/store_format patterns
+//===----------------------------------------------------------------------===//
+
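+// MUBUFIntrinsicOffset and MUBUFIntrinsicVOffset are complex patterns
+// (defined with the other MUBUF helpers) that split the intrinsic's
+// offset operand into an SGPR soffset plus a 16-bit immediate, with or
+// without a VGPR voffset.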
+multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : Pat<
+ (vt (name v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF>(opcode # _BOTHEN)
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+}
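+
+// Together the four patterns cover the MUBUF addressing modes: OFFSET
+// (no index, no voffset), IDXEN (index only), OFFEN (voffset only), and
+// BOTHEN (index and voffset packed into a 64-bit register pair). The
+// store and atomic patterns below follow the same scheme.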
+
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, f32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
+
+multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF>(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF>(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF>(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF>(opcode # _BOTHEN)
+ $vdata,
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+}
+
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
+
+//===----------------------------------------------------------------------===//
+// buffer_atomic patterns
+//===----------------------------------------------------------------------===//
+multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (!cast<MUBUF>(opcode # _RTN_OFFSET) $vdata_in, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $slc))
+ >;
+
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (!cast<MUBUF>(opcode # _RTN_IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $slc))
+ >;
+
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (!cast<MUBUF>(opcode # _RTN_OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $slc))
+ >;
+
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (!cast<MUBUF>(opcode # _RTN_BOTHEN)
+ $vdata_in,
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc))
+ >;
+}
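+
+// Only the _RTN_ (return) variants are selected here, since the atomic
+// intrinsics always produce the pre-op value.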
+
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_swap, "BUFFER_ATOMIC_SWAP">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_add, "BUFFER_ATOMIC_ADD">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_sub, "BUFFER_ATOMIC_SUB">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smin, "BUFFER_ATOMIC_SMIN">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umin, "BUFFER_ATOMIC_UMIN">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smax, "BUFFER_ATOMIC_SMAX">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umax, "BUFFER_ATOMIC_UMAX">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_and, "BUFFER_ATOMIC_AND">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_or, "BUFFER_ATOMIC_OR">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_xor, "BUFFER_ATOMIC_XOR">;
+
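+// buffer_atomic_cmpswap takes {data, cmp} packed into a 64-bit register
+// pair; the original (pre-swap) data comes back in sub0, which is why
+// these patterns wrap the instruction in EXTRACT_SUBREG.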
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_IDXEN
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_OFFEN
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_BOTHEN
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+
+//===----------------------------------------------------------------------===//
+// S_GETREG_B32 Intrinsic Pattern.
+//===----------------------------------------------------------------------===//
+def : Pat <
+ (int_amdgcn_s_getreg imm:$simm16),
+ (S_GETREG_B32 (as_i16imm $simm16))
+>;
+
+//===----------------------------------------------------------------------===//
+// DS_SWIZZLE Intrinsic Pattern.
+//===----------------------------------------------------------------------===//
+def : Pat <
+ (int_amdgcn_ds_swizzle i32:$src, imm:$offset16),
+ (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0))
+>;
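+// The trailing (i1 0) operand is the gds bit, so the swizzle operates
+// on LDS.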
+
+//===----------------------------------------------------------------------===//
// SMRD Patterns
//===----------------------------------------------------------------------===//
@@ -2109,7 +2391,6 @@ let AddedComplexity = 100 in {
defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX8", v32i8>;
defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
@@ -2143,7 +2424,7 @@ def : Pat <
def : Pat <
(i64 (ctpop i64:$src)),
(i64 (REG_SEQUENCE SReg_64,
- (S_BCNT1_I32_B64 $src), sub0,
+ (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
(S_MOV_B32 0), sub1))
>;
@@ -2168,8 +2449,8 @@ def : Pat <
//===----------------------------------------------------------------------===//
def : Pat <
- (int_AMDGPU_barrier_global),
- (S_BARRIER)
+ (int_amdgcn_s_waitcnt i32:$simm16),
+ (S_WAITCNT (as_i16imm $simm16))
>;
//===----------------------------------------------------------------------===//
@@ -2184,7 +2465,22 @@ let Predicates = [UnsafeFPMath] in {
def : RsqPat<V_RSQ_F32_e32, f32>;
def : RsqPat<V_RSQ_F64_e32, f64>;
-}
+
+// Convert (x - floor(x)) to fract(x)
+def : Pat <
+ (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
+ (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
+ (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+// Convert (x + (-floor(x))) to fract(x)
+def : Pat <
+ (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
+ (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
+ (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+} // End Predicates = [UnsafeFPMath]
//===----------------------------------------------------------------------===//
// VOP2 Patterns
@@ -2217,9 +2513,9 @@ def : Pat <
class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
(name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm,
i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
- (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da),
- (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc),
- $addr, $rsrc, $sampler)
+ (opcode $addr, $rsrc, $sampler,
+ (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
>;
multiclass SampleRawPatterns<SDPatternOperator name, string opcode> {
@@ -2232,11 +2528,11 @@ multiclass SampleRawPatterns<SDPatternOperator name, string opcode> {
// Image only
class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm,
- i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
- (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da),
- (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc),
- $addr, $rsrc)
+ (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm,
+ imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe),
+ (opcode $addr, $rsrc,
+ (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
>;
multiclass ImagePatterns<SDPatternOperator name, string opcode> {
@@ -2245,6 +2541,54 @@ multiclass ImagePatterns<SDPatternOperator name, string opcode> {
def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
}
+class ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$r128, imm:$da, imm:$glc,
+ imm:$slc),
+ (opcode $addr, $rsrc,
+ (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $r128), 0, 0, (as_i1imm $da))
+>;
+
+multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> {
+ def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
+ def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
+ def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
+}
+
+class ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+ (name v4f32:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, imm:$r128, imm:$da,
+ imm:$glc, imm:$slc),
+ (opcode $data, $addr, $rsrc,
+ (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $r128), 0, 0, (as_i1imm $da))
+>;
+
+multiclass ImageStorePatterns<SDPatternOperator name, string opcode> {
+ def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
+ def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
+ def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
+}
+
+class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+ (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc),
+ (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da))
+>;
+
+multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> {
+ def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V1), i32>;
+ def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V2), v2i32>;
+ def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>;
+}
+
+class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat <
+ (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc,
+ imm:$r128, imm:$da, imm:$slc),
+ (EXTRACT_SUBREG
+ (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1),
+ $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)),
+ sub0)
+>;
+
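ImageAtomicCmpSwapPattern packs the new value and the compare value into a 64-bit register pair (vsrc in sub0, vcmp in sub1), issues the cmpswap with what appears from its position to be a dmask of 3 covering both halves, and extracts the previous memory value from sub0 of the result. A hedged scalar model of the packing and unpacking:

#include <cstdint>

// vsrc occupies the low half, vcmp the high half of the VReg_64 pair.
uint64_t packCmpSwapOperands(uint32_t vsrc, uint32_t vcmp) {
  return ((uint64_t)vcmp << 32) | vsrc; // REG_SEQUENCE $vsrc, sub0, $vcmp, sub1
}

// The old memory value comes back in the low 32 bits.
uint32_t extractOldValue(uint64_t result) {
  return (uint32_t)result;              // EXTRACT_SUBREG ..., sub0
}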
// Basic sample
defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">;
defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">;
@@ -2341,38 +2685,57 @@ def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>;
def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>;
defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">;
defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">;
+defm : ImageLoadPatterns<int_amdgcn_image_load, "IMAGE_LOAD">;
+defm : ImageLoadPatterns<int_amdgcn_image_load_mip, "IMAGE_LOAD_MIP">;
+defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">;
+defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_swap, "IMAGE_ATOMIC_SWAP">;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V1, i32>;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V2, v2i32>;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V4, v4i32>;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_add, "IMAGE_ATOMIC_ADD">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_sub, "IMAGE_ATOMIC_SUB">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smin, "IMAGE_ATOMIC_SMIN">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umin, "IMAGE_ATOMIC_UMIN">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smax, "IMAGE_ATOMIC_SMAX">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umax, "IMAGE_ATOMIC_UMAX">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_and, "IMAGE_ATOMIC_AND">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_or, "IMAGE_ATOMIC_OR">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_xor, "IMAGE_ATOMIC_XOR">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">;
/* SIsample for simple 1D texture lookup */
def : Pat <
- (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
- (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+ (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
+ (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
>;
class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
- (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
>;
class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT),
- (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT),
+ (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0)
>;
class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY),
- (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
>;
class SampleShadowPattern<SDNode name, MIMG opcode,
ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW),
- (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
>;
class SampleShadowArrayPattern<SDNode name, MIMG opcode,
ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
- (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
>;
/* SIsample* for texture lookups consuming more address parameters */
@@ -2422,68 +2785,10 @@ defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16,
IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16,
v16i32>;
-/* int_SI_imageload for texture fetches consuming varying address parameters */
-class ImageLoadPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
- (name addr_type:$addr, v32i8:$rsrc, imm),
- (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc)
->;
-
-class ImageLoadArrayPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
- (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY),
- (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
->;
-
-class ImageLoadMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
- (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA),
- (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc)
->;
-
-class ImageLoadArrayMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
- (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA),
- (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
->;
-
-multiclass ImageLoadPatterns<MIMG opcode, ValueType addr_type> {
- def : ImageLoadPattern <int_SI_imageload, opcode, addr_type>;
- def : ImageLoadArrayPattern <int_SI_imageload, opcode, addr_type>;
-}
-
-multiclass ImageLoadMSAAPatterns<MIMG opcode, ValueType addr_type> {
- def : ImageLoadMSAAPattern <int_SI_imageload, opcode, addr_type>;
- def : ImageLoadArrayMSAAPattern <int_SI_imageload, opcode, addr_type>;
-}
-
-defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V2, v2i32>;
-defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V4, v4i32>;
-
-defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V2, v2i32>;
-defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V4, v4i32>;
-
-/* Image resource information */
-def : Pat <
- (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm),
- (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
->;
-
-def : Pat <
- (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY),
- (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
->;
-
-def : Pat <
- (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA),
- (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
->;
-
/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting **********/
/********** ============================================ **********/
-//def : Extract_Element<i64, v2i64, 0, sub0_sub1>;
-//def : Extract_Element<i64, v2i64, 1, sub2_sub3>;
-//def : Extract_Element<f64, v2f64, 0, sub0_sub1>;
-//def : Extract_Element<f64, v2f64, 1, sub2_sub3>;
-
foreach Index = 0-2 in {
def Extract_Element_v2i32_#Index : Extract_Element <
i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -2548,50 +2853,47 @@ foreach Index = 0-15 in {
>;
}
-def : BitConvert <i32, f32, SReg_32>;
+// FIXME: Why are only some of these type combinations defined for SReg
+// and VReg?
+// 32-bit bitcast
def : BitConvert <i32, f32, VGPR_32>;
-
-def : BitConvert <f32, i32, SReg_32>;
def : BitConvert <f32, i32, VGPR_32>;
+def : BitConvert <i32, f32, SReg_32>;
+def : BitConvert <f32, i32, SReg_32>;
+// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
-
def : BitConvert <f64, i64, VReg_64>;
-
-def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <v2i32, v2f32, VReg_64>;
-def : BitConvert <v2i32, i64, VReg_64>;
+def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <i64, v2i32, VReg_64>;
-def : BitConvert <v2f32, i64, VReg_64>;
+def : BitConvert <v2i32, i64, VReg_64>;
def : BitConvert <i64, v2f32, VReg_64>;
-def : BitConvert <v2f32, f64, VReg_64>;
-def : BitConvert <v2i32, f64, VReg_64>;
+def : BitConvert <v2f32, i64, VReg_64>;
def : BitConvert <f64, v2f32, VReg_64>;
+def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
-def : BitConvert <v4f32, v4i32, VReg_128>;
+def : BitConvert <v2i32, f64, VReg_64>;
def : BitConvert <v4i32, v4f32, VReg_128>;
+def : BitConvert <v4f32, v4i32, VReg_128>;
-
+// 128-bit bitcast
def : BitConvert <v2i64, v4i32, SReg_128>;
def : BitConvert <v4i32, v2i64, SReg_128>;
-
def : BitConvert <v2f64, v4f32, VReg_128>;
def : BitConvert <v2f64, v4i32, VReg_128>;
def : BitConvert <v4f32, v2f64, VReg_128>;
def : BitConvert <v4i32, v2f64, VReg_128>;
+def : BitConvert <v2i64, v2f64, VReg_128>;
+def : BitConvert <v2f64, v2i64, VReg_128>;
-
-
-
-def : BitConvert <v8f32, v8i32, SReg_256>;
+// 256-bit bitcast
def : BitConvert <v8i32, v8f32, SReg_256>;
-def : BitConvert <v8i32, v32i8, SReg_256>;
-def : BitConvert <v32i8, v8i32, SReg_256>;
-def : BitConvert <v8i32, v32i8, VReg_256>;
+def : BitConvert <v8f32, v8i32, SReg_256>;
def : BitConvert <v8i32, v8f32, VReg_256>;
def : BitConvert <v8f32, v8i32, VReg_256>;
-def : BitConvert <v32i8, v8i32, VReg_256>;
+// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
@@ -2613,7 +2915,7 @@ def : Pat <
def : Pat <
(fneg (fabs f32:$src)),
- (S_OR_B32 $src, 0x80000000) /* Set sign bit */
+ (S_OR_B32 $src, 0x80000000) // Set sign bit
>;
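Negating an absolute value just forces the IEEE-754 sign bit on, so a single scalar OR with 0x80000000 suffices. Sketch of the bit trick:

#include <cstdint>
#include <cstring>

// -|x| in single precision: set the sign bit, leave the rest alone.
float negAbs(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits |= 0x80000000u;                  // S_OR_B32 $src, 0x80000000
  std::memcpy(&x, &bits, sizeof(x));
  return x;
}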
// FIXME: Should use S_OR_B32
@@ -2703,15 +3005,9 @@ def : Pat <
/********** Intrinsic Patterns **********/
/********** ================== **********/
-/* llvm.AMDGPU.pow */
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
def : Pat <
- (int_AMDGPU_div f32:$src0, f32:$src1),
- (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1))
->;
-
-def : Pat <
(int_AMDGPU_cube v4f32:$src),
(REG_SEQUENCE VReg_128,
(V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0),
@@ -2745,7 +3041,7 @@ class Ext32Pat <SDNode ext> : Pat <
def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;
-// Offset in an 32Bit VGPR
+// Offset in a 32-bit VGPR
def : Pat <
(SIload_constant v4i32:$sbase, i32:$voff),
(BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0)
@@ -2759,12 +3055,6 @@ def : Pat <
(V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;
-def : Pat <
- (int_SI_tid),
- (V_MBCNT_HI_U32_B32_e64 0xffffffff,
- (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0))
->;
-
//===----------------------------------------------------------------------===//
// VOP3 Patterns
//===----------------------------------------------------------------------===//
@@ -2772,16 +3062,6 @@ def : Pat <
def : IMad24Pat<V_MAD_I32_I24>;
def : UMad24Pat<V_MAD_U32_U24>;
-def : Pat <
- (mulhu i32:$src0, i32:$src1),
- (V_MUL_HI_U32 $src0, $src1)
->;
-
-def : Pat <
- (mulhs i32:$src0, i32:$src1),
- (V_MUL_HI_I32 $src0, $src1)
->;
-
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
@@ -2839,19 +3119,6 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
(inst $ptr, $value, (as_i16imm $offset), (i1 0))
>;
-// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec
-//
-// We need to use something for the data0, so we set a register to
-// -1. For the non-rtn variants, the manual says it does
-// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max
-// will always do the increment so I'm assuming it's the same.
-class DSAtomicIncRetPat<DS inst, ValueType vt,
- Instruction LoadImm, PatFrag frag> : Pat <
- (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
- (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0))
->;
-
-
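The removed comment documents the hardware semantics that made the old workaround valid: ds_inc computes DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and with D0 = UINT32_MAX the wrap-to-zero case coincides with ordinary unsigned wraparound, so it behaved like an atomic add of 1. That special case is no longer needed because dedicated si_atomic_inc_local / si_atomic_dec_local nodes now select DS_INC/DS_DEC directly (see the DSAtomicRetPat defs below). Scalar model of the documented semantics:

#include <cstdint>

// ds_inc per the removed comment: wrap to zero once the counter reaches
// D0. dsInc(x, UINT32_MAX) == x + 1 (mod 2^32) for every x.
uint32_t dsInc(uint32_t mem, uint32_t d0) {
  return (mem >= d0) ? 0 : mem + 1;
}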
class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
(inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
@@ -2859,14 +3126,11 @@ class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
// 32-bit atomics.
-def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
- V_MOV_B32_e32, si_atomic_load_add_local>;
-def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
- V_MOV_B32_e32, si_atomic_load_sub_local>;
-
def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>;
def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>;
def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_INC_RTN_U32, i32, si_atomic_inc_local>;
+def : DSAtomicRetPat<DS_DEC_RTN_U32, i32, si_atomic_dec_local>;
def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>;
def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>;
def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>;
@@ -2874,18 +3138,14 @@ def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>;
def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>;
def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>;
def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>;
-
def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>;
// 64-bit atomics.
-def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
- V_MOV_B64_PSEUDO, si_atomic_load_add_local>;
-def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
- V_MOV_B64_PSEUDO, si_atomic_load_sub_local>;
-
def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>;
def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>;
def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_INC_RTN_U64, i64, si_atomic_inc_local>;
+def : DSAtomicRetPat<DS_DEC_RTN_U64, i64, si_atomic_dec_local>;
def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>;
def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>;
def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>;
@@ -2901,20 +3161,35 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>;
// MUBUF Patterns
//===----------------------------------------------------------------------===//
-multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
- PatFrag constant_ld> {
- def : Pat <
+class MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
+ PatFrag constant_ld> : Pat <
(vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
(Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
>;
+
+multiclass MUBUFLoad_Atomic_Pattern <MUBUF Instr_ADDR64, MUBUF Instr_OFFSET,
+ ValueType vt, PatFrag atomic_ld> {
+ def : Pat <
+ (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i1:$slc))),
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0)
+ >;
+
+ def : Pat <
+ (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
+ (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0)
+ >;
}
let Predicates = [isSICI] in {
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
+def : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
+def : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
+def : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
+def : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
+
+defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>;
+defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>;
} // End Predicates = [isSICI]
class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
@@ -2975,6 +3250,25 @@ defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_
defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
+multiclass MUBUFStore_Atomic_Pattern <MUBUF Instr_ADDR64, MUBUF Instr_OFFSET,
+ ValueType vt, PatFrag atomic_st> {
+ // Store follows the atomic op convention, so the address comes first.
+ def : Pat <
+ (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i1:$slc), vt:$val),
+ (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0)
+ >;
+
+ def : Pat <
+ (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
+ (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0)
+ >;
+}
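Unlike ordinary store nodes, atomic stores follow the atomic-op operand convention and list the address before the value, which is why these patterns put the address match first. For illustration only, a relaxed 32-bit store to global memory is the kind of source global_store_atomic presumably covers on SI/CI:

#include <atomic>
#include <cstdint>

// A monotonic (relaxed) atomic store to global memory; the corresponding
// DAG node is what the ADDR64/OFFSET patterns above would select.
void storeRelaxed(std::atomic<uint32_t> *p, uint32_t v) {
  p->store(v, std::memory_order_relaxed);
}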
+let Predicates = [isSICI] in {
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, global_store_atomic>;
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, global_store_atomic>;
+} // End Predicates = [isSICI]
+
class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
(st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
u16imm:$offset)),
@@ -2987,22 +3281,6 @@ def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;
-/*
-class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
- (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)),
- (Instr $value, $srsrc, $vaddr, $offset)
->;
-
-let Predicates = [isSICI] in {
-def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>;
-def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>;
-def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>;
-def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>;
-def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>;
-} // End Predicates = [isSICI]
-
-*/
-
//===----------------------------------------------------------------------===//
// MTBUF Patterns
//===----------------------------------------------------------------------===//
@@ -3029,29 +3307,16 @@ def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
/********** ====================== **********/
multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
-
- // 1. Extract with offset
+ // Extract with offset
def : Pat<
- (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))),
- (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off)
+ (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))),
+ (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
>;
- // 2. Extract without offset
+ // Insert with offset
def : Pat<
- (eltvt (extractelt vt:$vec, i32:$idx)),
- (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0)
- >;
-
- // 3. Insert with offset
- def : Pat<
- (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
- (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val)
- >;
-
- // 4. Insert without offset
- def : Pat<
- (insertelt vt:$vec, eltvt:$val, i32:$idx),
- (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val)
+ (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))),
+ (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
>;
}
@@ -3111,10 +3376,12 @@ def : ZExt_i64_i32_Pat<anyext>;
def : ZExt_i64_i1_Pat<zext>;
def : ZExt_i64_i1_Pat<anyext>;
+// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
+// REG_SEQUENCE patterns don't support instructions with multiple outputs.
def : Pat <
(i64 (sext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0,
- (S_ASHR_I32 $src, 31), sub1)
+ (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SReg_32_XM0)), sub1)
>;
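The i64 sign extension keeps the source as the low word and materializes the high word by arithmetically shifting the sign bit across all 32 positions. Scalar sketch:

#include <cstdint>

// sext i32 -> i64: the high word is the sign bit replicated 32 times.
// (Assumes the usual arithmetic-shift behavior of >> on signed values.)
int64_t sext32to64(int32_t src) {
  uint32_t lo = (uint32_t)src;
  uint32_t hi = (uint32_t)(src >> 31);  // S_ASHR_I32 $src, 31
  return (int64_t)(((uint64_t)hi << 32) | lo);
}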
def : Pat <
@@ -3214,6 +3481,23 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
def : BFEPattern <V_BFE_U32, S_MOV_B32>;
+let Predicates = [isSICI] in {
+def : Pat <
+ (i64 (readcyclecounter)),
+ (S_MEMTIME)
+>;
+}
+
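On SI/CI the generic readcyclecounter node now selects S_MEMTIME, a 64-bit counter read. As an illustration, Clang exposes the underlying intrinsic as a builtin (Clang-specific; not part of this patch):

#include <cstdint>

// Lowers to llvm.readcyclecounter, which this pattern turns into S_MEMTIME.
uint64_t readTimestamp() {
  return __builtin_readcyclecounter();
}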
+def : Pat<
+ (fcanonicalize f32:$src),
+ (V_MUL_F32_e64 0, CONST.FP32_ONE, 0, $src, 0, 0)
+>;
+
+def : Pat<
+ (fcanonicalize f64:$src),
+ (V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0)
+>;
+
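fcanonicalize is lowered as a multiply by 1.0: numerically an identity, but the multiply canonicalizes the value in hardware (quieting signaling NaNs and applying the current denormal mode is the usual effect, though this hunk only shows the selection). Sketch of the shape:

// Canonicalize-by-multiply: the value is unchanged except for
// non-canonical encodings (e.g. sNaN payloads), which the fmul fixes up.
float canonicalizeF32(float x) { return x * 1.0f; }
double canonicalizeF64(double x) { return x * 1.0; }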
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
@@ -3226,21 +3510,6 @@ let Predicates = [isSI] in {
// The workaround for the V_FRACT bug is:
// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
-// Convert (x + (-floor(x)) to fract(x)
-def : Pat <
- (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
- (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
- (V_CNDMASK_B64_PSEUDO
- (V_MIN_F64
- SRCMODS.NONE,
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
- SRCMODS.NONE,
- (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
- DSTCLAMP.NONE, DSTOMOD.NONE),
- $x,
- (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/))
->;
-
// Convert floor(x) to (x - fract(x))
def : Pat <
(f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
@@ -3268,6 +3537,9 @@ def : Pat <
def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
+def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
+def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
+
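V_MED3 computes the median of three values. IntMed3Pat itself is defined elsewhere, but the smax/smin operand lists suggest the standard median folding shape, shown here as a plain function (the one-use variants presumably stop the fold when the inner min/max have other users):

#include <algorithm>
#include <cstdint>

// Median of three via the classic min/max shape:
//   med3(x, y, z) = max(min(x, y), min(max(x, y), z))
int32_t med3(int32_t x, int32_t y, int32_t z) {
  return std::max(std::min(x, y), std::min(std::max(x, y), z));
}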
//============================================================================//
// Assembler aliases
//============================================================================//
diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td
index 027a0a2f5167..a9b7c39096e7 100644
--- a/lib/Target/AMDGPU/SIIntrinsics.td
+++ b/lib/Target/AMDGPU/SIIntrinsics.td
@@ -13,8 +13,6 @@
let TargetPrefix = "SI", isTarget = 1 in {
-
- def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>;
def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
@@ -50,9 +48,9 @@ let TargetPrefix = "SI", isTarget = 1 in {
llvm_i32_ty, // glc(imm)
llvm_i32_ty, // slc(imm)
llvm_i32_ty], // tfe(imm)
- [IntrReadArgMem]>;
+ [IntrReadMem, IntrArgMemOnly]>;
- def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
// Fully-flexible SAMPLE instruction.
class SampleRaw : Intrinsic <
@@ -172,28 +170,20 @@ let TargetPrefix = "SI", isTarget = 1 in {
def int_SI_image_load_mip : Image;
def int_SI_getresinfo : Image;
- // Deprecated image and sample intrinsics.
- class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
-
- def int_SI_sample : Sample;
- def int_SI_sampleb : Sample;
- def int_SI_sampled : Sample;
- def int_SI_samplel : Sample;
- def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-
/* Interpolation Intrinsics */
def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>;
+} // End TargetPrefix = "SI", isTarget = 1
+let TargetPrefix = "amdgcn", isTarget = 1 in {
/* Control flow Intrinsics */
- def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
- def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
- def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
- def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
- def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
- def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
- def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
+ def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
+ def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
+ def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
+ def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
+ def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
+ def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
+ def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
}
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 1bdb1f0ee9f9..9e972a569a0f 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -26,7 +26,7 @@
//
// - Live interval recomputing seems inefficient. This currently only matches
// one pair, and recomputes live intervals and moves on to the next pair. It
-// would be better to compute a list of all merges that need to occur
+// would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
// cluster of loads have offsets that are too large to fit in the 8-bit
@@ -36,6 +36,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
@@ -61,7 +62,6 @@ private:
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
-
static bool offsetsCanBeCombined(unsigned Offset0,
unsigned Offset1,
unsigned EltSize);
@@ -69,10 +69,6 @@ private:
MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
unsigned EltSize);
- void updateRegDefsUses(unsigned SrcReg,
- unsigned DstReg,
- unsigned SubIdx);
-
MachineBasicBlock::iterator mergeRead2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
@@ -193,17 +189,6 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
return E;
}
-void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg,
- unsigned DstReg,
- unsigned SubIdx) {
- for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg),
- E = MRI->reg_end(); I != E; ) {
- MachineOperand &O = *I;
- ++I;
- O.substVirtReg(DstReg, SubIdx, *TRI);
- }
-}
-
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
@@ -268,19 +253,19 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
.addOperand(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- LIS->InsertMachineInstrInMaps(Read2);
+ LIS->InsertMachineInstrInMaps(*Read2);
// repairLiveintervalsInRange() doesn't handle physical register, so we have
// to update the M0 range manually.
- SlotIndex PairedIndex = LIS->getInstructionIndex(Paired);
+ SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
// The new write to the original destination register is now the copy. Steal
// the old SlotIndex.
- LIS->ReplaceMachineInstrInMaps(I, Copy0);
- LIS->ReplaceMachineInstrInMaps(Paired, Copy1);
+ LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
+ LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);
I->eraseFromParent();
Paired->eraseFromParent();
@@ -291,7 +276,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
LIS->createAndComputeVirtRegInterval(DestReg);
if (UpdateM0Range) {
- SlotIndex Read2Index = LIS->getInstructionIndex(Read2);
+ SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
M0Segment->end = Read2Index.getRegSlot();
}
@@ -340,7 +325,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
// repairLiveintervalsInRange() doesn't handle physical register, so we have
// to update the M0 range manually.
- SlotIndex PairedIndex = LIS->getInstructionIndex(Paired);
+ SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
@@ -359,8 +344,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
// XXX - How do we express subregisters here?
unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
- LIS->RemoveMachineInstrFromMaps(I);
- LIS->RemoveMachineInstrFromMaps(Paired);
+ LIS->RemoveMachineInstrFromMaps(*I);
+ LIS->RemoveMachineInstrFromMaps(*Paired);
I->eraseFromParent();
Paired->eraseFromParent();
@@ -368,7 +353,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
if (UpdateM0Range) {
- SlotIndex Write2Index = LIS->getInstructionIndex(Write2);
+ SlotIndex Write2Index = LIS->getInstructionIndex(*Write2);
M0Segment->end = Write2Index.getRegSlot();
}
@@ -423,9 +408,16 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
}
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
- const TargetSubtargetInfo &STM = MF.getSubtarget();
- TRI = static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
- TII = static_cast<const SIInstrInfo *>(STM.getInstrInfo());
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ if (!STM.loadStoreOptEnabled())
+ return false;
+
+ TII = STM.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 126f6245dfc0..ee1d5dae70b7 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -52,6 +52,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -61,24 +62,24 @@
using namespace llvm;
-namespace {
+#define DEBUG_TYPE "si-lower-control-flow"
-class SILowerControlFlowPass : public MachineFunctionPass {
+namespace {
+class SILowerControlFlow : public MachineFunctionPass {
private:
static const unsigned SkipThreshold = 12;
- static char ID;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
void Skip(MachineInstr &From, MachineOperand &To);
- void SkipIfDead(MachineInstr &MI);
+ bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
void If(MachineInstr &MI);
- void Else(MachineInstr &MI);
+ void Else(MachineInstr &MI, bool ExecModified);
void Break(MachineInstr &MI);
void IfBreak(MachineInstr &MI);
void ElseBreak(MachineInstr &MI);
@@ -88,56 +89,118 @@ private:
void Kill(MachineInstr &MI);
void Branch(MachineInstr &MI);
- void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
- void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
- void IndirectSrc(MachineInstr &MI);
- void IndirectDst(MachineInstr &MI);
+ MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ std::pair<MachineBasicBlock *, MachineBasicBlock *>
+ splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+
+ void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
+ const MachineRegisterInfo &MRI,
+ const MachineInstr &MI,
+ MachineBasicBlock &LoopBB,
+ MachineBasicBlock &RemainderBB,
+ unsigned SaveReg,
+ const MachineOperand &IdxReg);
+
+ void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
+ MachineInstr *MovRel,
+ const MachineOperand &IdxReg,
+ int Offset);
+
+ bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
+ std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
+ int Offset) const;
+ bool indirectSrc(MachineInstr &MI);
+ bool indirectDst(MachineInstr &MI);
public:
- SILowerControlFlowPass(TargetMachine &tm) :
+ static char ID;
+
+ SILowerControlFlow() :
MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
- return "SI Lower control flow instructions";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
+ return "SI Lower control flow pseudo instructions";
}
};
} // End anonymous namespace
-char SILowerControlFlowPass::ID = 0;
+char SILowerControlFlow::ID = 0;
+
+INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
+ "SI lower control flow", false, false)
-FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
- return new SILowerControlFlowPass(tm);
+char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
+
+
+FunctionPass *llvm::createSILowerControlFlowPass() {
+ return new SILowerControlFlow();
}
-bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
- MachineBasicBlock *To) {
+static bool opcodeEmitsNoInsts(unsigned Opc) {
+ switch (Opc) {
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::BUNDLE:
+ case TargetOpcode::CFI_INSTRUCTION:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::GC_LABEL:
+ case TargetOpcode::DBG_VALUE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
+ MachineBasicBlock *To) {
+ if (From->succ_empty())
+ return false;
unsigned NumInstr = 0;
+ MachineFunction *MF = From->getParent();
- for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
- MBB = *MBB->succ_begin()) {
+ for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
+ MBBI != End && MBBI != ToI; ++MBBI) {
+ MachineBasicBlock &MBB = *MBBI;
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
NumInstr < SkipThreshold && I != E; ++I) {
+ if (opcodeEmitsNoInsts(I->getOpcode()))
+ continue;
+
+ // When a uniform loop is inside non-uniform control flow, the branch
+ // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
+ // when EXEC = 0. We should skip the loop lest it become infinite.
+ if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
+ I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
+ return true;
+
+ if (I->isInlineAsm()) {
+ const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+ const char *AsmStr = I->getOperand(0).getSymbolName();
+
+ // The inline asm length estimate is the number of bytes assuming
+ // every instruction uses the longest possible encoding.
+ uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
+ NumInstr += MaxAsmSize / MAI->getMaxInstLength();
+ } else {
+ ++NumInstr;
+ }
- if (I->isBundle() || !I->isBundled())
- if (++NumInstr >= SkipThreshold)
- return true;
+ if (NumInstr >= SkipThreshold)
+ return true;
}
}
return false;
}
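Inline asm is now charged by its worst-case byte size divided by the target's maximum instruction length instead of counting as one instruction. Worked example under assumed numbers: a 48-byte estimate with an 8-byte maximum instruction length contributes 6 toward the SkipThreshold of 12.

#include <cstdint>

// The contribution an inline asm block makes to NumInstr.
unsigned inlineAsmInstrCount(uint64_t maxAsmSizeBytes, unsigned maxInstLength) {
  return (unsigned)(maxAsmSizeBytes / maxInstLength);
}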
-void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
+void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
return;
@@ -147,40 +210,44 @@ void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
.addOperand(To);
}
-void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
-
+bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
- if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
- ShaderType::PIXEL ||
+ if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
!shouldSkip(&MBB, &MBB.getParent()->back()))
- return;
+ return false;
+
+ MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
+ MBB.addSuccessor(SkipBB);
- MachineBasicBlock::iterator Insert = &MI;
- ++Insert;
+ const DebugLoc &DL = MI.getDebugLoc();
// If the exec mask is non-zero, skip the next two instructions
- BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addImm(3);
+ BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(&NextBB);
+
+ MachineBasicBlock::iterator Insert = SkipBB->begin();
// Exec mask is zero: Export to NULL target...
- BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
- .addImm(0)
- .addImm(0x09) // V_008DFC_SQ_EXP_NULL
- .addImm(0)
- .addImm(1)
- .addImm(1)
- .addReg(AMDGPU::VGPR0)
- .addReg(AMDGPU::VGPR0)
- .addReg(AMDGPU::VGPR0)
- .addReg(AMDGPU::VGPR0);
-
- // ... and terminate wavefront
- BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
+ .addImm(0)
+ .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+ .addImm(0)
+ .addImm(1)
+ .addImm(1)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef);
+
+ // ... and terminate wavefront.
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+
+ return true;
}
-void SILowerControlFlowPass::If(MachineInstr &MI) {
+void SILowerControlFlow::If(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Reg = MI.getOperand(0).getReg();
@@ -195,10 +262,15 @@ void SILowerControlFlowPass::If(MachineInstr &MI) {
Skip(MI, MI.getOperand(2));
+ // Insert a pseudo terminator to help keep the verifier happy.
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addOperand(MI.getOperand(2))
+ .addReg(Reg);
+
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Else(MachineInstr &MI) {
+void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
@@ -208,22 +280,36 @@ void SILowerControlFlowPass::Else(MachineInstr &MI) {
TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
.addReg(Src); // Saved EXEC
+ if (ExecModified) {
+ // Adjust the saved exec to account for the modifications during the flow
+ // block that contains the ELSE. This can happen when WQM mode is switched
+ // off.
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
+ .addReg(AMDGPU::EXEC)
+ .addReg(Dst);
+ }
+
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(Dst);
Skip(MI, MI.getOperand(2));
+ // Insert a pseudo terminator to help keep the verifier happy.
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addOperand(MI.getOperand(2))
+ .addReg(Dst);
+
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Break(MachineInstr &MI) {
+void SILowerControlFlow::Break(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Src = MI.getOperand(1).getReg();
-
+
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(AMDGPU::EXEC)
.addReg(Src);
@@ -231,14 +317,14 @@ void SILowerControlFlowPass::Break(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
+void SILowerControlFlow::IfBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Vcc = MI.getOperand(1).getReg();
unsigned Src = MI.getOperand(2).getReg();
-
+
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(Vcc)
.addReg(Src);
@@ -246,14 +332,14 @@ void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
+void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Saved = MI.getOperand(1).getReg();
unsigned Src = MI.getOperand(2).getReg();
-
+
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(Saved)
.addReg(Src);
@@ -261,7 +347,7 @@ void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Loop(MachineInstr &MI) {
+void SILowerControlFlow::Loop(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Src = MI.getOperand(0).getReg();
@@ -276,7 +362,7 @@ void SILowerControlFlowPass::Loop(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
+void SILowerControlFlow::EndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Reg = MI.getOperand(0).getReg();
@@ -289,24 +375,24 @@ void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Branch(MachineInstr &MI) {
- if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
+void SILowerControlFlow::Branch(MachineInstr &MI) {
+ MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
+ if (MBB == MI.getParent()->getNextNode())
MI.eraseFromParent();
// If these aren't equal, this is probably an infinite loop.
}
-void SILowerControlFlowPass::Kill(MachineInstr &MI) {
+void SILowerControlFlow::Kill(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
const MachineOperand &Op = MI.getOperand(0);
#ifndef NDEBUG
- const SIMachineFunctionInfo *MFI
- = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+ CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
// Kill is only allowed in pixel / geometry shaders.
- assert(MFI->getShaderType() == ShaderType::PIXEL ||
- MFI->getShaderType() == ShaderType::GEOMETRY);
+ assert(CallConv == CallingConv::AMDGPU_PS ||
+ CallConv == CallingConv::AMDGPU_GS);
#endif
// Clear this thread from the exec mask if the operand is negative
@@ -325,94 +411,209 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
+// All currently live registers must remain so in the remainder block.
+void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
+ const MachineRegisterInfo &MRI,
+ const MachineInstr &MI,
+ MachineBasicBlock &LoopBB,
+ MachineBasicBlock &RemainderBB,
+ unsigned SaveReg,
+ const MachineOperand &IdxReg) {
+ // Add reg defined in loop body.
+ RemainderLiveRegs.addReg(SaveReg);
+
+ if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
+ if (!Val->isUndef()) {
+ RemainderLiveRegs.addReg(Val->getReg());
+ LoopBB.addLiveIn(Val->getReg());
+ }
+ }
+
+ for (unsigned Reg : RemainderLiveRegs) {
+ if (MRI.isAllocatable(Reg))
+ RemainderBB.addLiveIn(Reg);
+ }
+
+ const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
+ if (!Src->isUndef())
+ LoopBB.addLiveIn(Src->getReg());
+
+ if (!IdxReg.isUndef())
+ LoopBB.addLiveIn(IdxReg.getReg());
+ LoopBB.sortUniqueLiveIns();
+}
+
+void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
+ DebugLoc DL,
+ MachineInstr *MovRel,
+ const MachineOperand &IdxReg,
+ int Offset) {
+ MachineBasicBlock::iterator I = LoopBB.begin();
+
+ // Read the next variant into VCC (lower 32 bits) <- also loop target
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
+ .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
+
+ // Move index from VCC into M0
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(AMDGPU::VCC_LO);
+
+ // Compare the just read M0 value to all possible Idx values
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
+ .addReg(AMDGPU::M0)
+ .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
+
+ // Update EXEC, save the original EXEC value to VCC
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+ .addReg(AMDGPU::VCC);
+
+ if (Offset != 0) {
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ .addReg(AMDGPU::M0)
+ .addImm(Offset);
+ }
+
+ // Do the actual move
+ LoopBB.insert(I, MovRel);
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::VCC);
+
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(&LoopBB);
+}
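emitLoadM0FromVGPRLoop builds the classic waterfall loop: read the index from the first active lane, execute the indexed move for every lane holding the same index, clear those lanes from EXEC, and repeat until none remain. A scalar simulation of the control flow (illustrative; the 64-lane wavefront and helper shapes are assumptions):

#include <cstdint>

// Simulate the waterfall over a 64-lane wavefront with per-lane
// indices idx[] and execution mask exec.
void waterfall(const uint32_t idx[64], uint64_t exec) {
  while (exec) {                                    // S_CBRANCH_EXECNZ
    unsigned firstLane = __builtin_ctzll(exec);
    uint32_t m0 = idx[firstLane];                   // V_READFIRSTLANE + S_MOV m0
    uint64_t sameIdx = 0;
    for (unsigned l = 0; l < 64; ++l)               // V_CMP_EQ_U32
      if (((exec >> l) & 1) && idx[l] == m0)
        sameIdx |= 1ull << l;
    // ... the MovRel executes here for the lanes in sameIdx ...
    exec &= ~sameIdx;                               // S_AND_SAVEEXEC + S_XOR
  }
}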
+
+MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
+ MachineFunction *MF = MBB.getParent();
+
+ MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF->insert(MBBI, SkipBB);
+
+ return SkipBB;
+}
+
+std::pair<MachineBasicBlock *, MachineBasicBlock *>
+SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
+ MachineFunction *MF = MBB.getParent();
+ // To insert the loop we need to split the block. Move everything after this
+ // point to a new block, and insert a new empty block between the two.
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF->insert(MBBI, LoopBB);
+ MF->insert(MBBI, RemainderBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessors(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+
+ MBB.addSuccessor(LoopBB);
+
+ return std::make_pair(LoopBB, RemainderBB);
+}
+
+// Returns true if a new block was inserted.
+bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
- MachineBasicBlock::iterator I = MI;
+ MachineBasicBlock::iterator I(&MI);
- unsigned Save = MI.getOperand(1).getReg();
- unsigned Idx = MI.getOperand(3).getReg();
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
- if (AMDGPU::SReg_32RegClass.contains(Idx)) {
- if (Offset) {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addReg(Idx)
- .addImm(Offset);
+ if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
+ if (Offset != 0) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
+ .addImm(Offset);
} else {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(Idx);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
}
+
MBB.insert(I, MovRel);
- } else {
+ MI.eraseFromParent();
+ return false;
+ }
- assert(AMDGPU::SReg_64RegClass.contains(Save));
- assert(AMDGPU::VGPR_32RegClass.contains(Idx));
+ MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+ SaveOp->setIsDead(false);
+ unsigned Save = SaveOp->getReg();
- // Save the EXEC mask
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
- .addReg(AMDGPU::EXEC);
+ // Reading from a VGPR requires looping over all workitems in the wavefront.
+ assert(AMDGPU::SReg_64RegClass.contains(Save) &&
+ AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));
- // Read the next variant into VCC (lower 32 bits) <- also loop target
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- AMDGPU::VCC_LO)
- .addReg(Idx);
+ // Save the EXEC mask
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+ .addReg(AMDGPU::EXEC);
- // Move index from VCC into M0
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(AMDGPU::VCC_LO);
+ LivePhysRegs RemainderLiveRegs(TRI);
- // Compare the just read M0 value to all possible Idx values
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
- .addReg(AMDGPU::M0)
- .addReg(Idx);
+ RemainderLiveRegs.addLiveOuts(MBB);
- // Update EXEC, save the original EXEC value to VCC
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
+ MachineBasicBlock *LoopBB;
+ MachineBasicBlock *RemainderBB;
- if (Offset) {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addReg(AMDGPU::M0)
- .addImm(Offset);
- }
- // Do the actual move
- MBB.insert(I, MovRel);
+ std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);
- // Update EXEC, switch all done bits to 0 and all todo bits to 1
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(AMDGPU::VCC);
+ for (const MachineInstr &Inst : reverse(*RemainderBB))
+ RemainderLiveRegs.stepBackward(Inst);
- // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addImm(-7);
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ LoopBB->addSuccessor(RemainderBB);
+ LoopBB->addSuccessor(LoopBB);
- // Restore EXEC
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
- .addReg(Save);
+ splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
+ *RemainderBB, Save, *Idx);
+
+ emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);
+
+ MachineBasicBlock::iterator First = RemainderBB->begin();
+ BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(Save);
- }
MI.eraseFromParent();
+ return true;
}
-/// \param @VecReg The register which holds element zero of the vector
-/// being addressed into.
-/// \param[out] @Reg The base register to use in the indirect addressing instruction.
-/// \param[in,out] @Offset As an input, this is the constant offset part of the
-// indirect Index. e.g. v0 = v[VecReg + Offset]
-// As an output, this is a constant value that needs
-// to be added to the value stored in M0.
-void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg,
- unsigned &Reg,
- int &Offset) {
+/// \param @VecReg The register which holds element zero of the vector being
+/// addressed into.
+///
+/// \param[in] @Idx The index operand from the movrel instruction. This must be
+/// a register, but may be NoRegister.
+///
+/// \param[in] @Offset As an input, this is the constant offset part of the
+/// indirect Index, e.g. v0 = v[VecReg + Offset]. As an output, this is a
+/// constant value that needs to be added to the value stored in M0.
+std::pair<unsigned, int>
+SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
if (!SubReg)
SubReg = VecReg;
+ const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
- int RegIdx = TRI->getHWRegIndex(SubReg) + Offset;
+ int NumElts = SuperRC->getSize() / RC->getSize();
+
+ int BaseRegIdx = TRI->getHWRegIndex(SubReg);
+
+ // Skip out-of-bounds offsets, or else we would end up using an undefined
+ // register.
+ if (Offset >= NumElts)
+ return std::make_pair(RC->getRegister(BaseRegIdx), Offset);
+ int RegIdx = BaseRegIdx + Offset;
if (RegIdx < 0) {
Offset = RegIdx;
RegIdx = 0;
@@ -420,77 +621,102 @@ void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg,
Offset = 0;
}
- Reg = RC->getRegister(RegIdx);
+ unsigned Reg = RC->getRegister(RegIdx);
+ return std::make_pair(Reg, Offset);
}
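A worked example of the new return-value form (register indices stand in for physical registers): with a 4-element vector whose first element is hardware index 8, an offset of 2 yields (10, 0); an out-of-range offset of 5 is handed back unchanged together with the base register, to be added into M0; a negative resolved index clamps the register to the class base and carries the remainder in the offset.

#include <utility>

// Scalar model of computeIndirectRegAndOffset's register/offset split.
std::pair<int, int> regAndOffset(int baseRegIdx, int numElts, int offset) {
  if (offset >= numElts)            // out of bounds: leave it all for M0
    return {baseRegIdx, offset};
  int regIdx = baseRegIdx + offset;
  if (regIdx < 0)                   // clamp to the register class base
    return {0, regIdx};
  return {regIdx, 0};
}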
-void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
- unsigned Vec = MI.getOperand(2).getReg();
- int Off = MI.getOperand(4).getImm();
+ const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
+ int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
unsigned Reg;
- computeIndirectRegAndOffset(Vec, Reg, Off);
+ std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);
+
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ if (Idx->getReg() == AMDGPU::NoRegister) {
+ // Only had a constant offset, copy the register directly.
+ BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+ .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
+ MI.eraseFromParent();
+ return false;
+ }
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(Reg)
- .addReg(Vec, RegState::Implicit);
+ .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
+ .addReg(SrcVec->getReg(), RegState::Implicit);
- LoadM0(MI, MovRel, Off);
+ return loadM0(MI, MovRel, Offset);
}
-void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
- int Off = MI.getOperand(4).getImm();
- unsigned Val = MI.getOperand(5).getReg();
+ int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
unsigned Reg;
- computeIndirectRegAndOffset(Dst, Reg, Off);
+ const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+ std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);
- MachineInstr *MovRel =
- BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
- .addReg(Reg, RegState::Define)
- .addReg(Val)
- .addReg(Dst, RegState::Implicit);
+ MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ if (Idx->getReg() == AMDGPU::NoRegister) {
+ // Only had a constant offset, copy the register directly.
+ BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
+ .addOperand(*Val);
+ MI.eraseFromParent();
+ return false;
+ }
+
+ MachineInstr *MovRel =
+ BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
+ .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
+ .addReg(Dst, RegState::Implicit);
- LoadM0(MI, MovRel, Off);
+ return loadM0(MI, MovRel, Offset);
}
-bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
- TRI =
- static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
bool HaveKill = false;
- bool NeedWQM = false;
bool NeedFlat = false;
unsigned Depth = 0;
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
+ MachineFunction::iterator NextBB;
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; BI = NextBB) {
+ NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;
+
+ MachineBasicBlock *EmptyMBBAtEnd = nullptr;
MachineBasicBlock::iterator I, Next;
+ bool ExecModified = false;
+
for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
- if (TII->isWQM(MI) || TII->isDS(MI))
- NeedWQM = true;
// Flat uses m0 in case it needs to access LDS.
if (TII->isFLAT(MI))
NeedFlat = true;
+ if (I->modifiesRegister(AMDGPU::EXEC, TRI))
+ ExecModified = true;
+
switch (MI.getOpcode()) {
default: break;
case AMDGPU::SI_IF:
@@ -499,7 +725,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
break;
case AMDGPU::SI_ELSE:
- Else(MI);
+ Else(MI, ExecModified);
break;
case AMDGPU::SI_BREAK:
@@ -521,16 +747,20 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SI_END_CF:
if (--Depth == 0 && HaveKill) {
- SkipIfDead(MI);
HaveKill = false;
+ // TODO: Insert skip if exec is 0?
}
+
EndCf(MI);
break;
- case AMDGPU::SI_KILL:
- if (Depth == 0)
- SkipIfDead(MI);
- else
+ case AMDGPU::SI_KILL_TERMINATOR:
+ if (Depth == 0) {
+ if (skipIfDead(MI, *NextBB)) {
+ NextBB = std::next(BI);
+ BE = MF.end();
+ }
+ } else
HaveKill = true;
Kill(MI);
break;
@@ -544,7 +774,15 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SI_INDIRECT_SRC_V4:
case AMDGPU::SI_INDIRECT_SRC_V8:
case AMDGPU::SI_INDIRECT_SRC_V16:
- IndirectSrc(MI);
+ if (indirectSrc(MI)) {
+ // The block was split at this point. We can safely skip over the newly
+ // inserted middle block to the following block, which contains the rest
+ // of this block's instructions.
+ NextBB = std::next(BI);
+ BE = MF.end();
+ Next = MBB.end();
+ }
+
break;
case AMDGPU::SI_INDIRECT_DST_V1:
@@ -552,55 +790,46 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SI_INDIRECT_DST_V4:
case AMDGPU::SI_INDIRECT_DST_V8:
case AMDGPU::SI_INDIRECT_DST_V16:
- IndirectDst(MI);
+ if (indirectDst(MI)) {
+ // The block was split at this point. We can safely skip over the newly
+ // inserted middle block to the following block, which contains the rest
+ // of this block's instructions.
+ NextBB = std::next(BI);
+ BE = MF.end();
+ Next = MBB.end();
+ }
+
break;
+
+ case AMDGPU::SI_RETURN: {
+ assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+ // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
+ // because external bytecode will be appended at the end.
+ if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+ // SI_RETURN is not the last instruction. Add an empty block at
+ // the end and jump there.
+ if (!EmptyMBBAtEnd) {
+ EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), EmptyMBBAtEnd);
+ }
+
+ MBB.addSuccessor(EmptyMBBAtEnd);
+ BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(EmptyMBBAtEnd);
+ I->eraseFromParent();
+ }
+ break;
+ }
}
}
}
- if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
- MachineBasicBlock &MBB = MF.front();
- BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
- AMDGPU::EXEC).addReg(AMDGPU::EXEC);
- }
-
- // FIXME: This seems inappropriate to do here.
if (NeedFlat && MFI->IsKernel) {
- // Insert the prologue initializing the SGPRs pointing to the scratch space
- // for flat accesses.
- const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
-
// TODO: What to use with function calls?
-
- // FIXME: This is reporting stack size that is used in a scratch buffer
- // rather than registers as well.
- uint64_t StackSizeBytes = FrameInfo->getStackSize();
-
- int IndirectBegin
- = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
- // Convert register index to 256-byte unit.
- uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
-
- assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
- "Stack limits should be smaller than 16-bits");
-
- // Initialize the flat scratch register pair.
- // TODO: Can we use one s_mov_b64 here?
-
- // Offset is in units of 256-bytes.
- MachineBasicBlock &MBB = MF.front();
- DebugLoc NoDL;
- MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
- const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);
-
- assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));
-
- BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
- .addImm(StackOffset);
-
- // Documentation says size is "per-thread scratch size in bytes"
- BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
- .addImm(StackSizeBytes);
+ // We will need to initialize the flat scratch register pair.
+ MFI->setHasFlatInstructions(true);
}
return true;
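A note on the traversal in runOnMachineFunction above: skipIfDead, indirectSrc, and indirectDst may split the current block, so the loop re-derives NextBB, BE, and Next after any split rather than trusting iterators captured beforehand. A condensed sketch of that pattern, where lowerAndMaybeSplit is a hypothetical stand-in for those handlers:

#include "llvm/CodeGen/MachineFunction.h"
#include <iterator>

// Hypothetical handler: lowers one instruction and returns true if it split
// the containing block (stands in for skipIfDead/indirectSrc/indirectDst).
bool lowerAndMaybeSplit(llvm::MachineInstr &MI);

// Hedged sketch of the iterator-refresh pattern: never keep iterators cached
// from before a possible block split.
void walkWithSplits(llvm::MachineFunction &MF) {
  llvm::MachineFunction::iterator NextBB;
  for (auto BI = MF.begin(), BE = MF.end(); BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    for (auto I = BI->begin(), Next = I; I != BI->end(); I = Next) {
      Next = std::next(I);
      if (lowerAndMaybeSplit(*I)) {
        NextBB = std::next(BI); // the block list changed under us
        BE = MF.end();          // the cached end iterator may be stale
        break;                  // the rest of this block moved away
      }
    }
  }
}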
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index a2fa5fd93aad..dc1d20ddb274 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -18,7 +18,6 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -47,8 +46,6 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -56,11 +53,8 @@ public:
} // End anonymous namespace.
-INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE,
- "SI Lower i1 Copies", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE,
- "SI Lower i1 Copies", false, false)
+INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE,
+ "SI Lower i1 Copies", false, false)
char SILowerI1Copies::ID = 0;
@@ -72,9 +66,10 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+
std::vector<unsigned> I1Defs;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 49677fc2b0a3..4d12a1ef9a93 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -1,19 +1,17 @@
-//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===//
+//===-- SIMachineFunctionInfo.cpp -------- SI Machine Function Info -------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
-/// \file
//===----------------------------------------------------------------------===//
-
#include "SIMachineFunctionInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
@@ -22,6 +20,11 @@
using namespace llvm;
+static cl::opt<bool> EnableSpillSGPRToVGPR(
+ "amdgpu-spill-sgpr-to-vgpr",
+ cl::desc("Enable spilling SGPRs to VGPRs"),
+ cl::ReallyHidden,
+ cl::init(true));
// Pin the vtable to this file.
void SIMachineFunctionInfo::anchor() {}
@@ -48,12 +51,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
PSInputAddr(0),
ReturnsVoid(true),
+ MaximumWorkGroupSize(0),
+ DebuggerReservedVGPRCount(0),
+ DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
+ DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
LDSWaveSpillSize(0),
PSInputEna(0),
NumUserSGPRs(0),
NumSystemSGPRs(0),
HasSpilledSGPRs(false),
HasSpilledVGPRs(false),
+ HasNonSpillStackObjects(false),
+ HasFlatInstructions(false),
+ NumSpilledSGPRs(0),
+ NumSpilledVGPRs(0),
PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
@@ -63,37 +74,45 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
GridWorkgroupCountX(false),
GridWorkgroupCountY(false),
GridWorkgroupCountZ(false),
- WorkGroupIDX(true),
+ WorkGroupIDX(false),
WorkGroupIDY(false),
WorkGroupIDZ(false),
WorkGroupInfo(false),
PrivateSegmentWaveByteOffset(false),
- WorkItemIDX(true),
+ WorkItemIDX(false),
WorkItemIDY(false),
WorkItemIDZ(false) {
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const Function *F = MF.getFunction();
PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
- if (getShaderType() == ShaderType::COMPUTE)
+ if (!AMDGPU::isShader(F->getCallingConv())) {
KernargSegmentPtr = true;
+ WorkGroupIDX = true;
+ WorkItemIDX = true;
+ }
- if (F->hasFnAttribute("amdgpu-work-group-id-y"))
+ if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue())
WorkGroupIDY = true;
- if (F->hasFnAttribute("amdgpu-work-group-id-z"))
+ if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue())
WorkGroupIDZ = true;
- if (F->hasFnAttribute("amdgpu-work-item-id-y"))
+ if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue())
WorkItemIDY = true;
- if (F->hasFnAttribute("amdgpu-work-item-id-z"))
+ if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue())
WorkItemIDZ = true;
- bool MaySpill = ST.isVGPRSpillingEnabled(this);
+ // X, XY, and XYZ are the only supported combinations, so make sure Y is
+ // enabled if Z is.
+ if (WorkItemIDZ)
+ WorkItemIDY = true;
+
+ bool MaySpill = ST.isVGPRSpillingEnabled(*F);
bool HasStackObjects = FrameInfo->hasStackObjects();
if (HasStackObjects || MaySpill)
@@ -105,12 +124,25 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
DispatchPtr = true;
+
+ if (F->hasFnAttribute("amdgpu-queue-ptr"))
+ QueuePtr = true;
}
- // X, XY, and XYZ are the only supported combinations, so make sure Y is
- // enabled if Z is.
- if (WorkItemIDZ)
- WorkItemIDY = true;
+ // We don't need to worry about accessing spills with flat instructions.
+ // TODO: On VI where we must use flat for global, we should be able to omit
+ // this if it is never used for generic access.
+ if (HasStackObjects && ST.getGeneration() >= SISubtarget::SEA_ISLANDS &&
+ ST.isAmdHsaOS())
+ FlatScratchInit = true;
+
+ if (AMDGPU::isCompute(F->getCallingConv()))
+ MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F);
+ else
+ MaximumWorkGroupSize = ST.getWavefrontSize();
+
+ if (ST.debuggerReserveRegs())
+ DebuggerReservedVGPRCount = 4;
}
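The constructor now derives the preloaded-value bits from explicit function attributes plus the calling convention, instead of assuming compute defaults. Illustrative only, with the attribute strings taken from the checks above, this is roughly how a front end would request the optional work-item IDs:

#include "llvm/IR/Function.h"

// Illustrative sketch: request the optional work-item IDs tested for above.
// Per the X/XY/XYZ rule in the constructor, requesting Z implies Y.
void requestWorkItemIDs(llvm::Function &F) {
  F.addFnAttr("amdgpu-work-item-id-y");
  F.addFnAttr("amdgpu-work-item-id-z");
}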
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -142,13 +174,24 @@ unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI)
return KernargSegmentPtrUserSGPR;
}
-SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
+unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
+ FlatScratchInitUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ NumUserSGPRs += 2;
+ return FlatScratchInitUserSGPR;
+}
+
+SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
MachineFunction *MF,
unsigned FrameIndex,
unsigned SubIdx) {
- const MachineFrameInfo *FrameInfo = MF->getFrameInfo();
- const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
- MF->getSubtarget<AMDGPUSubtarget>().getRegisterInfo());
+ if (!EnableSpillSGPRToVGPR)
+ return SpilledReg();
+
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ MachineFrameInfo *FrameInfo = MF->getFrameInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
int64_t Offset = FrameInfo->getObjectOffset(FrameIndex);
Offset += SubIdx * 4;
@@ -157,19 +200,14 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
unsigned Lane = (Offset / 4) % 64;
struct SpilledReg Spill;
+ Spill.Lane = Lane;
if (!LaneVGPRs.count(LaneVGPRIdx)) {
unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
- if (LaneVGPR == AMDGPU::NoRegister) {
- LLVMContext &Ctx = MF->getFunction()->getContext();
- Ctx.emitError("Ran out of VGPRs for spilling SGPR");
-
- // When compiling from inside Mesa, the compilation continues.
- // Select an arbitrary register to avoid triggering assertions
- // during subsequent passes.
- LaneVGPR = AMDGPU::VGPR0;
- }
+ if (LaneVGPR == AMDGPU::NoRegister)
+ // We have no VGPRs left for spilling SGPRs.
+ return Spill;
LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
@@ -182,14 +220,10 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
}
Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
- Spill.Lane = Lane;
return Spill;
}
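The offset arithmetic in getSpilledReg packs one 32-bit value per VGPR lane, with 64 lanes (one per wavefront thread) per lane-VGPR. The same mapping as a standalone sketch, assuming byte offsets into the spill area:

// Hedged sketch of the slot-to-lane mapping above: each 4-byte slot takes
// one lane, and every 64 slots roll over to the next lane VGPR.
struct LaneAddress {
  unsigned LaneVGPRIdx; // key into the LaneVGPRs map
  unsigned Lane;        // 0..63, one lane per wavefront thread
};

static LaneAddress mapSpillOffsetToLane(unsigned ByteOffset) {
  unsigned Slot = ByteOffset / 4; // 32-bit spill slots
  return {Slot / 64, Slot % 64};  // wave64: 64 lanes per VGPR
}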
unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
const MachineFunction &MF) const {
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
- // FIXME: We should get this information from kernel attributes if it
- // is available.
- return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize();
+ return MaximumWorkGroupSize;
}
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 846ee5de057d..f5bd6366c717 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -11,12 +11,12 @@
//
//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H
-#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
#include "AMDGPUMachineFunction.h"
#include "SIRegisterInfo.h"
+#include <array>
#include <map>
namespace llvm {
@@ -25,7 +25,7 @@ class MachineRegisterInfo;
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
-class SIMachineFunctionInfo : public AMDGPUMachineFunction {
+class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// FIXME: This should be removed and getPreloadedValue moved here.
friend struct SIRegisterInfo;
void anchor() override;
@@ -61,6 +61,15 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction {
unsigned PSInputAddr;
bool ReturnsVoid;
+ unsigned MaximumWorkGroupSize;
+
+ // Number of reserved VGPRs for debugger usage.
+ unsigned DebuggerReservedVGPRCount;
+ // Stack object indices for work group IDs.
+ std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices;
+ // Stack object indices for work item IDs.
+ std::array<int, 3> DebuggerWorkItemIDStackObjectIndices;
+
public:
// FIXME: Make private
unsigned LDSWaveSpillSize;
@@ -73,6 +82,11 @@ public:
private:
bool HasSpilledSGPRs;
bool HasSpilledVGPRs;
+ bool HasNonSpillStackObjects;
+ bool HasFlatInstructions;
+
+ unsigned NumSpilledSGPRs;
+ unsigned NumSpilledVGPRs;
// Feature bits required for inputs passed in user SGPRs.
bool PrivateSegmentBuffer : 1;
@@ -96,7 +110,6 @@ private:
bool WorkItemIDY : 1;
bool WorkItemIDZ : 1;
-
MCPhysReg getNextUserSGPR() const {
assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
return AMDGPU::SGPR0 + NumUserSGPRs;
@@ -111,8 +124,9 @@ public:
unsigned VGPR;
int Lane;
SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { }
- SpilledReg() : VGPR(0), Lane(-1) { }
+ SpilledReg() : VGPR(AMDGPU::NoRegister), Lane(-1) { }
bool hasLane() { return Lane != -1;}
+ bool hasReg() { return VGPR != AMDGPU::NoRegister;}
};
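Since the default constructor now yields AMDGPU::NoRegister, callers can use hasReg() to tell a successful lane assignment from the out-of-VGPRs case, as the eliminateFrameIndex changes later in this patch do. A sketch of the expected caller pattern, assuming the target-private headers are in scope:

// Hedged sketch mirroring the SGPR spill handling in SIRegisterInfo below.
void spillOneSubReg(llvm::SIMachineFunctionInfo &MFI,
                    llvm::MachineFunction *MF,
                    unsigned FrameIndex, unsigned SubIdx) {
  auto Spill = MFI.getSpilledReg(MF, FrameIndex, SubIdx);
  if (Spill.hasReg()) {
    // Fast path: V_WRITELANE into Spill.VGPR at lane Spill.Lane.
  } else {
    // No lane VGPR available: fall back to a real scratch stack slot.
  }
}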
// SIMachineFunctionInfo definition
@@ -129,6 +143,7 @@ public:
unsigned addDispatchPtr(const SIRegisterInfo &TRI);
unsigned addQueuePtr(const SIRegisterInfo &TRI);
unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
+ unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
// Add system SGPRs.
unsigned addWorkGroupIDX() {
@@ -161,6 +176,10 @@ public:
return PrivateSegmentWaveByteOffsetSystemSGPR;
}
+ void setPrivateSegmentWaveByteOffset(unsigned Reg) {
+ PrivateSegmentWaveByteOffsetSystemSGPR = Reg;
+ }
+
bool hasPrivateSegmentBuffer() const {
return PrivateSegmentBuffer;
}
@@ -261,6 +280,10 @@ public:
ScratchWaveOffsetReg = Reg;
}
+ unsigned getQueuePtrUserSGPR() const {
+ return QueuePtrUserSGPR;
+ }
+
bool hasSpilledSGPRs() const {
return HasSpilledSGPRs;
}
@@ -277,6 +300,38 @@ public:
HasSpilledVGPRs = Spill;
}
+ bool hasNonSpillStackObjects() const {
+ return HasNonSpillStackObjects;
+ }
+
+ void setHasNonSpillStackObjects(bool StackObject = true) {
+ HasNonSpillStackObjects = StackObject;
+ }
+
+ bool hasFlatInstructions() const {
+ return HasFlatInstructions;
+ }
+
+ void setHasFlatInstructions(bool UseFlat = true) {
+ HasFlatInstructions = UseFlat;
+ }
+
+ unsigned getNumSpilledSGPRs() const {
+ return NumSpilledSGPRs;
+ }
+
+ unsigned getNumSpilledVGPRs() const {
+ return NumSpilledVGPRs;
+ }
+
+ void addToSpilledSGPRs(unsigned num) {
+ NumSpilledSGPRs += num;
+ }
+
+ void addToSpilledVGPRs(unsigned num) {
+ NumSpilledVGPRs += num;
+ }
+
unsigned getPSInputAddr() const {
return PSInputAddr;
}
@@ -297,10 +352,70 @@ public:
ReturnsVoid = Value;
}
+ /// \returns Number of reserved VGPRs for debugger usage.
+ unsigned getDebuggerReservedVGPRCount() const {
+ return DebuggerReservedVGPRCount;
+ }
+
+ /// \returns Stack object index for \p Dim's work group ID.
+ int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const {
+ assert(Dim < 3);
+ return DebuggerWorkGroupIDStackObjectIndices[Dim];
+ }
+
+ /// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
+ void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
+ assert(Dim < 3);
+ DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx;
+ }
+
+ /// \returns Stack object index for \p Dim's work item ID.
+ int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const {
+ assert(Dim < 3);
+ return DebuggerWorkItemIDStackObjectIndices[Dim];
+ }
+
+ /// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
+ void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
+ assert(Dim < 3);
+ DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx;
+ }
+
+ /// \returns SGPR used for \p Dim's work group ID.
+ unsigned getWorkGroupIDSGPR(unsigned Dim) const {
+ switch (Dim) {
+ case 0:
+ assert(hasWorkGroupIDX());
+ return WorkGroupIDXSystemSGPR;
+ case 1:
+ assert(hasWorkGroupIDY());
+ return WorkGroupIDYSystemSGPR;
+ case 2:
+ assert(hasWorkGroupIDZ());
+ return WorkGroupIDZSystemSGPR;
+ }
+ llvm_unreachable("unexpected dimension");
+ }
+
+ /// \returns VGPR used for \p Dim's work item ID.
+ unsigned getWorkItemIDVGPR(unsigned Dim) const {
+ switch (Dim) {
+ case 0:
+ assert(hasWorkItemIDX());
+ return AMDGPU::VGPR0;
+ case 1:
+ assert(hasWorkItemIDY());
+ return AMDGPU::VGPR1;
+ case 2:
+ assert(hasWorkItemIDZ());
+ return AMDGPU::VGPR2;
+ }
+ llvm_unreachable("unexpected dimension");
+ }
+
unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
};
} // End namespace llvm
-
#endif
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 1cfa98430020..7125b411c603 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDGPU.h"
#include "SIMachineScheduler.h"
-#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -295,7 +295,7 @@ static bool isDefBetween(unsigned Reg,
const MachineInstr* MI = &*UI;
if (MI->isDebugValue())
continue;
- SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot();
+ SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
if (InstSlot >= First && InstSlot <= Last)
return true;
}
@@ -327,9 +327,9 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
// Do not track physical registers, because they mess up the tracking.
- for (unsigned Reg : RPTracker.getPressure().LiveInRegs) {
- if (TargetRegisterInfo::isVirtualRegister(Reg))
- LiveInRegs.insert(Reg);
+ for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
+ if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
+ LiveInRegs.insert(RegMaskPair.RegUnit);
}
LiveOutRegs.clear();
// There are several possibilities to distinguish:
@@ -354,11 +354,12 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
// The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
// Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
// The use of findDefBetween removes the case 4.
- for (unsigned Reg : RPTracker.getPressure().LiveOutRegs) {
+ for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
+ unsigned Reg = RegMaskPair.RegUnit;
if (TargetRegisterInfo::isVirtualRegister(Reg) &&
- isDefBetween(Reg, LIS->getInstructionIndex(BeginBlock).getRegSlot(),
- LIS->getInstructionIndex(EndBlock).getRegSlot(),
- MRI, LIS)) {
+ isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
+ LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
+ LIS)) {
LiveOutRegs.insert(Reg);
}
}
@@ -463,6 +464,9 @@ void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
for (SDep& Succ : SU->Succs) {
SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU->NodeNum >= DAG->SUnits.size())
+ continue;
+
if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
continue;
@@ -521,12 +525,9 @@ void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
}
Preds.push_back(Pred);
-#ifndef NDEBUG
- for (SIScheduleBlock* S : Succs) {
- if (PredID == S->getID())
- assert(!"Loop in the Block Graph!\n");
- }
-#endif
+ assert(none_of(Succs,
+ [=](SIScheduleBlock *S) { return PredID == S->getID(); }) &&
+ "Loop in the Block Graph!");
}
void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
@@ -540,12 +541,9 @@ void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
if (Succ->isHighLatencyBlock())
++NumHighLatencySuccessors;
Succs.push_back(Succ);
-#ifndef NDEBUG
- for (SIScheduleBlock* P : Preds) {
- if (SuccID == P->getID())
- assert("Loop in the Block Graph!\n");
- }
-#endif
+ assert(none_of(Preds,
+ [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
+ "Loop in the Block Graph!");
}
#ifndef NDEBUG
@@ -712,8 +710,8 @@ void SIScheduleBlockCreator::colorComputeReservedDependencies() {
// Traverse TopDown, and give different colors to SUs depending
// on which combination of High Latencies they depend on.
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->TopDownIndex2SU[i]];
+ for (unsigned SUNum : DAG->TopDownIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
std::set<unsigned> SUColors;
// Already given.
@@ -754,8 +752,8 @@ void SIScheduleBlockCreator::colorComputeReservedDependencies() {
// Same as before, but BottomUp.
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
std::set<unsigned> SUColors;
// Already given.
@@ -826,8 +824,8 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
unsigned DAGSize = DAG->SUnits.size();
std::vector<int> PendingColoring = CurrentColoring;
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
std::set<unsigned> SUColors;
std::set<unsigned> SUColorsPending;
@@ -893,8 +891,8 @@ void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
unsigned DAGSize = DAG->SUnits.size();
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
std::set<unsigned> SUColors;
if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
@@ -919,8 +917,8 @@ void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
unsigned DAGSize = DAG->SUnits.size();
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
std::set<unsigned> SUColors;
if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
@@ -940,8 +938,8 @@ void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
unsigned DAGSize = DAG->SUnits.size();
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
std::set<unsigned> SUColors;
if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
@@ -962,8 +960,8 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
unsigned DAGSize = DAG->SUnits.size();
std::map<unsigned, unsigned> ColorCount;
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
unsigned color = CurrentColoring[SU->NodeNum];
std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
if (Pos != ColorCount.end()) {
@@ -973,8 +971,8 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
}
}
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
unsigned color = CurrentColoring[SU->NodeNum];
std::set<unsigned> SUColors;
@@ -1006,8 +1004,8 @@ void SIScheduleBlockCreator::regroupNoUserInstructions() {
unsigned DAGSize = DAG->SUnits.size();
int GroupID = NextNonReservedID++;
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
bool hasSuccessor = false;
if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
@@ -1223,7 +1221,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() {
// is the most CPU-intensive operation of the scheduler.
// It would gain a lot if there were a way to recompute the
// LiveIntervals for the entire scheduling region.
- DAG->getLIS()->handleMove(MI, /*UpdateFlags=*/true);
+ DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
PosNew.push_back(CurrentTopFastSched);
}
}
@@ -1249,7 +1247,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() {
DAG->getBB()->splice(POld, DAG->getBB(), PNew);
// Update LiveIntervals.
- DAG->getLIS()->handleMove(POld, /*UpdateFlags=*/true);
+ DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
}
}
@@ -1675,70 +1673,10 @@ ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) {
// Does a topological sort over the SUs.
// Both TopDown and BottomUp
void SIScheduleDAGMI::topologicalSort() {
- std::vector<int> TopDownSU2Index;
- unsigned DAGSize = SUnits.size();
- std::vector<SUnit*> WorkList;
-
- DEBUG(dbgs() << "Topological Sort\n");
- WorkList.reserve(DAGSize);
-
- TopDownIndex2SU.resize(DAGSize);
- TopDownSU2Index.resize(DAGSize);
- BottomUpIndex2SU.resize(DAGSize);
-
- WorkList.push_back(&getExitSU());
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &SUnits[i];
- int NodeNum = SU->NodeNum;
- unsigned Degree = SU->Succs.size();
- TopDownSU2Index[NodeNum] = Degree;
- if (Degree == 0) {
- assert(SU->Succs.empty() && "SUnit should have no successors");
- WorkList.push_back(SU);
- }
- }
-
- int Id = DAGSize;
- while (!WorkList.empty()) {
- SUnit *SU = WorkList.back();
- WorkList.pop_back();
- if (SU->NodeNum < DAGSize) {
- TopDownSU2Index[SU->NodeNum] = --Id;
- TopDownIndex2SU[Id] = SU->NodeNum;
- }
- for (SDep& Pred : SU->Preds) {
- SUnit *SU = Pred.getSUnit();
- if (SU->NodeNum < DAGSize && !--TopDownSU2Index[SU->NodeNum])
- WorkList.push_back(SU);
- }
- }
-
- BottomUpIndex2SU = std::vector<int>(TopDownIndex2SU.rbegin(),
- TopDownIndex2SU.rend());
+ Topo.InitDAGTopologicalSorting();
-#ifndef NDEBUG
- // Check correctness of the ordering
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &SUnits[i];
- for (SDep& Pred : SU->Preds) {
- if (Pred.getSUnit()->NodeNum >= DAGSize)
- continue;
- assert(TopDownSU2Index[SU->NodeNum] >
- TopDownSU2Index[Pred.getSUnit()->NodeNum] &&
- "Wrong Top Down topological sorting");
- }
- }
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &SUnits[i];
- for (SDep& Succ : SU->Succs) {
- if (Succ.getSUnit()->NodeNum >= DAGSize)
- continue;
- assert(TopDownSU2Index[SU->NodeNum] <
- TopDownSU2Index[Succ.getSUnit()->NodeNum] &&
- "Wrong Bottom Up topological sorting");
- }
- }
-#endif
+ TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
+ BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}
// Move low latencies further from their user without
@@ -1759,7 +1697,7 @@ void SIScheduleDAGMI::moveLowLatencies() {
for (SDep& PredDep : SU->Preds) {
SUnit *Pred = PredDep.getSUnit();
- if (SITII->isLowLatencyInstruction(Pred->getInstr())) {
+ if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
IsLowLatencyUser = true;
}
if (Pred->NodeNum >= DAGSize)
@@ -1769,7 +1707,7 @@ void SIScheduleDAGMI::moveLowLatencies() {
MinPos = PredPos + 1;
}
- if (SITII->isLowLatencyInstruction(SU->getInstr())) {
+ if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
unsigned BestPos = LastLowLatencyUser + 1;
if ((int)BestPos <= LastLowLatencyPos)
BestPos = LastLowLatencyPos + 1;
@@ -1794,7 +1732,7 @@ void SIScheduleDAGMI::moveLowLatencies() {
bool CopyForLowLat = false;
for (SDep& SuccDep : SU->Succs) {
SUnit *Succ = SuccDep.getSUnit();
- if (SITII->isLowLatencyInstruction(Succ->getInstr())) {
+ if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
CopyForLowLat = true;
}
}
@@ -1855,7 +1793,6 @@ void SIScheduleDAGMI::schedule()
SU.dumpAll(this)
);
- Topo.InitDAGTopologicalSorting();
topologicalSort();
findRootsAndBiasEdges(TopRoots, BotRoots);
// We reuse several ScheduleDAGMI and ScheduleDAGMILive
@@ -1878,20 +1815,21 @@ void SIScheduleDAGMI::schedule()
for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
SUnit *SU = &SUnits[i];
- unsigned BaseLatReg, OffLatReg;
- if (SITII->isLowLatencyInstruction(SU->getInstr())) {
+ unsigned BaseLatReg;
+ int64_t OffLatReg;
+ if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
IsLowLatencySU[i] = 1;
- if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg,
- OffLatReg, TRI))
+ if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
+ TRI))
LowLatencyOffset[i] = OffLatReg;
- } else if (SITII->isHighLatencyInstruction(SU->getInstr()))
+ } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
IsHighLatencySU[i] = 1;
}
SIScheduler Scheduler(this);
Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
-#if 0 // To enable when handleMove fix lands
+
// if VGPR usage is extremely high, try other good performing variants
// which could lead to lower VGPR usage
if (Best.MaxVGPRUsage > 180) {
@@ -1930,7 +1868,7 @@ void SIScheduleDAGMI::schedule()
Best = Temp;
}
}
-#endif
+
ScheduledSUnits = Best.SUs;
ScheduledSUnitsInv.resize(SUnits.size());
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h
index b270136811c6..117aed497cc2 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -418,7 +418,7 @@ public:
SISchedulerBlockSchedulerVariant ScheduleVariant);
};
-class SIScheduleDAGMI : public ScheduleDAGMILive {
+class SIScheduleDAGMI final : public ScheduleDAGMILive {
const SIInstrInfo *SITII;
const SIRegisterInfo *SITRI;
@@ -441,7 +441,7 @@ public:
// To init Block's RPTracker.
void initRPTracker(RegPressureTracker &RPTracker) {
- RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
+ RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
}
MachineBasicBlock *getBB() { return BB; }
@@ -460,8 +460,10 @@ public:
unsigned &VgprUsage,
unsigned &SgprUsage);
std::set<unsigned> getInRegs() {
- std::set<unsigned> InRegs (RPTracker.getPressure().LiveInRegs.begin(),
- RPTracker.getPressure().LiveInRegs.end());
+ std::set<unsigned> InRegs;
+ for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
+ InRegs.insert(RegMaskPair.RegUnit);
+ }
return InRegs;
};
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 025ed2b5b76b..0dd88ee45c58 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -15,6 +15,7 @@
#include "SIRegisterInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
@@ -23,7 +24,75 @@
using namespace llvm;
-SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {
+static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) {
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ unsigned SIMDPerCU = 4;
+
+ unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize();
+ return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) /
+ MaxInvocationsPerWave;
+}
+
+static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
+
+ unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment;
+ unsigned ReservedSGPRCount;
+
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ TotalSGPRCountPerSIMD = 800;
+ AddressableSGPRCount = 102;
+ SGPRUsageAlignment = 16;
+ ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK
+ } else {
+ TotalSGPRCountPerSIMD = 512;
+ AddressableSGPRCount = 104;
+ SGPRUsageAlignment = 8;
+ ReservedSGPRCount = 2; // VCC
+ }
+
+ unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD);
+ MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment);
+
+ if (ST.hasSGPRInitBug())
+ MaxSGPRCount = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+
+ return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount);
+}
+
+static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) {
+ unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
+ unsigned TotalVGPRCountPerSIMD = 256;
+ unsigned VGPRUsageAlignment = 4;
+
+ return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD,
+ VGPRUsageAlignment);
+}
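For concreteness, a worked instance of the budget arithmetic above with SI-class constants (512 SGPRs per SIMD, usage alignment 8, 2 reserved for VCC, 104 addressable): a 1024-item work group on a wave64 target needs alignTo(1024, 4*64)/256 = 4 waves per SIMD, so each wave gets min(alignDown(512/4, 8) - 2, 104) = 104 SGPRs; at 2048 items it drops to 62. The same computation as a sketch, assuming WorkGroupSize >= 1:

#include <algorithm>

// Hedged sketch reproducing the SGPR budget math above with SI-class
// constants; WorkGroupSize is in work items on a wave64 target.
static unsigned sgprBudgetSketch(unsigned WorkGroupSize) {
  const unsigned SIMDPerCU = 4, WavefrontSize = 64;
  const unsigned MaxInvocationsPerWave = SIMDPerCU * WavefrontSize; // 256
  unsigned WavesPerSIMD =
      (WorkGroupSize + MaxInvocationsPerWave - 1) / MaxInvocationsPerWave;
  unsigned MaxSGPRs = (512 / WavesPerSIMD) & ~7u; // alignDown(x, 8)
  return std::min(MaxSGPRs - 2 /*VCC*/, 104u /*addressable*/);
}
// sgprBudgetSketch(1024) == 104, sgprBudgetSketch(2048) == 62.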
+
+static bool hasPressureSet(const int *PSets, unsigned PSetID) {
+ for (unsigned i = 0; PSets[i] != -1; ++i) {
+ if (PSets[i] == (int)PSetID)
+ return true;
+ }
+ return false;
+}
+
+void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
+ BitVector &PressureSets) const {
+ for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
+ const int *PSets = getRegUnitPressureSets(*U);
+ if (hasPressureSet(PSets, PSetID)) {
+ PressureSets.set(PSetID);
+ break;
+ }
+ }
+}
+
+SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(),
+ SGPRPressureSets(getNumRegPressureSets()),
+ VGPRPressureSets(getNumRegPressureSets()) {
unsigned NumRegPressureSets = getNumRegPressureSets();
SGPR32SetID = NumRegPressureSets;
@@ -33,6 +102,9 @@ SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {
SGPR32SetID = i;
else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0)
VGPR32SetID = i;
+
+ classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
+ classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
}
assert(SGPR32SetID < NumRegPressureSets &&
VGPR32SetID < NumRegPressureSets);
@@ -47,38 +119,27 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
- if (ST.hasSGPRInitBug()) {
- // Leave space for flat_scr, xnack_mask, vcc, and alignment
- unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4;
- unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
- return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
- }
-
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and
- // 100/101 for vcc. This is the next sgpr128 down.
- return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
- }
-
- return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
+ unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4;
+ unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
- if (ST.hasSGPRInitBug()) {
- unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1;
- return AMDGPU::SGPR_32RegClass.getRegister(Idx);
- }
-
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // Next register before reservations for flat_scr, xnack_mask, vcc,
- // and scratch resource.
- return AMDGPU::SGPR91;
+ unsigned RegCount = getMaxWorkGroupSGPRCount(MF);
+ unsigned Reg;
+
+ // Try to place it in a hole after PrivateSegmentBufferReg.
+ if (RegCount & 3) {
+ // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
+ // alignment constraints, so we have a hole where we can put the wave offset.
+ Reg = RegCount - 1;
+ } else {
+ // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
+ // wave offset before it.
+ Reg = RegCount - 5;
}
-
- return AMDGPU::SGPR95;
+ return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -90,35 +151,30 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::EXEC);
reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
- // Reserve the last 2 registers so we will always have at least 2 more that
- // will physically contain VCC.
- reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103);
-
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
- // for VCC/XNACK_MASK/FLAT_SCR.
- //
- // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose
- // SGPRs when the XNACK feature is not used. This is currently not done
- // because the code that counts SGPRs cannot account for such holes.
- reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97);
- reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
- reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
+ // Reserve Trap Handler registers - support is not implemented in Codegen.
+ reserveRegisterTuples(Reserved, AMDGPU::TBA);
+ reserveRegisterTuples(Reserved, AMDGPU::TMA);
+ reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
+ reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
+ reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
+ reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
+ reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
+ reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
+
+ unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF);
+ unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF);
+
+ unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+ unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+ for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) {
+ unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
}
- // Tonga and Iceland can only allocate a fixed number of SGPRs due
- // to a hw bug.
- if (ST.hasSGPRInitBug()) {
- unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
- // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs).
- unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6;
- for (unsigned i = Limit; i < NumSGPRs; ++i) {
- unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
- reserveRegisterTuples(Reserved, Reg);
- }
+ for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) {
+ unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
}
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -138,48 +194,182 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
}
+ // Reserve registers for debugger usage if "amdgpu-debugger-reserve-regs"
+ // attribute was specified.
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ if (ST.debuggerReserveRegs()) {
+ unsigned ReservedVGPRFirst =
+ MaxWorkGroupVGPRCount - MFI->getDebuggerReservedVGPRCount();
+ for (unsigned i = ReservedVGPRFirst; i < MaxWorkGroupVGPRCount; ++i) {
+ unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
+ }
+ }
+
return Reserved;
}
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const {
- const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &STI = MF.getSubtarget<SISubtarget>();
// FIXME: We should adjust the max number of waves based on LDS size.
- unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(),
- STI.getMaxWavesPerCU());
+ unsigned SGPRLimit = getNumSGPRsAllowed(STI, STI.getMaxWavesPerCU());
unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU());
unsigned VSLimit = SGPRLimit + VGPRLimit;
- for (regclass_iterator I = regclass_begin(), E = regclass_end();
- I != E; ++I) {
- const TargetRegisterClass *RC = *I;
+ if (SGPRPressureSets.test(Idx) && VGPRPressureSets.test(Idx)) {
+ // FIXME: This is a hack. We should never be considering the pressure of
+ // these since no virtual register should ever have this class.
+ return VSLimit;
+ }
- unsigned NumSubRegs = std::max((int)RC->getSize() / 4, 1);
- unsigned Limit;
+ if (SGPRPressureSets.test(Idx))
+ return SGPRLimit;
- if (isPseudoRegClass(RC)) {
- // FIXME: This is a hack. We should never be considering the pressure of
- // these since no virtual register should ever have this class.
- Limit = VSLimit;
- } else if (isSGPRClass(RC)) {
- Limit = SGPRLimit / NumSubRegs;
- } else {
- Limit = VGPRLimit / NumSubRegs;
- }
+ return VGPRLimit;
+}
+
+bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
+ return Fn.getFrameInfo()->hasStackObjects();
+}
+
+bool
+SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
+ return MF.getFrameInfo()->hasStackObjects();
+}
+
+bool SIRegisterInfo::requiresVirtualBaseRegisters(
+ const MachineFunction &) const {
+ // There are no special dedicated stack or frame pointers.
+ return true;
+}
+
+bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ // This helps catch bugs as verifier errors.
+ return true;
+}
+
+int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
+ int Idx) const {
+ if (!SIInstrInfo::isMUBUF(*MI))
+ return 0;
+
+ assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::vaddr) &&
+ "Should never see frame index on non-address operand");
+
+ int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::offset);
+ return MI->getOperand(OffIdx).getImm();
+}
+
+bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
+ return MI->mayLoadOrStore();
+}
- const int *Sets = getRegClassPressureSets(RC);
- assert(Sets);
- for (unsigned i = 0; Sets[i] != -1; ++i) {
- if (Sets[i] == (int)Idx)
- return Limit;
+void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ unsigned BaseReg,
+ int FrameIdx,
+ int64_t Offset) const {
+ MachineBasicBlock::iterator Ins = MBB->begin();
+ DebugLoc DL; // Defaults to "unknown"
+
+ if (Ins != MBB->end())
+ DL = Ins->getDebugLoc();
+
+ MachineFunction *MF = MBB->getParent();
+ const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = Subtarget.getInstrInfo();
+
+ if (Offset == 0) {
+ BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
+ .addFrameIndex(FrameIdx);
+ return;
+ }
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addImm(Offset);
+ BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg)
+ .addReg(UnusedCarry, RegState::Define | RegState::Dead)
+ .addReg(OffsetReg, RegState::Kill)
+ .addFrameIndex(FrameIdx);
+}
+
+void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const {
+
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
+ const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = Subtarget.getInstrInfo();
+
+#ifndef NDEBUG
+ // FIXME: Is it possible to be storing a frame index to itself?
+ bool SeenFI = false;
+ for (const MachineOperand &MO: MI.operands()) {
+ if (MO.isFI()) {
+ if (SeenFI)
+ llvm_unreachable("should not see multiple frame indices");
+
+ SeenFI = true;
}
}
- return 256;
+#endif
+
+ MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+ assert(FIOp && FIOp->isFI() && "frame index must be address operand");
+
+ assert(TII->isMUBUF(MI));
+
+ MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
+ int64_t NewOffset = OffsetOp->getImm() + Offset;
+ if (isUInt<12>(NewOffset)) {
+ // If we have a legal offset, fold it directly into the instruction.
+ FIOp->ChangeToRegister(BaseReg, false);
+ OffsetOp->setImm(NewOffset);
+ return;
+ }
+
+ // The offset is not legal, so we must insert an add of the offset.
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned NewReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ DebugLoc DL = MI.getDebugLoc();
+
+ assert(Offset != 0 && "Non-zero offset expected");
+
+ unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ // If the instruction already had an immediate offset, only the requested
+ // new offset is added here, because the original immediate is left in
+ // place.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), NewReg)
+ .addReg(UnusedCarry, RegState::Define | RegState::Dead)
+ .addReg(OffsetReg, RegState::Kill)
+ .addReg(BaseReg);
+
+ FIOp->ChangeToRegister(NewReg, false);
}
-bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
- return Fn.getFrameInfo()->hasStackObjects();
+bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+ unsigned BaseReg,
+ int64_t Offset) const {
+ return SIInstrInfo::isMUBUF(*MI) && isUInt<12>(Offset);
+}
+
+const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
+ const MachineFunction &MF, unsigned Kind) const {
+ // This is inaccurate. It depends on the instruction and address space. The
+ // only place where we should hit this is for dealing with frame indexes /
+ // private accesses, so this is correct in that case.
+ return &AMDGPU::VGPR_32RegClass;
}
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
@@ -219,32 +409,48 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
- unsigned Value,
+ const MachineOperand *SrcDst,
unsigned ScratchRsrcReg,
unsigned ScratchOffset,
int64_t Offset,
RegScavenger *RS) const {
+ unsigned Value = SrcDst->getReg();
+ bool IsKill = SrcDst->isKill();
MachineBasicBlock *MBB = MI->getParent();
- const MachineFunction *MF = MI->getParent()->getParent();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo());
- LLVMContext &Ctx = MF->getFunction()->getContext();
+ MachineFunction *MF = MI->getParent()->getParent();
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
DebugLoc DL = MI->getDebugLoc();
- bool IsLoad = TII->get(LoadStoreOp).mayLoad();
+ bool IsStore = MI->mayStore();
bool RanOutOfSGPRs = false;
bool Scavenged = false;
unsigned SOffset = ScratchOffset;
+ unsigned OriginalImmOffset = Offset;
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
unsigned Size = NumSubRegs * 4;
if (!isUInt<12>(Offset + Size)) {
- SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
+ SOffset = AMDGPU::NoRegister;
+
+ // We don't have access to the register scavenger if this function is called
+ // during PEI::scavengeFrameVirtualRegs().
+ if (RS)
+ SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);
+
if (SOffset == AMDGPU::NoRegister) {
+ // There are no free SGPRs, and we are in the process of spilling VGPRs
+ // too. Since we need a VGPR in order to spill SGPRs (this is true on
+ // SI/CI, and on VI it remains true until we implement spilling using
+ // scalar stores), we have no way to free up an SGPR. Our solution here
+ // is to add the offset directly to the ScratchOffset register, and then
+ // subtract the offset after the spill to return ScratchOffset to its
+ // original value.
RanOutOfSGPRs = true;
- SOffset = AMDGPU::SGPR0;
+ SOffset = ScratchOffset;
} else {
Scavenged = true;
}
@@ -254,40 +460,48 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
Offset = 0;
}
- if (RanOutOfSGPRs)
- Ctx.emitError("Ran out of SGPRs for spilling VGPRS");
-
for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
unsigned SubReg = NumSubRegs > 1 ?
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
Value;
unsigned SOffsetRegState = 0;
- if (i + 1 == e && Scavenged)
- SOffsetRegState |= RegState::Kill;
+ unsigned SrcDstRegState = getDefRegState(!IsStore);
+ if (i + 1 == e) {
+ SOffsetRegState |= getKillRegState(Scavenged);
+ // The last implicit use carries the "Kill" flag.
+ SrcDstRegState |= getKillRegState(IsKill);
+ }
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
- .addReg(SubReg, getDefRegState(IsLoad))
+ .addReg(SubReg, getDefRegState(!IsStore))
.addReg(ScratchRsrcReg)
.addReg(SOffset, SOffsetRegState)
.addImm(Offset)
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // tfe
- .addReg(Value, RegState::Implicit | getDefRegState(IsLoad))
+ .addReg(Value, RegState::Implicit | SrcDstRegState)
.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
}
+ if (RanOutOfSGPRs) {
+ // Subtract the offset we added to the ScratchOffset register.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset)
+ .addReg(ScratchOffset)
+ .addImm(OriginalImmOffset);
+ }
}
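When nothing can be scavenged, the code above biases the live ScratchOffset register itself and undoes the bias once the accesses are emitted. The two bookend instructions reduce to the sketch below, assuming the offset fits the scalar ALU immediate:

#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

// Hedged sketch of the no-free-SGPR fallback bookends in
// buildScratchLoadStore: add the offset into ScratchOffset before the
// accesses (Restore = false), subtract it again afterwards (Restore = true).
static void biasScratchOffset(llvm::MachineBasicBlock &MBB,
                              llvm::MachineBasicBlock::iterator MI,
                              const llvm::DebugLoc &DL,
                              const llvm::SIInstrInfo *TII,
                              unsigned ScratchOffset, int64_t Offset,
                              bool Restore) {
  unsigned Opc = Restore ? llvm::AMDGPU::S_SUB_U32 : llvm::AMDGPU::S_ADD_U32;
  llvm::BuildMI(MBB, MI, DL, TII->get(Opc), ScratchOffset)
      .addReg(ScratchOffset)
      .addImm(Offset);
}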
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
MachineFunction *MF = MI->getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo *FrameInfo = MF->getFrameInfo();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo());
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
MachineOperand &FIOp = MI->getOperand(FIOperandNum);
@@ -301,24 +515,65 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE: {
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned SuperReg = MI->getOperand(0).getReg();
+ bool IsKill = MI->getOperand(0).isKill();
+ // SubReg carries the "Kill" flag when SubReg == SuperReg.
+ unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
+ unsigned SubReg = getPhysRegSubReg(SuperReg,
&AMDGPU::SGPR_32RegClass, i);
+
struct SIMachineFunctionInfo::SpilledReg Spill =
MFI->getSpilledReg(MF, Index, i);
- BuildMI(*MBB, MI, DL,
- TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
- Spill.VGPR)
- .addReg(SubReg)
- .addImm(Spill.Lane);
-
- // FIXME: Since this spills to another register instead of an actual
- // frame index, we should delete the frame index when all references to
- // it are fixed.
+ if (Spill.hasReg()) {
+ BuildMI(*MBB, MI, DL,
+ TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+ Spill.VGPR)
+ .addReg(SubReg, getKillRegState(IsKill))
+ .addImm(Spill.Lane);
+
+ // FIXME: Since this spills to another register instead of an actual
+ // frame index, we should delete the frame index when all references to
+ // it are fixed.
+ } else {
+ // Spill SGPR to a frame index.
+ // FIXME: We should use S_STORE_DWORD here for VI.
+ MachineInstrBuilder Mov
+ = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ .addReg(SubReg, SubKillState);
+
+ // There could be undef components of a spilled super register.
+ // TODO: Can we detect this and skip the spill?
+ if (NumSubRegs > 1) {
+ // The last implicit use of the SuperReg carries the "Kill" flag.
+ unsigned SuperKillState = 0;
+ if (i + 1 == e)
+ SuperKillState |= getKillRegState(IsKill);
+ Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
+ }
+
+ unsigned Size = FrameInfo->getObjectSize(Index);
+ unsigned Align = FrameInfo->getObjectAlignment(Index);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ Size, Align);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
+ .addReg(TmpReg, RegState::Kill) // src
+ .addFrameIndex(Index) // frame_idx
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(i * 4) // offset
+ .addMemOperand(MMO);
+ }
}
MI->eraseFromParent();
+ MFI->addToSpilledSGPRs(NumSubRegs);
break;
}
@@ -329,6 +584,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE: {
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
@@ -336,28 +592,37 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
struct SIMachineFunctionInfo::SpilledReg Spill =
MFI->getSpilledReg(MF, Index, i);
- BuildMI(*MBB, MI, DL,
- TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
- SubReg)
- .addReg(Spill.VGPR)
- .addImm(Spill.Lane)
- .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
- }
-
- // TODO: only do this when it is needed
- switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) {
- case AMDGPUSubtarget::SOUTHERN_ISLANDS:
- // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states
- // ("S_NOP 3") on SI
- TII->insertWaitStates(MI, 4);
- break;
- case AMDGPUSubtarget::SEA_ISLANDS:
- break;
- default: // VOLCANIC_ISLANDS and later
- // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states
- // ("S_NOP 4") on VI and later. This also applies to VALUs which write
- // VCC, but we're unlikely to see VMEM use VCC.
- TII->insertWaitStates(MI, 5);
+ if (Spill.hasReg()) {
+ BuildMI(*MBB, MI, DL,
+ TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ SubReg)
+ .addReg(Spill.VGPR)
+ .addImm(Spill.Lane)
+ .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+ } else {
+ // Restore SGPR from a stack slot.
+ // FIXME: We should use S_LOAD_DWORD here for VI.
+
+ unsigned Align = FrameInfo->getObjectAlignment(Index);
+ unsigned Size = FrameInfo->getObjectSize(Index);
+
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index);
+
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad, Size, Align);
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
+ .addFrameIndex(Index) // frame_idx
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(i * 4) // offset
+ .addMemOperand(MMO);
+ BuildMI(*MBB, MI, DL,
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
+ .addReg(TmpReg, RegState::Kill)
+ .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+ }
}
MI->eraseFromParent();
@@ -372,11 +637,13 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V64_SAVE:
case AMDGPU::SI_SPILL_V32_SAVE:
buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
- TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::src),
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
- FrameInfo->getObjectOffset(Index), RS);
+ FrameInfo->getObjectOffset(Index) +
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
+ // Record the spill count before the pseudo instruction is erased.
+ MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
MI->eraseFromParent();
break;
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_V64_RESTORE:
@@ -385,10 +652,11 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_V512_RESTORE: {
buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
- TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::dst),
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
- FrameInfo->getObjectOffset(Index), RS);
+ FrameInfo->getObjectOffset(Index) +
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
MI->eraseFromParent();
break;
}
@@ -396,8 +664,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
default: {
int64_t Offset = FrameInfo->getObjectOffset(Index);
FIOp.ChangeToImmediate(Offset);
- if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
- unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj);
+ if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
.addImm(Offset);
@@ -407,10 +675,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
}
-unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
- return getEncodingValue(Reg) & 0xff;
-}
-
// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
@@ -427,7 +691,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
&AMDGPU::VReg_256RegClass,
&AMDGPU::SReg_256RegClass,
&AMDGPU::VReg_512RegClass,
- &AMDGPU::SReg_512RegClass
+ &AMDGPU::SReg_512RegClass,
+ &AMDGPU::SCC_CLASSRegClass,
};
for (const TargetRegisterClass *BaseClass : BaseClasses) {
@@ -442,6 +707,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
switch (RC->getSize()) {
+ case 0: return false;
+ case 1: return false;
case 4:
return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
case 8:
@@ -479,6 +746,24 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
}
}
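+// For example, this maps VGPR_32 (size 4) to SGPR_32 and VReg_64 (size 8)
+// to SReg_64.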
+const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
+ const TargetRegisterClass *VRC) const {
+ switch (VRC->getSize()) {
+ case 4:
+ return &AMDGPU::SGPR_32RegClass;
+ case 8:
+ return &AMDGPU::SReg_64RegClass;
+ case 16:
+ return &AMDGPU::SReg_128RegClass;
+ case 32:
+ return &AMDGPU::SReg_256RegClass;
+ case 64:
+ return &AMDGPU::SReg_512RegClass;
+ default:
+ llvm_unreachable("Invalid register class size");
+ }
+}
+
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
const TargetRegisterClass *RC, unsigned SubIdx) const {
if (SubIdx == AMDGPU::NoSubRegister)
@@ -552,7 +837,21 @@ unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
switch(Channel) {
case 0: return AMDGPU::VCC_LO;
case 1: return AMDGPU::VCC_HI;
- default: llvm_unreachable("Invalid SubIdx for VCC");
+ default: llvm_unreachable("Invalid SubIdx for VCC"); break;
+ }
+
+ case AMDGPU::TBA:
+ switch(Channel) {
+ case 0: return AMDGPU::TBA_LO;
+ case 1: return AMDGPU::TBA_HI;
+ default: llvm_unreachable("Invalid SubIdx for TBA"); break;
+ }
+
+ case AMDGPU::TMA:
+ switch(Channel) {
+ case 0: return AMDGPU::TMA_LO;
+ case 1: return AMDGPU::TMA_HI;
+ default: llvm_unreachable("Invalid SubIdx for TMA"); break;
}
case AMDGPU::FLAT_SCR:
@@ -610,7 +909,7 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
(void)ST;
switch (Value) {
case SIRegisterInfo::WORKGROUP_ID_X:
@@ -631,11 +930,17 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
case SIRegisterInfo::KERNARG_SEGMENT_PTR:
assert(MFI->hasKernargSegmentPtr());
return MFI->KernargSegmentPtrUserSGPR;
+ case SIRegisterInfo::DISPATCH_ID:
+ llvm_unreachable("unimplemented");
+ case SIRegisterInfo::FLAT_SCRATCH_INIT:
+ assert(MFI->hasFlatScratchInit());
+ return MFI->FlatScratchInitUserSGPR;
case SIRegisterInfo::DISPATCH_PTR:
assert(MFI->hasDispatchPtr());
return MFI->DispatchPtrUserSGPR;
case SIRegisterInfo::QUEUE_PTR:
- llvm_unreachable("not implemented");
+ assert(MFI->hasQueuePtr());
+ return MFI->QueuePtrUserSGPR;
case SIRegisterInfo::WORKITEM_ID_X:
assert(MFI->hasWorkItemIDX());
return AMDGPU::VGPR0;
@@ -675,9 +980,9 @@ unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
}
}
-unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen,
+unsigned SIRegisterInfo::getNumSGPRsAllowed(const SISubtarget &ST,
unsigned WaveCount) const {
- if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
switch (WaveCount) {
case 10: return 80;
case 9: return 80;
@@ -696,3 +1001,14 @@ unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen,
}
}
}
+
+bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
+ unsigned Reg) const {
+ const TargetRegisterClass *RC;
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ RC = MRI.getRegClass(Reg);
+ else
+ RC = getPhysRegClass(Reg);
+
+ return hasVGPRs(RC);
+}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 9410e2049cba..6e97b1b910a9 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -12,23 +12,27 @@
//
//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
-#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
#include "AMDGPURegisterInfo.h"
-#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
namespace llvm {
-struct SIRegisterInfo : public AMDGPURegisterInfo {
+class SISubtarget;
+class MachineRegisterInfo;
+
+struct SIRegisterInfo final : public AMDGPURegisterInfo {
private:
unsigned SGPR32SetID;
unsigned VGPR32SetID;
+ BitVector SGPRPressureSets;
+ BitVector VGPRPressureSets;
void reserveRegisterTuples(BitVector &, unsigned Reg) const;
+ void classifyPressureSet(unsigned PSetID, unsigned Reg,
+ BitVector &PressureSets) const;
public:
SIRegisterInfo();
@@ -47,13 +51,39 @@ public:
unsigned getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const override;
+
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
+
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+ bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override;
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+ int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
+ int Idx) const override;
+
+ bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ unsigned BaseReg, int FrameIdx,
+ int64_t Offset) const override;
+
+ void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const override;
+
+ bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+ int64_t Offset) const override;
+
+ const TargetRegisterClass *getPointerRegClass(
+ const MachineFunction &MF, unsigned Kind = 0) const override;
+
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const override;
- unsigned getHWRegIndex(unsigned Reg) const override;
+ unsigned getHWRegIndex(unsigned Reg) const {
+ return getEncodingValue(Reg) & 0xff;
+ }
/// \brief Return the 'base' register class for this register.
/// e.g. SGPR0 => SReg_32, VGPR0 => VGPR_32, SGPR0_SGPR1 => SReg_64, etc.
@@ -70,9 +100,12 @@ public:
}
bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
+ const TargetRegisterClass *RC;
if (TargetRegisterInfo::isVirtualRegister(Reg))
- return isSGPRClass(MRI.getRegClass(Reg));
- return getPhysRegClass(Reg);
+ RC = MRI.getRegClass(Reg);
+ else
+ RC = getPhysRegClass(Reg);
+ return isSGPRClass(RC);
}
/// \returns true if this class contains VGPR registers.
@@ -89,6 +122,10 @@ public:
const TargetRegisterClass *getEquivalentVGPRClass(
const TargetRegisterClass *SRC) const;
+ /// \returns A SGPR reg class with the same width as \p SRC
+ const TargetRegisterClass *getEquivalentSGPRClass(
+ const TargetRegisterClass *VRC) const;
+
/// \returns The register class that is used for a sub-register of \p RC for
/// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will
/// be returned.
@@ -117,10 +154,12 @@ public:
enum PreloadedValue {
// SGPRS:
- PRIVATE_SEGMENT_BUFFER = 0,
+ PRIVATE_SEGMENT_BUFFER = 0,
DISPATCH_PTR = 1,
QUEUE_PTR = 2,
KERNARG_SEGMENT_PTR = 3,
+ DISPATCH_ID = 4,
+ FLAT_SCRATCH_INIT = 5,
WORKGROUP_ID_X = 10,
WORKGROUP_ID_Y = 11,
WORKGROUP_ID_Z = 12,
@@ -143,8 +182,7 @@ public:
/// \brief Give the maximum number of SGPRs that can be used by \p WaveCount
/// concurrent waves.
- unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen,
- unsigned WaveCount) const;
+ unsigned getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const;
unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC) const;
@@ -152,11 +190,14 @@ public:
unsigned getSGPR32PressureSet() const { return SGPR32SetID; }
unsigned getVGPR32PressureSet() const { return VGPR32SetID; }
+ bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
+
private:
void buildScratchLoadStore(MachineBasicBlock::iterator MI,
- unsigned LoadStoreOp, unsigned Value,
+ unsigned LoadStoreOp, const MachineOperand *SrcDst,
unsigned ScratchRsrcReg, unsigned ScratchOffset,
- int64_t Offset, RegScavenger *RS) const;
+ int64_t Offset,
+ RegScavenger *RS) const;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index bfaf93709d8c..c427874d467a 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -44,6 +44,40 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>,
def SCC : SIReg<"scc", 253>;
def M0 : SIReg <"m0", 124>;
+// Trap handler registers
+def TBA_LO : SIReg<"tba_lo", 108>;
+def TBA_HI : SIReg<"tba_hi", 109>;
+
+def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
+ DwarfRegAlias<TBA_LO> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 108;
+}
+
+def TMA_LO : SIReg<"tma_lo", 110>;
+def TMA_HI : SIReg<"tma_hi", 111>;
+
+def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
+ DwarfRegAlias<TMA_LO> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 110;
+}
+
+def TTMP0 : SIReg <"ttmp0", 112>;
+def TTMP1 : SIReg <"ttmp1", 113>;
+def TTMP2 : SIReg <"ttmp2", 114>;
+def TTMP3 : SIReg <"ttmp3", 115>;
+def TTMP4 : SIReg <"ttmp4", 116>;
+def TTMP5 : SIReg <"ttmp5", 117>;
+def TTMP6 : SIReg <"ttmp6", 118>;
+def TTMP7 : SIReg <"ttmp7", 119>;
+def TTMP8 : SIReg <"ttmp8", 120>;
+def TTMP9 : SIReg <"ttmp9", 121>;
+def TTMP10 : SIReg <"ttmp10", 122>;
+def TTMP11 : SIReg <"ttmp11", 123>;
+
multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
def _ci : SIReg<n, ci_e>;
def _vi : SIReg<n, vi_e>;
@@ -81,11 +115,18 @@ foreach Index = 0-255 in {
// Groupings using register classes and tuples
//===----------------------------------------------------------------------===//
+def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> {
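+ // A negative CopyCost marks SCC as prohibitively expensive to copy, and
+ // isAllocatable = 0 keeps the allocator from assigning it.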
+ let CopyCost = -1;
+ let isAllocatable = 0;
+}
+
// TODO: Do we need to set DwarfRegAlias on register tuples?
// SGPR 32-bit registers
def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
- (add (sequence "SGPR%u", 0, 103))>;
+ (add (sequence "SGPR%u", 0, 103))> {
+ let AllocationPriority = 1;
+}
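+// Note: classes with a higher AllocationPriority are allocated first; the
+// wider tuple classes below get larger values, presumably so that big tuples
+// are placed before the register file fragments.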
// SGPR 64-bit registers
def SGPR_64Regs : RegisterTuples<[sub0, sub1],
@@ -93,7 +134,7 @@ def SGPR_64Regs : RegisterTuples<[sub0, sub1],
(add (decimate (shl SGPR_32, 1), 2))]>;
// SGPR 128-bit registers
-def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
+def SGPR_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
[(add (decimate SGPR_32, 4)),
(add (decimate (shl SGPR_32, 1), 4)),
(add (decimate (shl SGPR_32, 2), 4)),
@@ -130,9 +171,29 @@ def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
(add (decimate (shl SGPR_32, 14), 4)),
(add (decimate (shl SGPR_32, 15), 4))]>;
+// Trap handler TMP 32-bit registers
+def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
+ (add (sequence "TTMP%u", 0, 11))> {
+ let isAllocatable = 0;
+}
+
+// Trap handler TMP 64-bit registers
+def TTMP_64Regs : RegisterTuples<[sub0, sub1],
+ [(add (decimate TTMP_32, 2)),
+ (add (decimate (shl TTMP_32, 1), 2))]>;
+
+// Trap handler TMP 128-bit registers
+def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
+ [(add (decimate TTMP_32, 4)),
+ (add (decimate (shl TTMP_32, 1), 4)),
+ (add (decimate (shl TTMP_32, 2), 4)),
+ (add (decimate (shl TTMP_32, 3), 4))]>;
+
// VGPR 32-bit registers
def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
- (add (sequence "VGPR%u", 0, 255))>;
+ (add (sequence "VGPR%u", 0, 255))> {
+ let AllocationPriority = 1;
+}
// VGPR 64-bit registers
def VGPR_64 : RegisterTuples<[sub0, sub1],
@@ -192,36 +253,67 @@ class RegImmMatcher<string name> : AsmOperandClass {
let RenderMethod = "addRegOrImmOperands";
}
+// Subset of SReg_32 without M0, for SMRD instructions and the like.
+// See comments in SIInstructions.td for more info.
+def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32], 32,
+ (add SGPR_32, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
+ TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> {
+ let AllocationPriority = 1;
+}
+
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
- (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)
->;
+ (add SReg_32_XM0, M0)> {
+ let AllocationPriority = 1;
+}
+
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
+ let AllocationPriority = 2;
+}
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>;
+def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> {
+ let isAllocatable = 0;
+}
def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
- (add SGPR_64, VCC, EXEC, FLAT_SCR)
->;
+ (add SGPR_64, VCC, EXEC, FLAT_SCR, TTMP_64, TBA, TMA)> {
+ let AllocationPriority = 2;
+}
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> {
- // Requires 2 s_mov_b64 to copy
- let CopyCost = 2;
+// Requires 2 s_mov_b64 to copy
+let CopyCost = 2 in {
+
+def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> {
+ let AllocationPriority = 4;
+}
+
+def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> {
+ let isAllocatable = 0;
+}
+
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> {
+ let AllocationPriority = 4;
}
-def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add SGPR_256)> {
+} // End CopyCost = 2
+
+def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> {
// Requires 4 s_mov_b64 to copy
let CopyCost = 4;
+ let AllocationPriority = 5;
}
def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
// Requires 8 s_mov_b64 to copy
let CopyCost = 8;
+ let AllocationPriority = 6;
}
// Register class for all vector registers (VGPRs + Interpolation Registers)
def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
// Requires 2 v_mov_b32 to copy
let CopyCost = 2;
+ let AllocationPriority = 2;
}
def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
@@ -229,19 +321,23 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
// Requires 3 v_mov_b32 to copy
let CopyCost = 3;
+ let AllocationPriority = 3;
}
def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {
// Requires 4 v_mov_b32 to copy
let CopyCost = 4;
+ let AllocationPriority = 4;
}
-def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add VGPR_256)> {
+def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> {
let CopyCost = 8;
+ let AllocationPriority = 5;
}
def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
let CopyCost = 16;
+ let AllocationPriority = 6;
}
def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td
index cd77e519abb2..ed19217226b8 100644
--- a/lib/Target/AMDGPU/SISchedule.td
+++ b/lib/Target/AMDGPU/SISchedule.td
@@ -11,6 +11,12 @@
//
//===----------------------------------------------------------------------===//
+def : PredicateProlog<[{
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(SchedModel->getInstrInfo());
+ (void)TII;
+}]>;
+
def WriteBranch : SchedWrite;
def WriteExport : SchedWrite;
def WriteLDS : SchedWrite;
@@ -39,20 +45,33 @@ def Write64Bit : SchedWrite;
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
// instructions)
-def SIFullSpeedModel : SchedMachineModel;
-def SIQuarterSpeedModel : SchedMachineModel;
+class SISchedMachineModel : SchedMachineModel {
+ let CompleteModel = 0;
+ let IssueWidth = 1;
+ let PostRAScheduler = 1;
+}
-// BufferSize = 0 means the processors are in-order.
-let BufferSize = 0 in {
+def SIFullSpeedModel : SISchedMachineModel;
+def SIQuarterSpeedModel : SISchedMachineModel;
// XXX: Are the resource counts correct?
-def HWBranch : ProcResource<1>;
-def HWExport : ProcResource<7>; // Taken from S_WAITCNT
-def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT
-def HWSALU : ProcResource<1>;
-def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT
-def HWVALU : ProcResource<1>;
-
+def HWBranch : ProcResource<1> {
+ let BufferSize = 1;
+}
+def HWExport : ProcResource<1> {
+ let BufferSize = 7; // Taken from S_WAITCNT
+}
+def HWLGKM : ProcResource<1> {
+ let BufferSize = 31; // Taken from S_WAITCNT
+}
+def HWSALU : ProcResource<1> {
+ let BufferSize = 1;
+}
+def HWVMEM : ProcResource<1> {
+ let BufferSize = 15; // Taken from S_WAITCNT
+}
+def HWVALU : ProcResource<1> {
+ let BufferSize = 1;
}
class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
@@ -70,12 +89,12 @@ class HWVALUWriteRes<SchedWrite write, int latency> :
// The latency values are 1 / (operations / cycle) / 4.
multiclass SICommonWriteRes {
- def : HWWriteRes<WriteBranch, [HWBranch], 100>; // XXX: Guessed ???
- def : HWWriteRes<WriteExport, [HWExport], 100>; // XXX: Guessed ???
- def : HWWriteRes<WriteLDS, [HWLGKM], 32>; // 2 - 64
- def : HWWriteRes<WriteSALU, [HWSALU], 1>;
- def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ???
- def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600
+ def : HWWriteRes<WriteBranch, [HWBranch], 8>;
+ def : HWWriteRes<WriteExport, [HWExport], 4>;
+ def : HWWriteRes<WriteLDS, [HWLGKM], 5>; // Can be between 2 and 64
+ def : HWWriteRes<WriteSALU, [HWSALU], 1>;
+ def : HWWriteRes<WriteSMEM, [HWLGKM], 5>;
+ def : HWWriteRes<WriteVMEM, [HWVMEM], 80>;
def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???
def : HWVALUWriteRes<Write32Bit, 1>;
@@ -83,6 +102,12 @@ multiclass SICommonWriteRes {
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
}
+def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;
+def PredIsVGPR64Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) > 32}]>;
+def WriteCopy : SchedWriteVariant<[
+ SchedVar<PredIsVGPR32Copy, [Write32Bit]>,
+ SchedVar<PredIsVGPR64Copy, [Write64Bit]>,
+ SchedVar<NoSchedPred, [WriteSALU]>]>;
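+// For example, a copy between 32-bit VGPRs resolves to Write32Bit, a wider
+// VGPR copy to Write64Bit, and any non-VGPR copy falls back to WriteSALU.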
let SchedModel = SIFullSpeedModel in {
@@ -92,6 +117,8 @@ def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 4>;
def : HWVALUWriteRes<WriteDoubleAdd, 2>;
+def : InstRW<[WriteCopy], (instrs COPY)>;
+
} // End SchedModel = SIFullSpeedModel
let SchedModel = SIQuarterSpeedModel in {
@@ -102,4 +129,6 @@ def : HWVALUWriteRes<WriteFloatFMA, 16>;
def : HWVALUWriteRes<WriteDouble, 16>;
def : HWVALUWriteRes<WriteDoubleAdd, 8>;
+def : InstRW<[WriteCopy], (instrs COPY)>;
+
} // End SchedModel = SIQuarterSpeedModel
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 4f0913fe62f2..6cba55300a8c 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -31,10 +31,6 @@ STATISTIC(NumInstructionsShrunk,
STATISTIC(NumLiteralConstantsFolded,
"Number of literal constants folded into 32-bit instructions.");
-namespace llvm {
- void initializeSIShrinkInstructionsPass(PassRegistry&);
-}
-
using namespace llvm;
namespace {
@@ -61,10 +57,8 @@ public:
} // End anonymous namespace.
-INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE,
- "SI Lower il Copies", false, false)
-INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE,
- "SI Lower il Copies", false, false)
+INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
+ "SI Shrink Instructions", false, false)
char SIShrinkInstructions::ID = 0;
@@ -125,10 +119,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
return false;
- if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
- return false;
-
- return true;
+ return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
}
/// \brief This function checks \p MI for operands defined by a move immediate
@@ -181,31 +172,37 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
}
// We have failed to fold src0, so commute the instruction and try again.
- if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI))
+ if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
foldImmediates(MI, TII, MRI, false);
}
// Copy MachineOperand with all flags except setting it as implicit.
-static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) {
- assert(!Orig.isImplicit());
- return MachineOperand::CreateReg(Orig.getReg(),
- Orig.isDef(),
- true,
- Orig.isKill(),
- Orig.isDead(),
- Orig.isUndef(),
- Orig.isEarlyClobber(),
- Orig.getSubReg(),
- Orig.isDebug(),
- Orig.isInternalRead());
+static void copyFlagsToImplicitVCC(MachineInstr &MI,
+ const MachineOperand &Orig) {
+
+ for (MachineOperand &Use : MI.implicit_operands()) {
+ if (Use.getReg() == AMDGPU::VCC) {
+ Use.setIsUndef(Orig.isUndef());
+ Use.setIsKill(Orig.isKill());
+ return;
+ }
+ }
+}
+
+static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+ return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
}
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
std::vector<unsigned> I1Defs;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -217,14 +214,94 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
+ if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
+ // If this has a literal constant source that is the same as the
+ // reversed bits of an inline immediate, replace with a bitreverse of
+ // that constant. This saves 4 bytes in the common case of materializing
+ // sign bits.
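+      // For example (illustrative): materializing the sign mask
+      //   v_mov_b32 v0, 0x80000000    ; requires a 4-byte literal
+      // becomes
+      //   v_bfrev_b32 v0, 1           ; 1 is an inline immediate
+      // because reverseBits(0x80000000) == 1, which is in [-16, 64].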
+
+ // Test if we are after regalloc. We only want to do this after any
+ // optimizations happen because this will confuse them.
+ // XXX - not exactly a check for post-regalloc run.
+ MachineOperand &Src = MI.getOperand(1);
+ if (Src.isImm() &&
+ TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
+ int64_t Imm = Src.getImm();
+ if (isInt<32>(Imm) && !TII->isInlineConstant(Src, 4)) {
+ int32_t ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Imm));
+ if (ReverseImm >= -16 && ReverseImm <= 64) {
+ MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
+ Src.setImm(ReverseImm);
+ continue;
+ }
+ }
+ }
+ }
+
+ // Combine adjacent s_nops to use the immediate operand encoding how long
+ // to wait.
+ //
+ // s_nop N
+ // s_nop M
+ // =>
+ // s_nop (N + M)
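+      //
+      // For example, s_nop 1 (waits 2 cycles) followed by s_nop 2 (waits 3)
+      // combines into s_nop 4 (waits 5), since the total wait of 5 is <= 8.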
+ if (MI.getOpcode() == AMDGPU::S_NOP &&
+ Next != MBB.end() &&
+ (*Next).getOpcode() == AMDGPU::S_NOP) {
+
+ MachineInstr &NextMI = *Next;
+ // The instruction encodes the amount to wait with an offset of 1,
+ // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
+ // after adding.
+ uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
+ uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;
+
+ // Make sure we don't overflow the bounds.
+ if (Nop0 + Nop1 <= 8) {
+ NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
+ MI.eraseFromParent();
+ }
+
+ continue;
+ }
+
+ // FIXME: We also need to consider movs of constant operands since
+ // immediate operands are not folded if they have more than one use, and
+ // the operand folding pass is unaware if the immediate will be free since
+ // it won't know if the src == dest constraint will end up being
+ // satisfied.
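+      // A sketch of the tied-operand case handled below, where src0 already
+      // equals the destination:
+      //   s_add_i32 s0, s0, 0x1234    ; requires a 4-byte literal
+      // can shrink to
+      //   s_addk_i32 s0, 0x1234       ; 16-bit immediate encoded inline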
+ if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
+ MI.getOpcode() == AMDGPU::S_MUL_I32) {
+ const MachineOperand &Dest = MI.getOperand(0);
+ const MachineOperand &Src0 = MI.getOperand(1);
+ const MachineOperand &Src1 = MI.getOperand(2);
+
+ // FIXME: This could work better if hints worked with subregisters. If
+ // we have a vector add of a constant, we usually don't get the correct
+ // allocation due to the subregister usage.
+ if (TargetRegisterInfo::isVirtualRegister(Dest.getReg()) &&
+ Src0.isReg()) {
+ MRI.setRegAllocationHint(Dest.getReg(), 0, Src0.getReg());
+ continue;
+ }
+
+ if (Src0.isReg() && Src0.getReg() == Dest.getReg()) {
+ if (Src1.isImm() && isKImmOperand(TII, Src1)) {
+ unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
+ AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
+
+ MI.setDesc(TII->get(Opc));
+ MI.tieOperands(0, 1);
+ }
+ }
+ }
+
// Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
const MachineOperand &Src = MI.getOperand(1);
- if (Src.isImm()) {
- if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4))
- MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
- }
+ if (Src.isImm() && isKImmOperand(TII, Src))
+ MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
continue;
}
@@ -235,7 +312,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (!canShrink(MI, TII, TRI, MRI)) {
// Try commuting the instruction and see if that enables us to shrink
// it.
- if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
+ if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
!canShrink(MI, TII, TRI, MRI))
continue;
}
@@ -287,9 +364,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineInstrBuilder Inst32 =
BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
- // Add the dst operand if the 32-bit encoding also has an explicit $dst.
+ // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
// For VOPC instructions, this is replaced by an implicit def of vcc.
- int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst);
+ int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
if (Op32DstIdx != -1) {
// dst
Inst32.addOperand(MI.getOperand(0));
@@ -314,10 +391,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
Inst32.addOperand(*Src2);
} else {
// In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
- // replaced with an implicit read of vcc.
- assert(Src2->getReg() == AMDGPU::VCC &&
- "Unexpected missing register operand");
- Inst32.addOperand(copyRegOperandAsImplicit(*Src2));
+ // replaced with an implicit read of vcc. This was already added
+ // during the initial BuildMI, so find it to preserve the flags.
+ copyFlagsToImplicitVCC(*Inst32, *Src2);
}
}
diff --git a/lib/Target/AMDGPU/SITypeRewriter.cpp b/lib/Target/AMDGPU/SITypeRewriter.cpp
index d36c5d29b127..facc0c7df1dc 100644
--- a/lib/Target/AMDGPU/SITypeRewriter.cpp
+++ b/lib/Target/AMDGPU/SITypeRewriter.cpp
@@ -62,7 +62,7 @@ bool SITypeRewriter::doInitialization(Module &M) {
}
bool SITypeRewriter::runOnFunction(Function &F) {
- if (AMDGPU::getShaderType(F) == ShaderType::COMPUTE)
+ if (!AMDGPU::isShader(F.getCallingConv()))
return false;
visit(F);
diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
new file mode 100644
index 000000000000..c1a237ea5f51
--- /dev/null
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -0,0 +1,509 @@
+//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass adds instructions to enable whole quad mode for pixel
+/// shaders.
+///
+/// Whole quad mode is required for derivative computations, but it interferes
+/// with shader side effects (stores and atomics). This pass is run on the
+/// scheduled machine IR but before register coalescing, so that machine SSA is
+/// available for analysis. It ensures that WQM is enabled when necessary, but
+/// disabled around stores and atomics.
+///
+/// When necessary, this pass creates a function prolog
+///
+/// S_MOV_B64 LiveMask, EXEC
+/// S_WQM_B64 EXEC, EXEC
+///
+/// to enter WQM at the top of the function and surrounds blocks of Exact
+/// instructions by
+///
+/// S_AND_SAVEEXEC_B64 Tmp, LiveMask
+/// ...
+/// S_MOV_B64 EXEC, Tmp
+///
+/// In order to avoid excessive switching during sequences of Exact
+/// instructions, the pass first analyzes which instructions must be run in WQM
+/// (aka which instructions produce values that lead to derivative
+/// computations).
+///
+/// Basic blocks are always exited in WQM as long as some successor needs WQM.
+///
+/// There is room for improvement given better control flow analysis:
+///
+/// (1) at the top level (outside of control flow statements, and as long as
+/// kill hasn't been used), one SGPR can be saved by recovering WQM from
+/// the LiveMask (this is implemented for the entry block).
+///
+/// (2) when entire regions (e.g. if-else blocks or entire loops) only
+/// consist of exact and don't-care instructions, the switch only has to
+/// be done at the entry and exit points rather than potentially in each
+/// block of the region.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-wqm"
+
+namespace {
+
+enum {
+ StateWQM = 0x1,
+ StateExact = 0x2,
+};
+
+struct InstrInfo {
+ char Needs = 0;
+ char OutNeeds = 0;
+};
+
+struct BlockInfo {
+ char Needs = 0;
+ char InNeeds = 0;
+ char OutNeeds = 0;
+};
+
+struct WorkItem {
+ MachineBasicBlock *MBB = nullptr;
+ MachineInstr *MI = nullptr;
+
+ WorkItem() {}
+ WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
+ WorkItem(MachineInstr *MI) : MI(MI) {}
+};
+
+class SIWholeQuadMode : public MachineFunctionPass {
+private:
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+ DenseMap<const MachineInstr *, InstrInfo> Instructions;
+ DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
+ SmallVector<const MachineInstr *, 2> ExecExports;
+ SmallVector<MachineInstr *, 1> LiveMaskQueries;
+
+ char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
+ void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
+ void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
+ char analyzeFunction(MachineFunction &MF);
+
+ void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SaveWQM, unsigned LiveMaskReg);
+ void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SavedWQM);
+ void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
+
+ void lowerLiveMaskQueries(unsigned LiveMaskReg);
+
+public:
+ static char ID;
+
+ SIWholeQuadMode() :
+ MachineFunctionPass(ID) { }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "SI Whole Quad Mode";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace
+
+char SIWholeQuadMode::ID = 0;
+
+INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE,
+ "SI Whole Quad Mode", false, false)
+
+char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
+
+FunctionPass *llvm::createSIWholeQuadModePass() {
+ return new SIWholeQuadMode;
+}
+
+// Scan instructions to determine which ones require an Exact execmask and
+// which ones seed WQM requirements.
+char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
+ std::vector<WorkItem> &Worklist) {
+ char GlobalFlags = 0;
+ bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
+
+ for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
+ MachineBasicBlock &MBB = *BI;
+
+ for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
+ MachineInstr &MI = *II;
+ unsigned Opcode = MI.getOpcode();
+ char Flags = 0;
+
+ if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
+ Flags = StateWQM;
+ } else if (MI.mayStore() && TII->usesVM_CNT(MI)) {
+ Flags = StateExact;
+ } else {
+ // Handle export instructions with the exec mask valid flag set
+ if (Opcode == AMDGPU::EXP) {
+ if (MI.getOperand(4).getImm() != 0)
+ ExecExports.push_back(&MI);
+ } else if (Opcode == AMDGPU::SI_PS_LIVE) {
+ LiveMaskQueries.push_back(&MI);
+ } else if (WQMOutputs) {
+ // The function is in machine SSA form, which means that physical
+ // VGPRs correspond to shader inputs and outputs. Inputs are
+ // only used, outputs are only defined.
+ for (const MachineOperand &MO : MI.defs()) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+
+ if (!TRI->isVirtualRegister(Reg) &&
+ TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
+ Flags = StateWQM;
+ break;
+ }
+ }
+ }
+
+ if (!Flags)
+ continue;
+ }
+
+ Instructions[&MI].Needs = Flags;
+ Worklist.push_back(&MI);
+ GlobalFlags |= Flags;
+ }
+
+ if (WQMOutputs && MBB.succ_empty()) {
+ // This is a prolog shader. Make sure we go back to exact mode at the end.
+ Blocks[&MBB].OutNeeds = StateExact;
+ Worklist.push_back(&MBB);
+ GlobalFlags |= StateExact;
+ }
+ }
+
+ return GlobalFlags;
+}
+
+void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
+ std::vector<WorkItem>& Worklist) {
+ MachineBasicBlock *MBB = MI.getParent();
+ // Take a copy to prevent dangling references.
+ InstrInfo II = Instructions[&MI];
+ BlockInfo &BI = Blocks[MBB];
+
+ // Control flow-type instructions that are followed by WQM computations
+ // must themselves be in WQM.
+ if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) {
+ Instructions[&MI].Needs = StateWQM;
+ II.Needs = StateWQM;
+ }
+
+ // Propagate to block level
+ BI.Needs |= II.Needs;
+ if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
+ BI.InNeeds |= II.Needs;
+ Worklist.push_back(MBB);
+ }
+
+ // Propagate backwards within block
+ if (MachineInstr *PrevMI = MI.getPrevNode()) {
+ char InNeeds = II.Needs | II.OutNeeds;
+ if (!PrevMI->isPHI()) {
+ InstrInfo &PrevII = Instructions[PrevMI];
+ if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
+ PrevII.OutNeeds |= InNeeds;
+ Worklist.push_back(PrevMI);
+ }
+ }
+ }
+
+ // Propagate WQM flag to instruction inputs
+ assert(II.Needs != (StateWQM | StateExact));
+ if (II.Needs != StateWQM)
+ return;
+
+ for (const MachineOperand &Use : MI.uses()) {
+ if (!Use.isReg() || !Use.isUse())
+ continue;
+
+ // At this point, physical registers appear as inputs or outputs
+ // and following them makes no sense (and would in fact be incorrect
+ // when the same VGPR is used as both an output and an input that leads
+ // to a NeedsWQM instruction).
+ //
+    // Note: VCC appears e.g. in 64-bit addition with carry - theoretically
+    // we would have to trace this, but in practice it only happens for
+    // 64-bit computations like pointers, where both dwords are followed
+    // already anyway.
+ if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
+ continue;
+
+ for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) {
+ InstrInfo &DefII = Instructions[&DefMI];
+
+ // Obviously skip if DefMI is already flagged as NeedWQM.
+ //
+ // The instruction might also be flagged as NeedExact. This happens when
+ // the result of an atomic is used in a WQM computation. In this case,
+ // the atomic must not run for helper pixels and the WQM result is
+ // undefined.
+ if (DefII.Needs != 0)
+ continue;
+
+ DefII.Needs = StateWQM;
+ Worklist.push_back(&DefMI);
+ }
+ }
+}
+
+void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
+ std::vector<WorkItem>& Worklist) {
+ BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
+
+ // Propagate through instructions
+ if (!MBB.empty()) {
+ MachineInstr *LastMI = &*MBB.rbegin();
+ InstrInfo &LastII = Instructions[LastMI];
+ if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
+ LastII.OutNeeds |= BI.OutNeeds;
+ Worklist.push_back(LastMI);
+ }
+ }
+
+ // Predecessor blocks must provide for our WQM/Exact needs.
+ for (MachineBasicBlock *Pred : MBB.predecessors()) {
+ BlockInfo &PredBI = Blocks[Pred];
+ if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
+ continue;
+
+ PredBI.OutNeeds |= BI.InNeeds;
+ PredBI.InNeeds |= BI.InNeeds;
+ Worklist.push_back(Pred);
+ }
+
+ // All successors must be prepared to accept the same set of WQM/Exact data.
+ for (MachineBasicBlock *Succ : MBB.successors()) {
+ BlockInfo &SuccBI = Blocks[Succ];
+ if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
+ continue;
+
+ SuccBI.InNeeds |= BI.OutNeeds;
+ Worklist.push_back(Succ);
+ }
+}
+
+char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
+ std::vector<WorkItem> Worklist;
+ char GlobalFlags = scanInstructions(MF, Worklist);
+
+ while (!Worklist.empty()) {
+ WorkItem WI = Worklist.back();
+ Worklist.pop_back();
+
+ if (WI.MI)
+ propagateInstruction(*WI.MI, Worklist);
+ else
+ propagateBlock(*WI.MBB, Worklist);
+ }
+
+ return GlobalFlags;
+}
+
+void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ unsigned SaveWQM, unsigned LiveMaskReg) {
+ if (SaveWQM) {
+ BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+ SaveWQM)
+ .addReg(LiveMaskReg);
+ } else {
+ BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
+ AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(LiveMaskReg);
+ }
+}
+
+void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ unsigned SavedWQM) {
+ if (SavedWQM) {
+ BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+ .addReg(SavedWQM);
+ } else {
+ BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+ AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC);
+ }
+}
+
+void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
+ bool isEntry) {
+ auto BII = Blocks.find(&MBB);
+ if (BII == Blocks.end())
+ return;
+
+ const BlockInfo &BI = BII->second;
+
+ if (!(BI.InNeeds & StateWQM))
+ return;
+
+ // This is a non-entry block that is WQM throughout, so no need to do
+ // anything.
+ if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+ return;
+
+ unsigned SavedWQMReg = 0;
+ bool WQMFromExec = isEntry;
+ char State = isEntry ? StateExact : StateWQM;
+
+ auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+ while (II != IE) {
+ MachineInstr &MI = *II;
+ ++II;
+
+ // Skip instructions that are not affected by EXEC
+ if (TII->isScalarUnit(MI) && !MI.isTerminator())
+ continue;
+
+ // Generic instructions such as COPY will either disappear by register
+ // coalescing or be lowered to SALU or VALU instructions.
+ if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) {
+ if (MI.getNumExplicitOperands() >= 1) {
+ const MachineOperand &Op = MI.getOperand(0);
+ if (Op.isReg()) {
+ if (TRI->isSGPRReg(*MRI, Op.getReg())) {
+ // SGPR instructions are not affected by EXEC
+ continue;
+ }
+ }
+ }
+ }
+
+ char Needs = 0;
+ char OutNeeds = 0;
+ auto InstrInfoIt = Instructions.find(&MI);
+ if (InstrInfoIt != Instructions.end()) {
+ Needs = InstrInfoIt->second.Needs;
+ OutNeeds = InstrInfoIt->second.OutNeeds;
+
+ // Make sure to switch to Exact mode before the end of the block when
+ // Exact and only Exact is needed further downstream.
+ if (OutNeeds == StateExact && MI.isTerminator()) {
+ assert(Needs == 0);
+ Needs = StateExact;
+ }
+ }
+
+ // State switching
+ if (Needs && State != Needs) {
+ if (Needs == StateExact) {
+ assert(!SavedWQMReg);
+
+ if (!WQMFromExec && (OutNeeds & StateWQM))
+ SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
+ } else {
+ assert(WQMFromExec == (SavedWQMReg == 0));
+ toWQM(MBB, &MI, SavedWQMReg);
+ SavedWQMReg = 0;
+ }
+
+ State = Needs;
+ }
+ }
+
+ if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
+ assert(WQMFromExec == (SavedWQMReg == 0));
+ toWQM(MBB, MBB.end(), SavedWQMReg);
+ } else if (BI.OutNeeds == StateExact && State != StateExact) {
+ toExact(MBB, MBB.end(), 0, LiveMaskReg);
+ }
+}
+
+void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+ for (MachineInstr *MI : LiveMaskQueries) {
+ const DebugLoc &DL = MI->getDebugLoc();
+ unsigned Dest = MI->getOperand(0).getReg();
+ BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
+ .addReg(LiveMaskReg);
+ MI->eraseFromParent();
+ }
+}
+
+bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
+ return false;
+
+ Instructions.clear();
+ Blocks.clear();
+ ExecExports.clear();
+ LiveMaskQueries.clear();
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ char GlobalFlags = analyzeFunction(MF);
+ if (!(GlobalFlags & StateWQM)) {
+ lowerLiveMaskQueries(AMDGPU::EXEC);
+ return !LiveMaskQueries.empty();
+ }
+
+ // Store a copy of the original live mask when required
+ unsigned LiveMaskReg = 0;
+ {
+ MachineBasicBlock &Entry = MF.front();
+ MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
+
+ if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
+ LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+ .addReg(AMDGPU::EXEC);
+ }
+
+ if (GlobalFlags == StateWQM) {
+ // For a shader that needs only WQM, we can just set it once.
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+ AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC);
+
+ lowerLiveMaskQueries(LiveMaskReg);
+ // EntryMI may become invalid here
+ return true;
+ }
+ }
+
+ lowerLiveMaskQueries(LiveMaskReg);
+
+ // Handle the general case
+ for (auto BII : Blocks)
+ processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+
+ return true;
+}
diff --git a/lib/Target/AMDGPU/TargetInfo/Makefile b/lib/Target/AMDGPU/TargetInfo/Makefile
deleted file mode 100644
index 1b232871bd62..000000000000
--- a/lib/Target/AMDGPU/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/AMDGPU/TargetInfo/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAMDGPUInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
new file mode 100644
index 000000000000..b6868de6a74e
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -0,0 +1,69 @@
+//===-- AMDGPUAsmUtils.cpp - AsmParser/InstPrinter common -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUAsmUtils.h"
+
+namespace llvm {
+namespace AMDGPU {
+namespace SendMsg {
+
+// This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h.
+const char* const IdSymbolic[] = {
+ nullptr,
+ "MSG_INTERRUPT",
+ "MSG_GS",
+ "MSG_GS_DONE",
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ "MSG_SYSMSG"
+};
+
+// These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h.
+const char* const OpSysSymbolic[] = {
+ nullptr,
+ "SYSMSG_OP_ECC_ERR_INTERRUPT",
+ "SYSMSG_OP_REG_RD",
+ "SYSMSG_OP_HOST_TRAP_ACK",
+ "SYSMSG_OP_TTRACE_PC"
+};
+
+const char* const OpGsSymbolic[] = {
+ "GS_OP_NOP",
+ "GS_OP_CUT",
+ "GS_OP_EMIT",
+ "GS_OP_EMIT_CUT"
+};
+
+} // namespace SendMsg
+
+namespace Hwreg {
+
+// This must be in sync with llvm::AMDGPU::Hwreg::ID_SYMBOLIC_FIRST_/LAST_, see SIDefines.h.
+const char* const IdSymbolic[] = {
+ nullptr,
+ "HW_REG_MODE",
+ "HW_REG_STATUS",
+ "HW_REG_TRAPSTS",
+ "HW_REG_HW_ID",
+ "HW_REG_GPR_ALLOC",
+ "HW_REG_LDS_ALLOC",
+ "HW_REG_IB_STS"
+};
+
+} // namespace Hwreg
+} // namespace AMDGPU
+} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
new file mode 100644
index 000000000000..b2dc2c0e364c
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -0,0 +1,31 @@
+//===-- AMDGPUAsmUtils.h - AsmParser/InstPrinter common ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
+
+namespace llvm {
+namespace AMDGPU {
+namespace SendMsg { // Symbolic names for the sendmsg(...) syntax.
+
+extern const char* const IdSymbolic[];
+extern const char* const OpSysSymbolic[];
+extern const char* const OpGsSymbolic[];
+
+} // namespace SendMsg
+
+namespace Hwreg { // Symbolic names for the hwreg(...) syntax.
+
+extern const char* const IdSymbolic[];
+
+} // namespace Hwreg
+} // namespace AMDGPU
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 1f5deaef9d3b..c6f9142c0aa5 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -109,29 +109,45 @@ bool isReadOnlySegment(const GlobalValue *GV) {
return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
}
-static unsigned getIntegerAttribute(const Function &F, const char *Name,
- unsigned Default) {
+int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
Attribute A = F.getFnAttribute(Name);
- unsigned Result = Default;
+ int Result = Default;
if (A.isStringAttribute()) {
StringRef Str = A.getValueAsString();
if (Str.getAsInteger(0, Result)) {
LLVMContext &Ctx = F.getContext();
- Ctx.emitError("can't parse shader type");
+ Ctx.emitError("can't parse integer attribute " + Name);
}
}
+
return Result;
}
-unsigned getShaderType(const Function &F) {
- return getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE);
+unsigned getMaximumWorkGroupSize(const Function &F) {
+ return getIntegerAttribute(F, "amdgpu-max-work-group-size", 256);
}
unsigned getInitialPSInputAddr(const Function &F) {
return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}
+bool isShader(CallingConv::ID cc) {
+ switch(cc) {
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool isCompute(CallingConv::ID cc) {
+ return !isShader(cc) || cc == CallingConv::AMDGPU_CS;
+}
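+// For example, AMDGPU_CS is both a shader and compute, AMDGPU_PS is a shader
+// but not compute, and any non-shader calling convention (e.g. a kernel)
+// counts as compute.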
+
bool isSI(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
}
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 57cbe1b58f98..995a9041fb36 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -11,6 +11,7 @@
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
#include "AMDKernelCodeT.h"
+#include "llvm/IR/CallingConv.h"
namespace llvm {
@@ -44,9 +45,13 @@ bool isGroupSegment(const GlobalValue *GV);
bool isGlobalSegment(const GlobalValue *GV);
bool isReadOnlySegment(const GlobalValue *GV);
-unsigned getShaderType(const Function &F);
+int getIntegerAttribute(const Function &F, StringRef Name, int Default);
+
+unsigned getMaximumWorkGroupSize(const Function &F);
unsigned getInitialPSInputAddr(const Function &F);
+bool isShader(CallingConv::ID cc);
+bool isCompute(CallingConv::ID cc);
bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
new file mode 100644
index 000000000000..3a5ff60601d0
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -0,0 +1,165 @@
+//===--------------------- AMDKernelCodeTInfo.h ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file - specifies tables for amd_kernel_code_t structure parsing/printing
+//
+//===----------------------------------------------------------------------===//
+
+#define QNAME(name) amd_kernel_code_t::name
+#define FLD_T(name) decltype(QNAME(name)), &QNAME(name)
+
+#define FIELD2(sname, name) \
+ RECORD(sname, printField<FLD_T(name)>, parseField<FLD_T(name)>)
+
+#define FIELD(name) FIELD2(name, name)
+
+
+#define PRINTCODEPROP(name) \
+ printBitField<FLD_T(code_properties),\
+ AMD_CODE_PROPERTY_##name##_SHIFT,\
+ AMD_CODE_PROPERTY_##name##_WIDTH>
+
+#define PARSECODEPROP(name) \
+ parseBitField<FLD_T(code_properties),\
+ AMD_CODE_PROPERTY_##name##_SHIFT,\
+ AMD_CODE_PROPERTY_##name##_WIDTH>
+
+#define CODEPROP(name, shift) \
+ RECORD(name, PRINTCODEPROP(shift), PARSECODEPROP(shift))
+
+// We have to define these as lambdas because of the GetMacro/SetMacro
+// arguments.
+#define PRINTCOMP(GetMacro, Shift) \
+[](StringRef Name, const amd_kernel_code_t &C, raw_ostream &OS) { \
+ printName(OS, Name) << \
+ (int)GetMacro(C.compute_pgm_resource_registers >> Shift); \
+}
+#define PARSECOMP(SetMacro, Shift) \
+[](amd_kernel_code_t &C, MCAsmParser &MCParser, raw_ostream &Err) { \
+ int64_t Value = 0; \
+ if (!expectAbsExpression(MCParser, Value, Err)) \
+ return false; \
+ C.compute_pgm_resource_registers |= SetMacro(Value) << Shift; \
+ return true; \
+}
+
+#define COMPPGM(name, GetMacro, SetMacro, Shift) \
+ RECORD(name, PRINTCOMP(GetMacro, Shift), PARSECOMP(SetMacro, Shift))
+
+#define COMPPGM1(name, AccMacro) \
+ COMPPGM(compute_pgm_rsrc1_##name, \
+ G_00B848_##AccMacro, S_00B848_##AccMacro, 0)
+
+#define COMPPGM2(name, AccMacro) \
+ COMPPGM(compute_pgm_rsrc2_##name, \
+ G_00B84C_##AccMacro, S_00B84C_##AccMacro, 32)
+
+///////////////////////////////////////////////////////////////////////////////
+// Begin of the table
+// Define RECORD(name, print, parse) in your code to get field definitions
+// and include this file
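+//
+// For example (mirroring the use in AMDKernelCodeTUtils.cpp):
+//   #define RECORD(name, print, parse) #name
+//   #include "AMDKernelCodeTInfo.h"
+//   #undef RECORD
+// expands the table into a comma-separated list of field-name strings.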
+
+FIELD2(kernel_code_version_major, amd_kernel_code_version_major),
+FIELD2(kernel_code_version_minor, amd_kernel_code_version_minor),
+FIELD2(machine_kind, amd_machine_kind),
+FIELD2(machine_version_major, amd_machine_version_major),
+FIELD2(machine_version_minor, amd_machine_version_minor),
+FIELD2(machine_version_stepping, amd_machine_version_stepping),
+FIELD(kernel_code_entry_byte_offset),
+FIELD(kernel_code_prefetch_byte_size),
+FIELD(max_scratch_backing_memory_byte_size),
+FIELD(compute_pgm_resource_registers),
+FIELD(workitem_private_segment_byte_size),
+FIELD(workgroup_group_segment_byte_size),
+FIELD(gds_segment_byte_size),
+FIELD(kernarg_segment_byte_size),
+FIELD(workgroup_fbarrier_count),
+FIELD(wavefront_sgpr_count),
+FIELD(workitem_vgpr_count),
+FIELD(reserved_vgpr_first),
+FIELD(reserved_vgpr_count),
+FIELD(reserved_sgpr_first),
+FIELD(reserved_sgpr_count),
+FIELD(debug_wavefront_private_segment_offset_sgpr),
+FIELD(debug_private_segment_buffer_sgpr),
+FIELD(kernarg_segment_alignment),
+FIELD(group_segment_alignment),
+FIELD(private_segment_alignment),
+FIELD(wavefront_size),
+FIELD(call_convention),
+FIELD(runtime_loader_kernel_symbol),
+
+COMPPGM1(vgprs, VGPRS),
+COMPPGM1(sgprs, SGPRS),
+COMPPGM1(priority, PRIORITY),
+COMPPGM1(float_mode, FLOAT_MODE),
+COMPPGM1(priv, PRIV),
+COMPPGM1(dx10_clamp, DX10_CLAMP),
+COMPPGM1(debug_mode, DEBUG_MODE),
+COMPPGM1(ieee_mode, IEEE_MODE),
+COMPPGM2(scratch_en, SCRATCH_EN),
+COMPPGM2(user_sgpr, USER_SGPR),
+COMPPGM2(tgid_x_en, TGID_X_EN),
+COMPPGM2(tgid_y_en, TGID_Y_EN),
+COMPPGM2(tgid_z_en, TGID_Z_EN),
+COMPPGM2(tg_size_en, TG_SIZE_EN),
+COMPPGM2(tidig_comp_cnt, TIDIG_COMP_CNT),
+COMPPGM2(excp_en_msb, EXCP_EN_MSB),
+COMPPGM2(lds_size, LDS_SIZE),
+COMPPGM2(excp_en, EXCP_EN),
+
+CODEPROP(enable_sgpr_private_segment_buffer,
+ ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER),
+CODEPROP(enable_sgpr_dispatch_ptr,
+ ENABLE_SGPR_DISPATCH_PTR),
+CODEPROP(enable_sgpr_queue_ptr,
+ ENABLE_SGPR_QUEUE_PTR),
+CODEPROP(enable_sgpr_kernarg_segment_ptr,
+ ENABLE_SGPR_KERNARG_SEGMENT_PTR),
+CODEPROP(enable_sgpr_dispatch_id,
+ ENABLE_SGPR_DISPATCH_ID),
+CODEPROP(enable_sgpr_flat_scratch_init,
+ ENABLE_SGPR_FLAT_SCRATCH_INIT),
+CODEPROP(enable_sgpr_private_segment_size,
+ ENABLE_SGPR_PRIVATE_SEGMENT_SIZE),
+CODEPROP(enable_sgpr_grid_workgroup_count_x,
+ ENABLE_SGPR_GRID_WORKGROUP_COUNT_X),
+CODEPROP(enable_sgpr_grid_workgroup_count_y,
+ ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y),
+CODEPROP(enable_sgpr_grid_workgroup_count_z,
+ ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z),
+CODEPROP(enable_ordered_append_gds,
+ ENABLE_ORDERED_APPEND_GDS),
+CODEPROP(private_element_size,
+ PRIVATE_ELEMENT_SIZE),
+CODEPROP(is_ptr64,
+ IS_PTR64),
+CODEPROP(is_dynamic_callstack,
+ IS_DYNAMIC_CALLSTACK),
+CODEPROP(is_debug_enabled,
+ IS_DEBUG_SUPPORTED),
+CODEPROP(is_xnack_enabled,
+ IS_XNACK_SUPPORTED)
+
+// end of the table
+///////////////////////////////////////////////////////////////////////////////
+
+#undef QNAME
+#undef FLD_T
+#undef FIELD2
+#undef FIELD
+#undef PRINTCODEPROP
+#undef PARSECODEPROP
+#undef CODEPROP
+#undef PRINTCOMP
+#undef PARSECOMP
+#undef COMPPGM
+#undef COMPPGM1
+#undef COMPPGM2
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
new file mode 100644
index 000000000000..f64973afa44f
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
@@ -0,0 +1,166 @@
+//===- AMDKernelCodeTUtils.cpp - amd_kernel_code_t utilities -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Utility functions to parse and print the amd_kernel_code_t structure.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDKernelCodeTUtils.h"
+#include "SIDefines.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static ArrayRef<StringRef> get_amd_kernel_code_t_FldNames() {
+ static StringRef const Table[] = {
+ "", // not found placeholder
+#define RECORD(name, print, parse) #name
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return makeArrayRef(Table);
+}
+
+static StringMap<int> createIndexMap(ArrayRef<StringRef> Names) {
+ StringMap<int> Map;
+ for (StringRef Name : Names)
+ Map.insert(std::make_pair(Name, Map.size()));
+ return Map;
+}
+
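+// Returns the zero-based index of the named field, or -1 if it is unknown:
+// StringMap::lookup returns 0 (the "" placeholder slot) for missing keys,
+// so subtracting one maps "not found" to -1.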
+static int get_amd_kernel_code_t_FieldIndex(StringRef Name) {
+ static const auto Map = createIndexMap(get_amd_kernel_code_t_FldNames());
+ return Map.lookup(Name) - 1; // returns -1 if not found
+}
+
+static StringRef get_amd_kernel_code_t_FieldName(int Index) {
+ return get_amd_kernel_code_t_FldNames()[Index + 1];
+}
+
+// Field printing
+
+static raw_ostream &printName(raw_ostream &OS, StringRef Name) {
+ return OS << Name << " = ";
+}
+
+template <typename T, T amd_kernel_code_t::*ptr>
+static void printField(StringRef Name, const amd_kernel_code_t &C,
+ raw_ostream &OS) {
+ // Print as unsigned 64-bit so wide byte-size fields are not truncated.
+ printName(OS, Name) << (uint64_t)(C.*ptr);
+}
+
+template <typename T, T amd_kernel_code_t::*ptr, int shift, int width = 1>
+static void printBitField(StringRef Name, const amd_kernel_code_t &C,
+ raw_ostream &OS) {
+ const auto Mask = (static_cast<T>(1) << width) - 1;
+ printName(OS, Name) << (int)((C.*ptr >> shift) & Mask);
+}
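+
+// A bit-field entry extracts 'width' bits starting at 'shift' from the named
+// member; e.g. (illustrative) the compute_pgm_rsrc1 VGPRS entry uses shift 0
+// and width 6, so it prints the low six bits of
+// compute_pgm_resource_registers.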
+
+typedef void (*PrintFx)(StringRef, const amd_kernel_code_t &, raw_ostream &);
+
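+// The table may contain null entries for fields that have nothing to print;
+// printAmdKernelCodeField below skips them.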
+static ArrayRef<PrintFx> getPrinterTable() {
+ static const PrintFx Table[] = {
+#define RECORD(name, print, parse) print
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return makeArrayRef(Table);
+}
+
+void llvm::printAmdKernelCodeField(const amd_kernel_code_t &C,
+ int FldIndex,
+ raw_ostream &OS) {
+ auto Printer = getPrinterTable()[FldIndex];
+ if (Printer)
+ Printer(get_amd_kernel_code_t_FieldName(FldIndex), C, OS);
+}
+
+void llvm::dumpAmdKernelCode(const amd_kernel_code_t *C,
+ raw_ostream &OS,
+ const char *tab) {
+ const int Size = getPrinterTable().size();
+ for (int i = 0; i < Size; ++i) {
+ OS << tab;
+ printAmdKernelCodeField(*C, i, OS);
+ OS << '\n';
+ }
+}
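+
+// Example use (illustrative): dump a default-initialized header to stderr:
+//   amd_kernel_code_t Header = {};
+//   dumpAmdKernelCode(&Header, errs(), "\t");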
+
+// Field parsing
+
+static bool expectAbsExpression(MCAsmParser &MCParser, int64_t &Value,
+ raw_ostream &Err) {
+ if (MCParser.getLexer().isNot(AsmToken::Equal)) {
+ Err << "expected '='";
+ return false;
+ }
+ MCParser.getLexer().Lex();
+
+ if (MCParser.parseAbsoluteExpression(Value)) {
+ Err << "integer absolute expression expected";
+ return false;
+ }
+ return true;
+}
+
+template <typename T, T amd_kernel_code_t::*ptr>
+static bool parseField(amd_kernel_code_t &C, MCAsmParser &MCParser,
+ raw_ostream &Err) {
+ int64_t Value = 0;
+ if (!expectAbsExpression(MCParser, Value, Err))
+ return false;
+ C.*ptr = (T)Value;
+ return true;
+}
+
+template <typename T, T amd_kernel_code_t::*ptr, int shift, int width = 1>
+static bool parseBitField(amd_kernel_code_t &C, MCAsmParser &MCParser,
+ raw_ostream &Err) {
+ int64_t Value = 0;
+ if (!expectAbsExpression(MCParser, Value, Err))
+ return false;
+ const uint64_t Mask = ((UINT64_C(1) << width) - 1) << shift;
+ C.*ptr &= (T)~Mask;
+ C.*ptr |= (T)((Value << shift) & Mask);
+ return true;
+}
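+
+// parseBitField performs a read-modify-write: the mask first clears the
+// field's bits, then the shifted value is ORed in, leaving neighboring bits
+// of the same register untouched.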
+
+typedef bool (*ParseFx)(amd_kernel_code_t &, MCAsmParser &, raw_ostream &);
+
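+// As in the printer table, a null entry marks a field with no parser;
+// parseAmdKernelCodeField fails (returns false) for such fields.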
+static ArrayRef<ParseFx> getParserTable() {
+ static const ParseFx Table[] = {
+#define RECORD(name, print, parse) parse
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return makeArrayRef(Table);
+}
+
+bool llvm::parseAmdKernelCodeField(StringRef ID,
+ MCAsmParser &MCParser,
+ amd_kernel_code_t &C,
+ raw_ostream &Err) {
+ const int Idx = get_amd_kernel_code_t_FieldIndex(ID);
+ if (Idx < 0) {
+ Err << "unexpected amd_kernel_code_t field name " << ID;
+ return false;
+ }
+ auto Parser = getParserTable()[Idx];
+ return Parser ? Parser(C, MCParser, Err) : false;
+}
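+
+// Example use (illustrative sketch, assuming an assembler extension that owns
+// an MCAsmParser, an amd_kernel_code_t 'Header', and the usual TokError
+// helper):
+//   SmallString<64> ErrStr;
+//   raw_svector_ostream Err(ErrStr);
+//   if (!parseAmdKernelCodeField(ID, getParser(), Header, Err))
+//     return TokError(Err.str());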
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
new file mode 100644
index 000000000000..d9edca7a82ac
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
@@ -0,0 +1,39 @@
+//===- AMDKernelCodeTUtils.h - helpers for amd_kernel_code_t ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file Helpers to parse and print the amd_kernel_code_t structure.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
+
+#include "AMDKernelCodeT.h"
+
+namespace llvm {
+
+class MCAsmParser;
+class raw_ostream;
+class StringRef;
+
+void printAmdKernelCodeField(const amd_kernel_code_t &C,
+ int FldIndex,
+ raw_ostream &OS);
+
+void dumpAmdKernelCode(const amd_kernel_code_t *C,
+ raw_ostream &OS,
+ const char *tab);
+
+bool parseAmdKernelCodeField(StringRef ID,
+ MCAsmParser &Parser,
+ amd_kernel_code_t &C,
+ raw_ostream &Err);
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt b/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 2c07aeab7dd3..01b80ebe8d3d 100644
--- a/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -1,3 +1,5 @@
add_llvm_library(LLVMAMDGPUUtils
AMDGPUBaseInfo.cpp
+ AMDKernelCodeTUtils.cpp
+ AMDGPUAsmUtils.cpp
)
diff --git a/lib/Target/AMDGPU/Utils/Makefile b/lib/Target/AMDGPU/Utils/Makefile
deleted file mode 100644
index 1019e726d50e..000000000000
--- a/lib/Target/AMDGPU/Utils/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/AMDGPU/Utils/Makefile --------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAMDGPUUtils
-
-# Hack: we need to include 'main' AMDGPU target directory to grab private
-# headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/VIInstrFormats.td b/lib/Target/AMDGPU/VIInstrFormats.td
index d8738f992630..912ed5329bfe 100644
--- a/lib/Target/AMDGPU/VIInstrFormats.td
+++ b/lib/Target/AMDGPU/VIInstrFormats.td
@@ -91,21 +91,28 @@ class MTBUFe_vi <bits<4> op> : Enc64 {
class SMEMe_vi <bits<8> op, bit imm> : Enc64 {
bits<7> sbase;
- bits<7> sdata;
+ bits<7> sdst;
bits<1> glc;
- bits<20> offset;
let Inst{5-0} = sbase{6-1};
- let Inst{12-6} = sdata;
+ let Inst{12-6} = sdst;
let Inst{16} = glc;
let Inst{17} = imm;
let Inst{25-18} = op;
let Inst{31-26} = 0x30; //encoding
+}
+
+class SMEM_IMMe_vi <bits<8> op> : SMEMe_vi<op, 1> {
+ bits<20> offset;
let Inst{51-32} = offset;
}
-class VOP3e_vi <bits<10> op> : Enc64 {
- bits<8> vdst;
+class SMEM_SOFFe_vi <bits<8> op> : SMEMe_vi<op, 0> {
+ bits<20> soff;
+ let Inst{51-32} = soff;
+}
+
+class VOP3a_vi <bits<10> op> : Enc64 {
bits<2> src0_modifiers;
bits<9> src0;
bits<2> src1_modifiers;
@@ -115,7 +122,6 @@ class VOP3e_vi <bits<10> op> : Enc64 {
bits<1> clamp;
bits<2> omod;
- let Inst{7-0} = vdst;
let Inst{8} = src0_modifiers{1};
let Inst{9} = src1_modifiers{1};
let Inst{10} = src2_modifiers{1};
@@ -131,6 +137,20 @@ class VOP3e_vi <bits<10> op> : Enc64 {
let Inst{63} = src2_modifiers{0};
}
+class VOP3e_vi <bits<10> op> : VOP3a_vi <op> {
+ bits<8> vdst;
+
+ let Inst{7-0} = vdst;
+}
+
+// Encoding used for VOPC instructions encoded as VOP3.
+// Differs from VOP3e by the destination operand name (sdst), since VOPC has
+// no vector destination.
+class VOP3ce_vi <bits<10> op> : VOP3a_vi <op> {
+ bits<8> sdst;
+
+ let Inst{7-0} = sdst;
+}
+
class VOP3be_vi <bits<10> op> : Enc64 {
bits<8> vdst;
bits<2> src0_modifiers;
@@ -157,6 +177,117 @@ class VOP3be_vi <bits<10> op> : Enc64 {
let Inst{63} = src2_modifiers{0};
}
+class VOP_DPP <dag outs, dag ins, string asm, list<dag> pattern, bit HasMods = 0> :
+ VOPAnyCommon <outs, ins, asm, pattern> {
+ let DPP = 1;
+ let Size = 8;
+
+ let AsmMatchConverter = !if(!eq(HasMods,1), "cvtDPP", "");
+}
+
+class VOP_DPPe : Enc64 {
+ bits<2> src0_modifiers;
+ bits<8> src0;
+ bits<2> src1_modifiers;
+ bits<9> dpp_ctrl;
+ bits<1> bound_ctrl;
+ bits<4> bank_mask;
+ bits<4> row_mask;
+
+ let Inst{39-32} = src0;
+ let Inst{48-40} = dpp_ctrl;
+ let Inst{51} = bound_ctrl;
+ let Inst{52} = src0_modifiers{0}; // src0_neg
+ let Inst{53} = src0_modifiers{1}; // src0_abs
+ let Inst{54} = src1_modifiers{0}; // src1_neg
+ let Inst{55} = src1_modifiers{1}; // src1_abs
+ let Inst{59-56} = bank_mask;
+ let Inst{63-60} = row_mask;
+}
+
+class VOP1_DPPe <bits<8> op> : VOP_DPPe {
+ bits<8> vdst;
+
+ let Inst{8-0} = 0xfa; // dpp
+ let Inst{16-9} = op;
+ let Inst{24-17} = vdst;
+ let Inst{31-25} = 0x3f; //encoding
+}
+
+class VOP2_DPPe <bits<6> op> : VOP_DPPe {
+ bits<8> vdst;
+ bits<8> src1;
+
+ let Inst{8-0} = 0xfa; //dpp
+ let Inst{16-9} = src1;
+ let Inst{24-17} = vdst;
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0; //encoding
+}
+
+class VOP_SDWA <dag outs, dag ins, string asm, list<dag> pattern, bit HasMods = 0> :
+ VOPAnyCommon <outs, ins, asm, pattern> {
+ let SDWA = 1;
+ let Size = 8;
+}
+
+class VOP_SDWAe : Enc64 {
+ bits<8> src0;
+ bits<3> src0_sel;
+ bits<2> src0_fmodifiers; // {abs,neg}
+ bits<1> src0_imodifiers; // sext
+ bits<3> src1_sel;
+ bits<2> src1_fmodifiers;
+ bits<1> src1_imodifiers;
+ bits<3> dst_sel;
+ bits<2> dst_unused;
+ bits<1> clamp;
+
+ let Inst{39-32} = src0;
+ let Inst{42-40} = dst_sel;
+ let Inst{44-43} = dst_unused;
+ let Inst{45} = clamp;
+ let Inst{50-48} = src0_sel;
+ let Inst{53-52} = src0_fmodifiers;
+ let Inst{51} = src0_imodifiers;
+ let Inst{58-56} = src1_sel;
+ let Inst{61-60} = src1_fmodifiers;
+ let Inst{59} = src1_imodifiers;
+}
+
+class VOP1_SDWAe <bits<8> op> : VOP_SDWAe {
+ bits<8> vdst;
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = op;
+ let Inst{24-17} = vdst;
+ let Inst{31-25} = 0x3f; // encoding
+}
+
+class VOP2_SDWAe <bits<6> op> : VOP_SDWAe {
+ bits<8> vdst;
+ bits<8> src1;
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = src1;
+ let Inst{24-17} = vdst;
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0; // encoding
+}
+
+class VOPC_SDWAe <bits<8> op> : VOP_SDWAe {
+ bits<8> src1;
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = src1;
+ let Inst{24-17} = op;
+ let Inst{31-25} = 0x3e; // encoding
+
+ // VOPC disallows dst_sel and dst_unused as they have no effect on the
+ // destination; they are hard-wired to DWORD (0x6) and UNUSED_PRESERVE (0x2).
+ let Inst{42-40} = 0x6;
+ let Inst{44-43} = 0x2;
+}
+
class EXPe_vi : EXPe {
let Inst{31-26} = 0x31; //encoding
}
diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td
index 1a7801c92bd7..5c490ab900f2 100644
--- a/lib/Target/AMDGPU/VIInstructions.td
+++ b/lib/Target/AMDGPU/VIInstructions.td
@@ -11,6 +11,8 @@
let SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI in {
+let DisableSIDecoder = 1 in {
+
//===----------------------------------------------------------------------===//
// VOP1 Instructions
//===----------------------------------------------------------------------===//
@@ -52,9 +54,9 @@ defm V_SUBREV_F16 : VOP2Inst <vop2<0, 0x21>, "v_subrev_f16", VOP_F16_F16_F16,
defm V_MUL_F16 : VOP2Inst <vop2<0, 0x22>, "v_mul_f16", VOP_F16_F16_F16>;
defm V_MAC_F16 : VOP2Inst <vop2<0, 0x23>, "v_mac_f16", VOP_F16_F16_F16>;
} // End isCommutable = 1
-defm V_MADMK_F16 : VOP2MADK <vop2<0,0x24>, "v_madmk_f16">;
+defm V_MADMK_F16 : VOP2MADK <vop2<0,0x24>, "v_madmk_f16", VOP_MADMK>;
let isCommutable = 1 in {
-defm V_MADAK_F16 : VOP2MADK <vop2<0,0x25>, "v_madak_f16">;
+defm V_MADAK_F16 : VOP2MADK <vop2<0,0x25>, "v_madak_f16", VOP_MADAK>;
defm V_ADD_U16 : VOP2Inst <vop2<0,0x26>, "v_add_u16", VOP_I16_I16_I16>;
defm V_SUB_U16 : VOP2Inst <vop2<0,0x27>, "v_sub_u16" , VOP_I16_I16_I16>;
defm V_SUBREV_U16 : VOP2Inst <vop2<0,0x28>, "v_subrev_u16", VOP_I16_I16_I16>;
@@ -73,6 +75,16 @@ defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>;
} // End isCommutable = 1
defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>;
+//===----------------------------------------------------------------------===//
+// VOP3 Instructions
+//===----------------------------------------------------------------------===//
+let isCommutable = 1 in {
+ defm V_MAD_F16 : VOP3Inst <vop3<0, 0x1ea>, "v_mad_f16", VOP_F16_F16_F16_F16>;
+ defm V_MAD_U16 : VOP3Inst <vop3<0, 0x1eb>, "v_mad_u16", VOP_I16_I16_I16_I16>;
+ defm V_MAD_I16 : VOP3Inst <vop3<0, 0x1ec>, "v_mad_i16", VOP_I16_I16_I16_I16>;
+}
+} // let DisableSIDecoder = 1
+
// Aliases to simplify matching of floating-point instructions that
// are VOP2 on SI and VOP3 on VI.
@@ -99,6 +111,9 @@ def S_DCACHE_WB : SMEM_Inval <0x21,
def S_DCACHE_WB_VOL : SMEM_Inval <0x23,
"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
+def S_MEMREALTIME : SMEM_Ret<0x25,
+ "s_memrealtime", int_amdgcn_s_memrealtime>;
+
} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
let Predicates = [isVI] in {
@@ -109,4 +124,35 @@ def : Pat <
(S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
>;
+//===----------------------------------------------------------------------===//
+// DPP Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+ (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
+ imm:$bound_ctrl),
+ (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
+ (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
+>;
+
+//===----------------------------------------------------------------------===//
+// Misc Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+ (i64 (readcyclecounter)),
+ (S_MEMREALTIME)
+>;
+
+//===----------------------------------------------------------------------===//
+// DS_PERMUTE/DS_BPERMUTE Instructions.
+//===----------------------------------------------------------------------===//
+
+let Uses = [EXEC] in {
+defm DS_PERMUTE_B32 : DS_1A1D_PERMUTE <0x3e, "ds_permute_b32", VGPR_32,
+ int_amdgcn_ds_permute>;
+defm DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <0x3f, "ds_bpermute_b32", VGPR_32,
+ int_amdgcn_ds_bpermute>;
+}
+
} // End Predicates = [isVI]