-rw-r--r--  docs/ReleaseNotes.rst  307
-rw-r--r--  include/llvm/IR/LegacyPassManagers.h  9
-rw-r--r--  include/llvm/MC/MCAsmBackend.h  6
-rw-r--r--  include/llvm/MC/MCAssembler.h  6
-rw-r--r--  include/llvm/MC/MCMachObjectWriter.h  28
-rw-r--r--  include/llvm/MC/MCObjectWriter.h  6
-rw-r--r--  include/llvm/Transforms/Utils/SimplifyLibCalls.h  7
-rw-r--r--  lib/Analysis/ScalarEvolution.cpp  15
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfUnit.cpp  7
-rw-r--r--  lib/CodeGen/MachineFunctionPass.cpp  28
-rw-r--r--  lib/IR/Core.cpp  22
-rw-r--r--  lib/IR/LegacyPassManager.cpp  36
-rw-r--r--  lib/MC/ELFObjectWriter.cpp  10
-rw-r--r--  lib/MC/MCAsmInfoDarwin.cpp  17
-rw-r--r--  lib/MC/MCAssembler.cpp  16
-rw-r--r--  lib/MC/MachObjectWriter.cpp  41
-rw-r--r--  lib/MC/WinCOFFObjectWriter.cpp  12
-rw-r--r--  lib/Support/Triple.cpp  26
-rw-r--r--  lib/Target/AArch64/AArch64CallingConvention.td  39
-rw-r--r--  lib/Target/AArch64/AArch64FastISel.cpp  2
-rw-r--r--  lib/Target/AArch64/AArch64FrameLowering.cpp  10
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.cpp  2
-rw-r--r--  lib/Target/AArch64/AArch64RegisterInfo.cpp  20
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp  36
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp  78
-rw-r--r--  lib/Target/ARM/ARMLoadStoreOptimizer.cpp  17
-rw-r--r--  lib/Target/ARM/AsmParser/ARMAsmParser.cpp  58
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp  37
-rw-r--r--  lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp  3
-rw-r--r--  lib/Target/Mips/Mips16ISelLowering.cpp  4
-rw-r--r--  lib/Target/Mips/Mips32r6InstrInfo.td  4
-rw-r--r--  lib/Target/Mips/MipsISelLowering.cpp  38
-rw-r--r--  lib/Target/Mips/MipsISelLowering.h  32
-rw-r--r--  lib/Target/NVPTX/NVPTXAllocaHoisting.h  3
-rw-r--r--  lib/Target/NVPTX/NVPTXLowerAggrCopies.h  3
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp  16
-rw-r--r--  lib/Target/R600/AMDGPU.h  6
-rw-r--r--  lib/Target/R600/AMDGPU.td  5
-rw-r--r--  lib/Target/R600/AMDGPUAsmPrinter.cpp  19
-rw-r--r--  lib/Target/R600/AMDGPUISelDAGToDAG.cpp  59
-rw-r--r--  lib/Target/R600/AMDGPUISelLowering.cpp  3
-rw-r--r--  lib/Target/R600/AMDGPUInstrInfo.cpp  33
-rw-r--r--  lib/Target/R600/AMDGPUInstrInfo.h  5
-rw-r--r--  lib/Target/R600/AMDGPUMCInstLower.cpp  35
-rw-r--r--  lib/Target/R600/AMDGPUMCInstLower.h  14
-rw-r--r--  lib/Target/R600/AMDGPUSubtarget.cpp  26
-rw-r--r--  lib/Target/R600/AMDGPUSubtarget.h  16
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp  2
-rw-r--r--  lib/Target/R600/SIDefines.h  4
-rw-r--r--  lib/Target/R600/SIISelLowering.cpp  6
-rw-r--r--  lib/Target/R600/SIInstrFormats.td  36
-rw-r--r--  lib/Target/R600/SIInstrInfo.cpp  48
-rw-r--r--  lib/Target/R600/SIInstrInfo.h  1
-rw-r--r--  lib/Target/R600/SIInstrInfo.td  102
-rw-r--r--  lib/Target/R600/SIInstructions.td  92
-rw-r--r--  lib/Target/R600/SIMachineFunctionInfo.h  1
-rw-r--r--  lib/Target/R600/SIPrepareScratchRegs.cpp  100
-rw-r--r--  lib/Target/R600/SIRegisterInfo.cpp  97
-rw-r--r--  lib/Target/R600/SIRegisterInfo.h  14
-rw-r--r--  lib/Target/R600/SIShrinkInstructions.cpp  9
-rw-r--r--  lib/Target/R600/VIInstructions.td  29
-rw-r--r--  lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp  3
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp  13
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp  3
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp  116
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCalls.cpp  4
-rw-r--r--  lib/Transforms/Scalar/GVN.cpp  13
-rw-r--r--  lib/Transforms/Scalar/LoopInstSimplify.cpp  3
-rw-r--r--  lib/Transforms/Utils/LowerSwitch.cpp  1
-rw-r--r--  lib/Transforms/Utils/SimplifyIndVar.cpp  12
-rw-r--r--  lib/Transforms/Utils/SimplifyLibCalls.cpp  24
-rw-r--r--  lib/Transforms/Utils/SymbolRewriter.cpp  23
-rw-r--r--  lib/Transforms/Utils/UnifyFunctionExitNodes.cpp  1
-rw-r--r--  test/Analysis/ScalarEvolution/nw-sub-is-not-nw-add.ll  41
-rw-r--r--  test/Bindings/llvm-c/add_named_metadata_operand.ll  2
-rw-r--r--  test/Bindings/llvm-c/set_metadata.ll  2
-rw-r--r--  test/CodeGen/AArch64/arm64-platform-reg.ll  15
-rw-r--r--  test/CodeGen/AArch64/ghc-cc.ll  89
-rw-r--r--  test/CodeGen/ARM/2015-01-21-thumbv4t-ldstr-opt.ll  55
-rw-r--r--  test/CodeGen/Mips/fcmp.ll  90
-rw-r--r--  test/CodeGen/R600/basic-loop.ll  1
-rw-r--r--  test/CodeGen/R600/ctpop.ll  66
-rw-r--r--  test/CodeGen/R600/ctpop64.ll  3
-rw-r--r--  test/CodeGen/R600/ds_read2st64.ll  4
-rw-r--r--  test/CodeGen/R600/fp_to_sint.ll  15
-rw-r--r--  test/CodeGen/R600/hsa.ll  2
-rw-r--r--  test/CodeGen/R600/misaligned-load.ll  18
-rw-r--r--  test/CodeGen/R600/scratch-buffer.ll  86
-rw-r--r--  test/CodeGen/R600/si-triv-disjoint-mem-access.ll  2
-rw-r--r--  test/CodeGen/SPARC/2008-10-10-InlineAsmMemoryOperand.ll  2
-rw-r--r--  test/CodeGen/SPARC/inlineasm.ll  2
-rw-r--r--  test/CodeGen/SPARC/mult-alt-generic-sparc.ll  2
-rw-r--r--  test/DebugInfo/Mips/fn-call-line.ll  84
-rw-r--r--  test/DebugInfo/X86/decl-derived-member.ll  155
-rw-r--r--  test/MC/ARM/pr22395.s  63
-rw-r--r--  test/MC/ELF/relocation-386.s  3
-rw-r--r--  test/MC/MachO/AArch64/mergeable.s  59
-rw-r--r--  test/MC/MachO/x86_64-mergeable.s  59
-rw-r--r--  test/MC/MachO/x86_64-symbols.s  6
-rw-r--r--  test/SymbolRewriter/rewrite.ll  33
-rw-r--r--  test/SymbolRewriter/rewrite.map  20
-rw-r--r--  test/Transforms/GVN/edge.ll  48
-rw-r--r--  test/Transforms/IndVarSimplify/pr22222.ll  46
-rw-r--r--  test/Transforms/IndVarSimplify/strengthen-overflow.ll  106
-rw-r--r--  test/Transforms/InstCombine/call-cast-target.ll  11
-rw-r--r--  test/Transforms/InstCombine/memcpy_chk-1.ll  36
-rw-r--r--  test/Transforms/InstCombine/memmove_chk-1.ll  36
-rw-r--r--  test/Transforms/InstCombine/memset_chk-1.ll  45
-rw-r--r--  test/Transforms/InstCombine/stpcpy_chk-1.ll  55
-rw-r--r--  test/Transforms/InstCombine/strcpy_chk-1.ll  59
-rw-r--r--  test/Transforms/InstCombine/strncpy_chk-1.ll  45
-rw-r--r--  test/tools/gold/emit-llvm.ll  16
-rw-r--r--  tools/gold/gold-plugin.cpp  6
-rw-r--r--  tools/llvm-c-test/CMakeLists.txt  1
-rw-r--r--  tools/llvm-c-test/llvm-c-test.h  4
-rw-r--r--  tools/llvm-c-test/main.c  4
-rw-r--r--  tools/llvm-c-test/metadata.c  43
-rw-r--r--  unittests/ADT/TripleTest.cpp  17
-rw-r--r--  unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp  18
119 files changed, 2405 insertions(+), 1149 deletions(-)
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index dd54d45c99c9..f27fb0277085 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -102,6 +102,282 @@ enable handling of case (3).
.. _discussions: http://lists.cs.uiuc.edu/pipermail/llvmdev/2014-May/073235.html
+Metadata is not a Value
+-----------------------
+
+Metadata nodes (``!{...}``) and strings (``!"..."``) are no longer values.
+They have no use-lists, no type, cannot RAUW, and cannot be function-local.
+
+Bridges between Value and Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+LLVM intrinsics can reference metadata using the ``metadata`` type, and
+metadata nodes can reference constant values.
+
+Function-local metadata is limited to direct arguments to LLVM intrinsics.
+
+Metadata is typeless
+^^^^^^^^^^^^^^^^^^^^
+
+The following old IR:
+
+.. code-block:: llvm
+
+ @global = global i32 0
+
+ define void @foo(i32 %v) {
+ entry:
+ call void @llvm.md(metadata !{i32 %v})
+ call void @llvm.md(metadata !{i32* @global})
+ call void @llvm.md(metadata !0)
+ call void @llvm.md(metadata !{metadata !"string"})
+ call void @llvm.md(metadata !{metadata !{metadata !1, metadata !"string"}})
+ ret void, !bar !1, !baz !2
+ }
+
+ declare void @llvm.md(metadata)
+
+ !0 = metadata !{metadata !1, metadata !2, metadata !3, metadata !"some string"}
+ !1 = metadata !{metadata !2, null, metadata !"other", i32* @global, i32 7}
+ !2 = metadata !{}
+
+should now be written as:
+
+.. code-block:: llvm
+
+ @global = global i32 0
+
+ define void @foo(i32 %v) {
+ entry:
+ call void @llvm.md(metadata i32 %v) ; The only legal place for function-local
+ ; metadata.
+ call void @llvm.md(metadata i32* @global)
+ call void @llvm.md(metadata !0)
+ call void @llvm.md(metadata !{!"string"})
+ call void @llvm.md(metadata !{!{!1, !"string"}})
+ ret void, !bar !1, !baz !2
+ }
+
+ declare void @llvm.md(metadata)
+
+ !0 = !{!1, !2, !3, !"some string"}
+ !1 = !{!2, null, !"other", i32* @global, i32 7}
+ !2 = !{}
+
+Distinct metadata nodes
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Metadata nodes can opt-out of uniquing, using the keyword ``distinct``.
+Distinct nodes are still owned by the context, but are stored in a side table,
+and not uniqued.
+
+In LLVM 3.5, metadata nodes would drop uniquing if an operand changed to
+``null`` during optimizations. This is no longer true. However, if an operand
+change causes a uniquing collision, the node becomes ``distinct``. And unlike
+LLVM 3.5, where serializing to assembly or bitcode would re-unique the nodes,
+they now remain ``distinct``.
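+
+For example (a hypothetical illustration of the collision behaviour just
+described; the node contents are made up):
+
+.. code-block:: llvm
+
+ ; Before an optimization runs, the two nodes are uniqued separately:
+ !0 = !{!"abc"}
+ !1 = !{!"def"}
+
+ ; If the optimization rewrites !1's operand to !"abc", the collision with
+ ; !0 leaves !1 distinct instead of re-uniquing the two nodes:
+ !0 = !{!"abc"}
+ !1 = distinct !{!"abc"}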
+
+The following IR:
+
+.. code-block:: llvm
+
+ !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
+
+ !0 = !{}
+ !1 = !{}
+ !2 = distinct !{}
+ !3 = distinct !{}
+ !4 = !{!0}
+ !5 = distinct !{!0}
+ !6 = !{!4, !{}, !5}
+ !7 = !{!{!0}, !0, !5}
+ !8 = distinct !{!{!0}, !0, !5}
+
+is equivalent to the following:
+
+.. code-block:: llvm
+
+ !named = !{!0, !0, !1, !2, !3, !4, !5, !5, !6}
+
+ !0 = !{}
+ !1 = distinct !{}
+ !2 = distinct !{}
+ !3 = !{!0}
+ !4 = distinct !{!0}
+ !5 = !{!3, !0, !4}
+ !6 = distinct !{!3, !0, !4}
+
+Constructing cyclic graphs
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+During graph construction, if a metadata node transitively references a forward
+declaration, the node itself is considered "unresolved" until the forward
+declaration resolves. An unresolved node can RAUW itself to support uniquing.
+Nodes automatically resolve once all their operands have resolved.
+
+However, cyclic graphs prevent the nodes from resolving. An API client that
+constructs a cyclic graph must call ``resolveCycles()`` to resolve nodes in the
+cycle.
+
+To spare self-referential nodes that burden, they are implicitly
+``distinct``. So the following IR:
+
+.. code-block:: llvm
+
+ !named = !{!0, !1, !2, !3, !4}
+
+ !0 = !{!0}
+ !1 = !{!1}
+ !2 = !{!2, !1}
+ !3 = !{!2, !1}
+ !4 = !{!2, !1}
+
+is equivalent to:
+
+.. code-block:: llvm
+
+ !named = !{!0, !1, !2, !3, !3}
+
+ !0 = distinct !{!0}
+ !1 = distinct !{!1}
+ !2 = distinct !{!2, !1}
+ !3 = !{!2, !1}
+
+MDLocation (aka DebugLoc aka DILocation)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There's a new first-class metadata construct called ``MDLocation`` (to be
+followed in subsequent releases by others). It's used for the locations
+referenced by ``!dbg`` metadata attachments.
+
+For example, if an old ``!dbg`` attachment looked like this:
+
+.. code-block:: llvm
+
+ define i32 @foo(i32 %a, i32 %b) {
+ entry:
+ %add = add i32 %a, %b, !dbg !0
+ ret i32 %add, !dbg !1
+ }
+
+ !0 = metadata !{i32 10, i32 3, metadata !2, metadata !1}
+ !1 = metadata !{i32 20, i32 7, metadata !3}
+ !2 = metadata !{...}
+ !3 = metadata !{...}
+
+the new attachment looks like this:
+
+.. code-block:: llvm
+
+ define i32 @foo(i32 %a, i32 %b) {
+ entry:
+ %add = add i32 %a, %b, !dbg !0
+ ret i32 %add, !dbg !1
+ }
+
+ !0 = !MDLocation(line: 10, column: 3, scope: !2, inlinedAt: !1)
+ !1 = !MDLocation(line: 20, column: 7, scope: !3)
+ !2 = !{...}
+ !3 = !{...}
+
+The fields are named, can be reordered, and have sane defaults if left out
+(although ``scope:`` is required).
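+
+For instance, since omitted fields take their defaults (zero for ``line:``
+and ``column:``, nothing for ``inlinedAt:``), the two locations below should
+be equivalent (hypothetical nodes, for illustration only):
+
+.. code-block:: llvm
+
+ !4 = !MDLocation(line: 0, column: 0, scope: !2)
+ !5 = !MDLocation(scope: !2)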
+
+
+Alias syntax change
+-------------------
+
+The syntax for aliases is now closer to what is used for global variables:
+
+.. code-block:: llvm
+
+ @a = weak global ...
+ @b = weak alias ...
+
+The order of the ``alias`` keyword and the linkage was swapped before.
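+
+As a concrete sketch (hypothetical names, assuming the LLVM 3.5 form of the
+alias syntax), what used to be written:
+
+.. code-block:: llvm
+
+ @a = global i32 0
+ @b = alias weak i32* @a
+
+is now written:
+
+.. code-block:: llvm
+
+ @a = global i32 0
+ @b = weak alias i32* @a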
+
+The old JIT has been removed
+----------------------------
+
+All users should transition to MCJIT.
+
+
+object::Binary doesn't own the file buffer
+-------------------------------------------
+
+It is now just a wrapper, which simplifies using object::Binary with other
+users of the underlying file.
+
+IR in object files is now supported
+-----------------------------------
+
+Regular object files can contain IR in a section named ``.llvmbc``.
+
+
+The gold plugin has been rewritten
+----------------------------------
+
+It is now implemented directly on top of ``lib/Linker`` instead of ``lib/LTO``.
+The API of ``lib/LTO`` is sufficiently different from gold's view of the
+linking process that some cases could not be conveniently implemented.
+
+The new implementation is also lazier and has a ``save-temps`` option.
+
+
+Change in the representation of lazy loaded functions
+------------------------------------------------------
+
+Lazy loaded functions are now represented in a way that ``isDeclaration``
+returns the correct answer even before reading the body.
+
+
+The opt option -std-compile-opts was removed
+--------------------------------------------
+
+It was effectively an alias of ``-O3``.
+
+
+Python 2.7 is now required
+--------------------------
+
+This was done to simplify compatibility with Python 3.
+
+The leak detector has been removed
+----------------------------------
+
+In practice, tools like ASan and Valgrind were finding far more bugs than
+the old leak detector, so it was removed.
+
+
+New comdat syntax
+-----------------
+
+The syntax of comdats was changed to:
+
+.. code-block:: llvm
+
+ $c = comdat any
+ @g = global i32 0, comdat($c)
+ @c = global i32 0, comdat
+
+The version without the parentheses is syntactic sugar for a comdat with
+the same name as the global.
+
+
+Diagnostic infrastructure used by lib/Linker and lib/Bitcode
+------------------------------------------------------------
+
+These libraries now use the diagnostic handler to print errors and warnings.
+This provides better error messages and simpler error handling.
+
+
+The PreserveSource linker mode was removed
+------------------------------------------
+
+It was fairly broken, so it was removed.
+
+
+
Changes to the ARM Backend
--------------------------
@@ -237,8 +513,35 @@ An exciting aspect of LLVM is that it is used as an enabling technology for
a lot of other language and tools projects. This section lists some of the
projects that have already been updated to work with LLVM 3.6.
-* A project
-
+Portable Computing Language (pocl)
+----------------------------------
+
+In addition to producing an easily portable open source OpenCL
+implementation, another major goal of `pocl <http://portablecl.org/>`_
+is improving performance portability of OpenCL programs with
+compiler optimizations, reducing the need for target-dependent manual
+optimizations. An important part of pocl is a set of LLVM passes used to
+statically parallelize multiple work-items with the kernel compiler, even in
+the presence of work-group barriers. This enables static parallelization of
+the fine-grained concurrency within work-groups in multiple ways.
+
+TTA-based Co-design Environment (TCE)
+-------------------------------------
+
+`TCE <http://tce.cs.tut.fi/>`_ is a toolset for designing customized
+exposed datapath processors based on the Transport triggered
+architecture (TTA).
+
+The toolset provides a complete co-design flow from C/C++
+programs down to synthesizable VHDL/Verilog and parallel program binaries.
+Processor customization points include the register files, function units,
+supported operations, and the interconnection network.
+
+TCE uses Clang and LLVM for C/C++/OpenCL C language support, target-independent
+optimizations, and parts of code generation. It generates new LLVM-based code
+generators "on the fly" for the designed processors and loads them into the
+compiler backend as runtime libraries to avoid per-target recompilation of
+large parts of the compiler chain.
Additional Information
======================
diff --git a/include/llvm/IR/LegacyPassManagers.h b/include/llvm/IR/LegacyPassManagers.h
index ab500a1dd2a3..7f7889ad5fb3 100644
--- a/include/llvm/IR/LegacyPassManagers.h
+++ b/include/llvm/IR/LegacyPassManagers.h
@@ -195,6 +195,9 @@ public:
/// then return NULL.
Pass *findAnalysisPass(AnalysisID AID);
+ /// Retrieve the PassInfo for an analysis.
+ const PassInfo *findAnalysisPassInfo(AnalysisID AID) const;
+
/// Find analysis usage information for the pass P.
AnalysisUsage *findAnalysisUsage(Pass *P);
@@ -251,6 +254,12 @@ private:
SmallVector<ImmutablePass *, 16> ImmutablePasses;
DenseMap<Pass *, AnalysisUsage *> AnUsageMap;
+
+ /// Collection of PassInfo objects found via analysis IDs and in this top
+ /// level manager. This is used to memoize queries to the pass registry.
+ /// FIXME: This is an egregious hack because querying the pass registry is
+ /// either slow or racy.
+ mutable DenseMap<AnalysisID, const PassInfo *> AnalysisPassInfos;
};
diff --git a/include/llvm/MC/MCAsmBackend.h b/include/llvm/MC/MCAsmBackend.h
index 216271086a40..a6d41392724e 100644
--- a/include/llvm/MC/MCAsmBackend.h
+++ b/include/llvm/MC/MCAsmBackend.h
@@ -61,6 +61,12 @@ public:
/// markers. If not, data region directives will be ignored.
bool hasDataInCodeSupport() const { return HasDataInCodeSupport; }
+ /// doesSectionRequireSymbols - Check whether the given section requires that
+ /// all symbols (even temporaries) have symbol table entries.
+ virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+ return false;
+ }
+
/// @name Target Fixup Interfaces
/// @{
diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h
index 31f29eb1f3ff..681a31728799 100644
--- a/include/llvm/MC/MCAssembler.h
+++ b/include/llvm/MC/MCAssembler.h
@@ -11,7 +11,6 @@
#define LLVM_MC_MCASSEMBLER_H
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
@@ -882,8 +881,6 @@ private:
iplist<MCSymbolData> Symbols;
- DenseSet<const MCSymbol *> LocalsUsedInReloc;
-
/// The map of sections to their associated assembler backend data.
//
// FIXME: Avoid this indirection?
@@ -983,9 +980,6 @@ private:
MCFragment &F, const MCFixup &Fixup);
public:
- void addLocalUsedInReloc(const MCSymbol &Sym);
- bool isLocalUsedInReloc(const MCSymbol &Sym) const;
-
/// Compute the effective fragment size assuming it is laid out at the given
/// \p SectionAddress and \p FragmentOffset.
uint64_t computeFragmentSize(const MCAsmLayout &Layout,
diff --git a/include/llvm/MC/MCMachObjectWriter.h b/include/llvm/MC/MCMachObjectWriter.h
index e4681c0a3dc3..0c5aa8a18063 100644
--- a/include/llvm/MC/MCMachObjectWriter.h
+++ b/include/llvm/MC/MCMachObjectWriter.h
@@ -68,10 +68,12 @@ public:
/// @name API
/// @{
- virtual void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+ virtual void RecordRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCFragment *Fragment,
- const MCFixup &Fixup, MCValue Target,
+ const MCFixup &Fixup,
+ MCValue Target,
uint64_t &FixedValue) = 0;
/// @}
@@ -95,14 +97,8 @@ class MachObjectWriter : public MCObjectWriter {
/// @name Relocation Data
/// @{
- struct RelAndSymbol {
- const MCSymbolData *Sym;
- MachO::any_relocation_info MRE;
- RelAndSymbol(const MCSymbolData *Sym, const MachO::any_relocation_info &MRE)
- : Sym(Sym), MRE(MRE) {}
- };
-
- llvm::DenseMap<const MCSectionData *, std::vector<RelAndSymbol>> Relocations;
+ llvm::DenseMap<const MCSectionData*,
+ std::vector<MachO::any_relocation_info> > Relocations;
llvm::DenseMap<const MCSectionData*, unsigned> IndirectSymBase;
/// @}
@@ -217,15 +213,9 @@ public:
// - Input errors, where something cannot be correctly encoded. 'as' allows
// these through in many cases.
- // Add a relocation to be output in the object file. At the time this is
- // called, the symbol indexes are not know, so if the relocation refers
- // to a symbol it should be passed as \p RelSymbol so that it can be updated
- // afterwards. If the relocation doesn't refer to a symbol, nullptr should be
- // used.
- void addRelocation(const MCSymbolData *RelSymbol, const MCSectionData *SD,
+ void addRelocation(const MCSectionData *SD,
MachO::any_relocation_info &MRE) {
- RelAndSymbol P(RelSymbol, MRE);
- Relocations[SD].push_back(P);
+ Relocations[SD].push_back(MRE);
}
void RecordScatteredRelocation(const MCAssembler &Asm,
@@ -241,7 +231,7 @@ public:
const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue);
- void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
+ void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup,
MCValue Target, bool &IsPCRel,
uint64_t &FixedValue) override;
diff --git a/include/llvm/MC/MCObjectWriter.h b/include/llvm/MC/MCObjectWriter.h
index 173ef416f2cc..55c828c6c179 100644
--- a/include/llvm/MC/MCObjectWriter.h
+++ b/include/llvm/MC/MCObjectWriter.h
@@ -76,10 +76,12 @@ public:
/// post layout binding. The implementation is responsible for storing
/// information about the relocation so that it can be emitted during
/// WriteObject().
- virtual void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
+ virtual void RecordRelocation(const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
const MCFragment *Fragment,
const MCFixup &Fixup, MCValue Target,
- bool &IsPCRel, uint64_t &FixedValue) = 0;
+ bool &IsPCRel,
+ uint64_t &FixedValue) = 0;
/// \brief Check whether the difference (A - B) between two symbol
/// references is fully resolved.
diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index d2f096fd1efd..70ef0eb765dd 100644
--- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -17,6 +17,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetLibraryInfo.h"
namespace llvm {
class Value;
@@ -53,8 +54,10 @@ private:
Value *optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B);
Value *optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B);
Value *optimizeMemSetChk(CallInst *CI, IRBuilder<> &B);
- Value *optimizeStrCpyChk(CallInst *CI, IRBuilder<> &B);
- Value *optimizeStrNCpyChk(CallInst *CI, IRBuilder<> &B);
+
+ // Str/Stp cpy are similar enough to be handled in the same functions.
+ Value *optimizeStrpCpyChk(CallInst *CI, IRBuilder<> &B, LibFunc::Func Func);
+ Value *optimizeStrpNCpyChk(CallInst *CI, IRBuilder<> &B, LibFunc::Func Func);
/// \brief Checks whether the call \p CI to a fortified libcall is foldable
/// to the non-fortified version.
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 86852d634ff2..ee4273770555 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -3154,8 +3154,9 @@ const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
if (LHS == RHS)
return getConstant(LHS->getType(), 0);
- // X - Y --> X + -Y
- return getAddExpr(LHS, getNegativeSCEV(RHS), Flags);
+ // X - Y --> X + -Y.
+ // X -(nsw || nuw) Y --> X + -Y.
+ return getAddExpr(LHS, getNegativeSCEV(RHS));
}
/// getTruncateOrZeroExtend - Return a SCEV corresponding to a conversion of the
@@ -3461,12 +3462,10 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
if (isKnownPositive(getMinusSCEV(getSCEV(GEP), Ptr)))
Flags = setFlags(Flags, SCEV::FlagNUW);
}
- } else if (const SubOperator *OBO =
- dyn_cast<SubOperator>(BEValueV)) {
- if (OBO->hasNoUnsignedWrap())
- Flags = setFlags(Flags, SCEV::FlagNUW);
- if (OBO->hasNoSignedWrap())
- Flags = setFlags(Flags, SCEV::FlagNSW);
+
+ // We cannot transfer nuw and nsw flags from subtraction
+ // operations -- sub nuw X, Y is not the same as add nuw X, -Y
+ // for instance.
}
const SCEV *StartVal = getSCEV(StartValueV);
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 3d3da2ac1d31..455258e81e6d 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -626,10 +626,7 @@ static uint64_t getBaseTypeSize(DwarfDebug *DD, DIDerivedType Ty) {
DIType BaseType = DD->resolve(Ty.getTypeDerivedFrom());
- // If this type is not derived from any type or the type is a declaration then
- // take conservative approach.
- if (!BaseType.isValid() || BaseType.isForwardDecl())
- return Ty.getSizeInBits();
+ assert(BaseType.isValid());
// If this is a derived type, go ahead and get the base type, unless it's a
// reference then it's just the size of the field. Pointer types have no need
@@ -1473,7 +1470,7 @@ void DwarfUnit::constructMemberDIE(DIE &Buffer, DIDerivedType DT) {
uint64_t FieldSize = getBaseTypeSize(DD, DT);
uint64_t OffsetInBytes;
- if (Size != FieldSize) {
+ if (FieldSize && Size != FieldSize) {
// Handle bitfield, assume bytes are 8 bits.
addUInt(MemberDie, dwarf::DW_AT_byte_size, None, FieldSize/8);
addUInt(MemberDie, dwarf::DW_AT_bit_size, None, Size);
diff --git a/lib/CodeGen/MachineFunctionPass.cpp b/lib/CodeGen/MachineFunctionPass.cpp
index 789f2042a073..2f076b6734d9 100644
--- a/lib/CodeGen/MachineFunctionPass.cpp
+++ b/lib/CodeGen/MachineFunctionPass.cpp
@@ -11,11 +11,19 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DominanceFrontier.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
using namespace llvm;
Pass *MachineFunctionPass::createPrinterPass(raw_ostream &O,
@@ -43,15 +51,13 @@ void MachineFunctionPass::getAnalysisUsage(AnalysisUsage &AU) const {
// because CodeGen overloads that to mean preserving the MachineBasicBlock
// CFG in addition to the LLVM IR CFG.
AU.addPreserved<AliasAnalysis>();
- AU.addPreserved("scalar-evolution");
- AU.addPreserved("iv-users");
- AU.addPreserved("memdep");
- AU.addPreserved("live-values");
- AU.addPreserved("domtree");
- AU.addPreserved("domfrontier");
- AU.addPreserved("loops");
- AU.addPreserved("lda");
- AU.addPreserved("stack-protector");
+ AU.addPreserved<DominanceFrontier>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<IVUsers>();
+ AU.addPreserved<LoopInfo>();
+ AU.addPreserved<MemoryDependenceAnalysis>();
+ AU.addPreserved<ScalarEvolution>();
+ AU.addPreserved<StackProtector>();
FunctionPass::getAnalysisUsage(AU);
}
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index a25c4d66d3bb..753d9c229eca 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -563,9 +563,23 @@ LLVMValueRef LLVMGetMetadata(LLVMValueRef Inst, unsigned KindID) {
return nullptr;
}
-void LLVMSetMetadata(LLVMValueRef Inst, unsigned KindID, LLVMValueRef MD) {
- MDNode *N =
- MD ? cast<MDNode>(unwrap<MetadataAsValue>(MD)->getMetadata()) : nullptr;
+// MetadataAsValue uses a canonical format which strips the actual MDNode for
+// an MDNode with just a single constant value, storing just a
+// ConstantAsMetadata. This undoes that canonicalization, reconstructing the
+// MDNode.
+static MDNode *extractMDNode(MetadataAsValue *MAV) {
+ Metadata *MD = MAV->getMetadata();
+ assert((isa<MDNode>(MD) || isa<ConstantAsMetadata>(MD)) &&
+ "Expected a metadata node or a canonicalized constant");
+
+ if (MDNode *N = dyn_cast<MDNode>(MD))
+ return N;
+
+ return MDNode::get(MAV->getContext(), MD);
+}
+
+void LLVMSetMetadata(LLVMValueRef Inst, unsigned KindID, LLVMValueRef Val) {
+ MDNode *N = Val ? extractMDNode(unwrap<MetadataAsValue>(Val)) : nullptr;
+
unwrap<Instruction>(Inst)->setMetadata(KindID, N);
}
@@ -795,7 +809,7 @@ void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char* name,
return;
if (!Val)
return;
- N->addOperand(cast<MDNode>(unwrap<MetadataAsValue>(Val)->getMetadata()));
+ N->addOperand(extractMDNode(unwrap<MetadataAsValue>(Val)));
}
/*--.. Operations on scalar constants ......................................--*/
diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp
index b9ab25651fa2..fa8d50ec160c 100644
--- a/lib/IR/LegacyPassManager.cpp
+++ b/lib/IR/LegacyPassManager.cpp
@@ -600,8 +600,7 @@ void PMTopLevelManager::schedulePass(Pass *P) {
// If P is an analysis pass and it is available then do not
// generate the analysis again. Stale analysis info should not be
// available at this point.
- const PassInfo *PI =
- PassRegistry::getPassRegistry()->getPassInfo(P->getPassID());
+ const PassInfo *PI = findAnalysisPassInfo(P->getPassID());
if (PI && PI->isAnalysis() && findAnalysisPass(P->getPassID())) {
delete P;
return;
@@ -619,7 +618,7 @@ void PMTopLevelManager::schedulePass(Pass *P) {
Pass *AnalysisPass = findAnalysisPass(*I);
if (!AnalysisPass) {
- const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+ const PassInfo *PI = findAnalysisPassInfo(*I);
if (!PI) {
// Pass P is not in the global PassRegistry
@@ -716,8 +715,7 @@ Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
return *I;
// If Pass not found then check the interfaces implemented by Immutable Pass
- const PassInfo *PassInf =
- PassRegistry::getPassRegistry()->getPassInfo(PI);
+ const PassInfo *PassInf = findAnalysisPassInfo(PI);
assert(PassInf && "Expected all immutable passes to be initialized");
const std::vector<const PassInfo*> &ImmPI =
PassInf->getInterfacesImplemented();
@@ -731,6 +729,17 @@ Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
return nullptr;
}
+const PassInfo *PMTopLevelManager::findAnalysisPassInfo(AnalysisID AID) const {
+ const PassInfo *&PI = AnalysisPassInfos[AID];
+ if (!PI)
+ PI = PassRegistry::getPassRegistry()->getPassInfo(AID);
+ else
+ assert(PI == PassRegistry::getPassRegistry()->getPassInfo(AID) &&
+ "The pass info pointer changed for an analysis ID!");
+
+ return PI;
+}
+
// Print passes managed by this top level manager.
void PMTopLevelManager::dumpPasses() const {
@@ -759,8 +768,7 @@ void PMTopLevelManager::dumpArguments() const {
dbgs() << "Pass Arguments: ";
for (SmallVectorImpl<ImmutablePass *>::const_iterator I =
ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
- if (const PassInfo *PI =
- PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) {
+ if (const PassInfo *PI = findAnalysisPassInfo((*I)->getPassID())) {
assert(PI && "Expected all immutable passes to be initialized");
if (!PI->isAnalysisGroup())
dbgs() << " -" << PI->getPassArgument();
@@ -824,7 +832,7 @@ void PMDataManager::recordAvailableAnalysis(Pass *P) {
// This pass is the current implementation of all of the interfaces it
// implements as well.
- const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI);
+ const PassInfo *PInf = TPM->findAnalysisPassInfo(PI);
if (!PInf) return;
const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
for (unsigned i = 0, e = II.size(); i != e; ++i)
@@ -957,7 +965,7 @@ void PMDataManager::freePass(Pass *P, StringRef Msg,
}
AnalysisID PI = P->getPassID();
- if (const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI)) {
+ if (const PassInfo *PInf = TPM->findAnalysisPassInfo(PI)) {
// Remove the pass itself (if it is not already removed).
AvailableAnalysis.erase(PI);
@@ -1037,7 +1045,7 @@ void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
for (SmallVectorImpl<AnalysisID>::iterator
I = ReqAnalysisNotAvailable.begin(),
E = ReqAnalysisNotAvailable.end() ;I != E; ++I) {
- const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+ const PassInfo *PI = TPM->findAnalysisPassInfo(*I);
Pass *AnalysisPass = PI->createPass();
this->addLowerLevelRequiredPass(P, AnalysisPass);
}
@@ -1142,7 +1150,7 @@ void PMDataManager::dumpPassArguments() const {
PMD->dumpPassArguments();
else
if (const PassInfo *PI =
- PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID()))
+ TPM->findAnalysisPassInfo((*I)->getPassID()))
if (!PI->isAnalysisGroup())
dbgs() << " -" << PI->getPassArgument();
}
@@ -1218,7 +1226,7 @@ void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P,
dbgs() << (const void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
for (unsigned i = 0; i != Set.size(); ++i) {
if (i) dbgs() << ',';
- const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]);
+ const PassInfo *PInf = TPM->findAnalysisPassInfo(Set[i]);
if (!PInf) {
// Some preserved passes, such as AliasAnalysis, may not be initialized by
// all drivers.
@@ -1658,8 +1666,8 @@ void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
OnTheFlyManagers[P] = FPP;
}
- const PassInfo * RequiredPassPI =
- PassRegistry::getPassRegistry()->getPassInfo(RequiredPass->getPassID());
+ const PassInfo *RequiredPassPI =
+ TPM->findAnalysisPassInfo(RequiredPass->getPassID());
Pass *FoundPass = nullptr;
if (RequiredPassPI && RequiredPassPI->isAnalysis()) {
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index 4dcf910e01a7..e2439abaf001 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -219,7 +219,7 @@ class ELFObjectWriter : public MCObjectWriter {
const MCSymbolData *SD, uint64_t C,
unsigned Type) const;
- void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
+ void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup,
MCValue Target, bool &IsPCRel,
uint64_t &FixedValue) override;
@@ -789,11 +789,13 @@ static const MCSymbol *getWeakRef(const MCSymbolRefExpr &Ref) {
return nullptr;
}
-void ELFObjectWriter::RecordRelocation(MCAssembler &Asm,
+void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCFragment *Fragment,
- const MCFixup &Fixup, MCValue Target,
- bool &IsPCRel, uint64_t &FixedValue) {
+ const MCFixup &Fixup,
+ MCValue Target,
+ bool &IsPCRel,
+ uint64_t &FixedValue) {
const MCSectionData *FixupSection = Fragment->getParent();
uint64_t C = Target.getConstant();
uint64_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index f7054902f24c..04cc0ff4a864 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp
@@ -27,7 +27,22 @@ bool MCAsmInfoDarwin::isSectionAtomizableBySymbols(
// contain.
// Sections holding 2 byte strings require symbols in order to be atomized.
// There is no dedicated section for 4 byte strings.
- if (SMO.getType() == MachO::S_CSTRING_LITERALS)
+ if (SMO.getKind().isMergeable1ByteCString())
+ return false;
+
+ if (SMO.getSegmentName() == "__TEXT" &&
+ SMO.getSectionName() == "__objc_classname" &&
+ SMO.getType() == MachO::S_CSTRING_LITERALS)
+ return false;
+
+ if (SMO.getSegmentName() == "__TEXT" &&
+ SMO.getSectionName() == "__objc_methname" &&
+ SMO.getType() == MachO::S_CSTRING_LITERALS)
+ return false;
+
+ if (SMO.getSegmentName() == "__TEXT" &&
+ SMO.getSectionName() == "__objc_methtype" &&
+ SMO.getType() == MachO::S_CSTRING_LITERALS)
return false;
if (SMO.getSegmentName() == "__DATA" && SMO.getSectionName() == "__cfstring")
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 45d49fae94bf..e3c2443f4a21 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -425,16 +425,6 @@ bool MCAssembler::isThumbFunc(const MCSymbol *Symbol) const {
return true;
}
-void MCAssembler::addLocalUsedInReloc(const MCSymbol &Sym) {
- assert(Sym.isTemporary());
- LocalsUsedInReloc.insert(&Sym);
-}
-
-bool MCAssembler::isLocalUsedInReloc(const MCSymbol &Sym) const {
- assert(Sym.isTemporary());
- return LocalsUsedInReloc.count(&Sym);
-}
-
bool MCAssembler::isSymbolLinkerVisible(const MCSymbol &Symbol) const {
// Non-temporary labels should always be visible to the linker.
if (!Symbol.isTemporary())
@@ -444,10 +434,8 @@ bool MCAssembler::isSymbolLinkerVisible(const MCSymbol &Symbol) const {
if (!Symbol.isInSection())
return false;
- if (isLocalUsedInReloc(Symbol))
- return true;
-
- return false;
+ // Otherwise, check if the section requires symbols even for temporary labels.
+ return getBackend().doesSectionRequireSymbols(Symbol.getSection());
}
const MCSymbolData *MCAssembler::getAtom(const MCSymbolData *SD) const {
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index 588d424120c4..d3751bd9ba57 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -448,11 +448,14 @@ void MachObjectWriter::WriteLinkerOptionsLoadCommand(
assert(OS.tell() - Start == Size);
}
-void MachObjectWriter::RecordRelocation(MCAssembler &Asm,
+
+void MachObjectWriter::RecordRelocation(const MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCFragment *Fragment,
- const MCFixup &Fixup, MCValue Target,
- bool &IsPCRel, uint64_t &FixedValue) {
+ const MCFixup &Fixup,
+ MCValue Target,
+ bool &IsPCRel,
+ uint64_t &FixedValue) {
TargetObjectWriter->RecordRelocation(this, Asm, Layout, Fragment, Fixup,
Target, FixedValue);
}
@@ -613,22 +616,6 @@ void MachObjectWriter::ComputeSymbolTable(
ExternalSymbolData[i].SymbolData->setIndex(Index++);
for (unsigned i = 0, e = UndefinedSymbolData.size(); i != e; ++i)
UndefinedSymbolData[i].SymbolData->setIndex(Index++);
-
- for (const MCSectionData &SD : Asm) {
- std::vector<RelAndSymbol> &Relocs = Relocations[&SD];
- for (RelAndSymbol &Rel : Relocs) {
- if (!Rel.Sym)
- continue;
-
- // Set the Index and the IsExtern bit.
- unsigned Index = Rel.Sym->getIndex();
- assert(isInt<24>(Index));
- if (IsLittleEndian)
- Rel.MRE.r_word1 = (Rel.MRE.r_word1 & (-1 << 24)) | Index | (1 << 27);
- else
- Rel.MRE.r_word1 = (Rel.MRE.r_word1 & 0xff) | Index << 8 | (1 << 4);
- }
- }
}
void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm,
@@ -675,6 +662,10 @@ void MachObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm,
// Mark symbol difference expressions in variables (from .set or = directives)
// as absolute.
markAbsoluteVariableSymbols(Asm, Layout);
+
+ // Compute symbol table information and bind symbol indices.
+ ComputeSymbolTable(Asm, LocalSymbolData, ExternalSymbolData,
+ UndefinedSymbolData);
}
bool MachObjectWriter::
@@ -758,10 +749,6 @@ IsSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
void MachObjectWriter::WriteObject(MCAssembler &Asm,
const MCAsmLayout &Layout) {
- // Compute symbol table information and bind symbol indices.
- ComputeSymbolTable(Asm, LocalSymbolData, ExternalSymbolData,
- UndefinedSymbolData);
-
unsigned NumSections = Asm.size();
const MCAssembler::VersionMinInfoType &VersionInfo =
Layout.getAssembler().getVersionMinInfo();
@@ -852,7 +839,7 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
uint64_t RelocTableEnd = SectionDataStart + SectionDataFileSize;
for (MCAssembler::const_iterator it = Asm.begin(),
ie = Asm.end(); it != ie; ++it) {
- std::vector<RelAndSymbol> &Relocs = Relocations[it];
+ std::vector<MachO::any_relocation_info> &Relocs = Relocations[it];
unsigned NumRelocs = Relocs.size();
uint64_t SectionStart = SectionDataStart + getSectionAddress(it);
WriteSection(Asm, Layout, *it, SectionStart, RelocTableEnd, NumRelocs);
@@ -946,10 +933,10 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
ie = Asm.end(); it != ie; ++it) {
// Write the section relocation entries, in reverse order to match 'as'
// (approximately, the exact algorithm is more complicated than this).
- std::vector<RelAndSymbol> &Relocs = Relocations[it];
+ std::vector<MachO::any_relocation_info> &Relocs = Relocations[it];
for (unsigned i = 0, e = Relocs.size(); i != e; ++i) {
- Write32(Relocs[e - i - 1].MRE.r_word0);
- Write32(Relocs[e - i - 1].MRE.r_word1);
+ Write32(Relocs[e - i - 1].r_word0);
+ Write32(Relocs[e - i - 1].r_word1);
}
}
diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp
index d8729bdbc4bb..c17f99b9bd7b 100644
--- a/lib/MC/WinCOFFObjectWriter.cpp
+++ b/lib/MC/WinCOFFObjectWriter.cpp
@@ -175,7 +175,7 @@ public:
const MCFragment &FB, bool InSet,
bool IsPCRel) const override;
- void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
+ void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup,
MCValue Target, bool &IsPCRel,
uint64_t &FixedValue) override;
@@ -661,9 +661,13 @@ bool WinCOFFObjectWriter::IsSymbolRefDifferenceFullyResolvedImpl(
InSet, IsPCRel);
}
-void WinCOFFObjectWriter::RecordRelocation(
- MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment,
- const MCFixup &Fixup, MCValue Target, bool &IsPCRel, uint64_t &FixedValue) {
+void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ bool &IsPCRel,
+ uint64_t &FixedValue) {
assert(Target.getSymA() && "Relocation must reference a symbol!");
const MCSymbol &Symbol = Target.getSymA()->getSymbol();
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index f923a9aa87ae..0838e90baaec 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -246,13 +246,21 @@ static Triple::ArchType parseARMArch(StringRef ArchName) {
if (ArchName.startswith("armv")) {
offset = 3;
- arch = Triple::arm;
+ if (ArchName.endswith("eb")) {
+ arch = Triple::armeb;
+ ArchName = ArchName.substr(0, ArchName.size() - 2);
+ } else
+ arch = Triple::arm;
} else if (ArchName.startswith("armebv")) {
offset = 5;
arch = Triple::armeb;
} else if (ArchName.startswith("thumbv")) {
offset = 5;
- arch = Triple::thumb;
+ if (ArchName.endswith("eb")) {
+ arch = Triple::thumbeb;
+ ArchName = ArchName.substr(0, ArchName.size() - 2);
+ } else
+ arch = Triple::thumb;
} else if (ArchName.startswith("thumbebv")) {
offset = 7;
arch = Triple::thumbeb;
@@ -271,6 +279,8 @@ static Triple::ArchType parseARMArch(StringRef ArchName) {
}
static Triple::ArchType parseArch(StringRef ArchName) {
+ Triple::ArchType ARMArch(parseARMArch(ArchName));
+
return StringSwitch<Triple::ArchType>(ArchName)
.Cases("i386", "i486", "i586", "i686", Triple::x86)
// FIXME: Do we need to support these?
@@ -280,9 +290,10 @@ static Triple::ArchType parseArch(StringRef ArchName) {
.Cases("powerpc64", "ppu", Triple::ppc64)
.Case("powerpc64le", Triple::ppc64le)
.Case("xscale", Triple::arm)
- .StartsWith("arm", parseARMArch(ArchName))
- .StartsWith("thumb", parseARMArch(ArchName))
- .StartsWith("aarch64", parseARMArch(ArchName))
+ .Case("xscaleeb", Triple::armeb)
+ .StartsWith("arm", ARMArch)
+ .StartsWith("thumb", ARMArch)
+ .StartsWith("aarch64", ARMArch)
.Case("msp430", Triple::msp430)
.Cases("mips", "mipseb", "mipsallegrex", Triple::mips)
.Cases("mipsel", "mipsallegrexel", Triple::mipsel)
@@ -379,6 +390,9 @@ static Triple::ObjectFormatType parseFormat(StringRef EnvironmentName) {
}
static Triple::SubArchType parseSubArch(StringRef SubArchName) {
+ if (SubArchName.endswith("eb"))
+ SubArchName = SubArchName.substr(0, SubArchName.size() - 2);
+
return StringSwitch<Triple::SubArchType>(SubArchName)
.EndsWith("v8", Triple::ARMSubArch_v8)
.EndsWith("v8a", Triple::ARMSubArch_v8)
@@ -1022,6 +1036,8 @@ const char *Triple::getARMCPUForArch(StringRef MArch) const {
offset = 5;
if (offset != StringRef::npos && MArch.substr(offset, 2) == "eb")
offset += 2;
+ if (MArch.endswith("eb"))
+ MArch = MArch.substr(0, MArch.size() - 2);
if (offset != StringRef::npos)
result = llvm::StringSwitch<const char *>(MArch.substr(offset))
.Cases("v2", "v2a", "arm2")
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 1a8040275ca8..a391e766deea 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -204,6 +204,44 @@ def RetCC_AArch64_WebKit_JS : CallingConv<[
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
]>;
+//===----------------------------------------------------------------------===//
+// ARM64 Calling Convention for GHC
+//===----------------------------------------------------------------------===//
+
+// This calling convention is specific to the Glasgow Haskell Compiler.
+// The only documentation is the GHC source code, specifically the C header
+// file:
+//
+// https://github.com/ghc/ghc/blob/master/includes/stg/MachRegs.h
+//
+// which defines the registers for the Spineless Tagless G-Machine (STG) that
+// GHC uses to implement lazy evaluation. The generic STG machine has a set of
+// registers which are mapped to appropriate set of architecture specific
+// registers for each CPU architecture.
+//
+// The STG Machine is documented here:
+//
+// https://ghc.haskell.org/trac/ghc/wiki/Commentary/Compiler/GeneratedCode
+//
+// The AArch64 register mapping is under the heading "The ARMv8/AArch64 ABI
+// register mapping".
+
+def CC_AArch64_GHC : CallingConv<[
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, f128], CCBitConvertToType<v2f64>>,
+
+ CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
+ CCIfType<[f32], CCAssignToReg<[S8, S9, S10, S11]>>,
+ CCIfType<[f64], CCAssignToReg<[D12, D13, D14, D15]>>,
+
+ // Promote i8/i16/i32 arguments to i64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+ CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>>
+]>;
+
// FIXME: LR is only callee-saved in the sense that *we* preserve it and are
// presumably a callee to someone. External functions may not do so, but this
// is currently safe since BL has LR as an implicit-def and what happens after a
@@ -249,3 +287,4 @@ def CSR_AArch64_AllRegs
(sequence "S%u", 0, 31), (sequence "D%u", 0, 31),
(sequence "Q%u", 0, 31))>;
+def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>;
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 419fbc8be6c9..ca4e97bd7c03 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -302,6 +302,8 @@ static unsigned getImplicitScaleFactor(MVT VT) {
CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
if (CC == CallingConv::WebKit_JS)
return CC_AArch64_WebKit_JS;
+ if (CC == CallingConv::GHC)
+ return CC_AArch64_GHC;
return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS;
}
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 66aa216db2c9..d8e91562e316 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -215,6 +215,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
bool HasFP = hasFP(MF);
DebugLoc DL = MBB.findDebugLoc(MBBI);
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
+ if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+ return;
+
int NumBytes = (int)MFI->getStackSize();
if (!AFI->hasStackFrame()) {
assert(!HasFP && "unexpected function without stack frame but with FP");
@@ -451,6 +456,11 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
int NumBytes = MFI->getStackSize();
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
+ if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+ return;
+
// Initial and residual are named for consistency with the prologue. Note that
// in the epilogue, the residual adjustment is executed first.
uint64_t ArgumentPopSize = 0;
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0d44f992a2ab..19f51ce9450c 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1990,6 +1990,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
llvm_unreachable("Unsupported calling convention.");
case CallingConv::WebKit_JS:
return CC_AArch64_WebKit_JS;
+ case CallingConv::GHC:
+ return CC_AArch64_GHC;
case CallingConv::C:
case CallingConv::Fast:
if (!Subtarget->isTargetDarwin())
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index d734d436add7..206cdbbe0c5b 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -33,6 +33,10 @@ using namespace llvm;
#define GET_REGINFO_TARGET_DESC
#include "AArch64GenRegisterInfo.inc"
+static cl::opt<bool>
+ReserveX18("aarch64-reserve-x18", cl::Hidden,
+ cl::desc("Reserve X18, making it unavailable as GPR"));
+
AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo *tii,
const AArch64Subtarget *sti)
: AArch64GenRegisterInfo(AArch64::LR), TII(tii), STI(sti) {}
@@ -40,6 +44,10 @@ AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo *tii,
const MCPhysReg *
AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
+ if (MF->getFunction()->getCallingConv() == CallingConv::GHC)
+ // GHC set of callee saved regs is empty as all those regs are
+ // used for passing STG regs around
+ return CSR_AArch64_NoRegs_SaveList;
if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg)
return CSR_AArch64_AllRegs_SaveList;
else
@@ -48,6 +56,9 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t *
AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+ if (CC == CallingConv::GHC)
+ // This is academic because all GHC calls are (supposed to be) tail calls
+ return CSR_AArch64_NoRegs_RegMask;
if (CC == CallingConv::AnyReg)
return CSR_AArch64_AllRegs_RegMask;
else
@@ -63,7 +74,7 @@ const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
}
const uint32_t *
-AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
+AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID CC) const {
// This should return a register mask that is the same as that returned by
// getCallPreservedMask but that additionally preserves the register used for
// the first i64 argument (which must also be the register used to return a
@@ -71,6 +82,7 @@ AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
//
// In case that the calling convention does not use the same register for
// both, the function should return NULL (does not currently apply)
+ assert(CC != CallingConv::GHC && "should not be GHC calling convention.");
return CSR_AArch64_AAPCS_ThisReturn_RegMask;
}
@@ -90,7 +102,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(AArch64::W29);
}
- if (STI->isTargetDarwin()) {
+ if (STI->isTargetDarwin() || ReserveX18) {
Reserved.set(AArch64::X18); // Platform register
Reserved.set(AArch64::W18);
}
@@ -117,7 +129,7 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
return true;
case AArch64::X18:
case AArch64::W18:
- return STI->isTargetDarwin();
+ return STI->isTargetDarwin() || ReserveX18;
case AArch64::FP:
case AArch64::W29:
return TFI->hasFP(MF) || STI->isTargetDarwin();
@@ -379,7 +391,7 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
case AArch64::GPR64commonRegClassID:
return 32 - 1 // XZR/SP
- (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP
- - STI->isTargetDarwin() // X18 reserved as platform register
+ - (STI->isTargetDarwin() || ReserveX18) // X18 reserved as platform register
- hasBasePointer(MF); // X19
case AArch64::FPR8RegClassID:
case AArch64::FPR16RegClassID:
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 423da6500c48..27cbac9f12e2 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -317,6 +317,42 @@ public:
MachO::CPU_SUBTYPE_ARM64_ALL);
}
+ bool doesSectionRequireSymbols(const MCSection &Section) const override {
+ // Any section for which the linker breaks things into atoms needs to
+ // preserve symbols, including assembler local symbols, to identify
+ // those atoms. These sections are:
+ // Sections of type:
+ //
+ // S_CSTRING_LITERALS (e.g. __cstring)
+ // S_LITERAL_POINTERS (e.g. objc selector pointers)
+ // S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS
+ //
+ // Sections named:
+ //
+ // __TEXT,__eh_frame
+ // __TEXT,__ustring
+ // __DATA,__cfstring
+ // __DATA,__objc_classrefs
+ // __DATA,__objc_catlist
+ //
+ // FIXME: It would be better if the compiler used actual linker local
+ // symbols for each of these sections rather than preserving what
+ // are ostensibly assembler local symbols.
+ const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section);
+ return (SMO.getType() == MachO::S_CSTRING_LITERALS ||
+ SMO.getType() == MachO::S_4BYTE_LITERALS ||
+ SMO.getType() == MachO::S_8BYTE_LITERALS ||
+ SMO.getType() == MachO::S_16BYTE_LITERALS ||
+ SMO.getType() == MachO::S_LITERAL_POINTERS ||
+ (SMO.getSegmentName() == "__TEXT" &&
+ (SMO.getSectionName() == "__eh_frame" ||
+ SMO.getSectionName() == "__ustring")) ||
+ (SMO.getSegmentName() == "__DATA" &&
+ (SMO.getSectionName() == "__cfstring" ||
+ SMO.getSectionName() == "__objc_classrefs" ||
+ SMO.getSectionName() == "__objc_catlist")));
+ }
+
/// \brief Generate the compact unwind encoding from the CFI directives.
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index f6fab5d6b6fe..e12a24be0a67 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -10,7 +10,6 @@
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -34,7 +33,7 @@ public:
: MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype,
/*UseAggressiveSymbolFolding=*/true) {}
- void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+ void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
const MCAsmLayout &Layout, const MCFragment *Fragment,
const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue) override;
@@ -113,25 +112,8 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
}
}
-static bool canUseLocalRelocation(const MCSectionMachO &Section,
- const MCSymbol &Symbol, unsigned Log2Size) {
- // Debug info sections can use local relocations.
- if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
- return true;
-
- // Otherwise, only pointer sized relocations are supported.
- if (Log2Size != 3)
- return false;
-
- // But only if they don't point to a cstring.
- if (!Symbol.isInSection())
- return true;
- const MCSectionMachO &RefSec = cast<MCSectionMachO>(Symbol.getSection());
- return RefSec.getType() != MachO::S_CSTRING_LITERALS;
-}
-
void AArch64MachObjectWriter::RecordRelocation(
- MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout,
+ MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue) {
unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
@@ -141,9 +123,9 @@ void AArch64MachObjectWriter::RecordRelocation(
unsigned Log2Size = 0;
int64_t Value = 0;
unsigned Index = 0;
+ unsigned IsExtern = 0;
unsigned Type = 0;
unsigned Kind = Fixup.getKind();
- const MCSymbolData *RelSymbol = nullptr;
FixupOffset += Fixup.getOffset();
@@ -189,8 +171,10 @@ void AArch64MachObjectWriter::RecordRelocation(
// FIXME: Should this always be extern?
// SymbolNum of 0 indicates the absolute section.
Type = MachO::ARM64_RELOC_UNSIGNED;
+ Index = 0;
if (IsPCRel) {
+ IsExtern = 1;
Asm.getContext().FatalError(Fixup.getLoc(),
"PC relative absolute relocation!");
@@ -214,12 +198,15 @@ void AArch64MachObjectWriter::RecordRelocation(
Layout.getSymbolOffset(&B_SD) ==
Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) {
// SymB is the PC, so use a PC-rel pointer-to-GOT relocation.
+ Index = A_Base->getIndex();
+ IsExtern = 1;
Type = MachO::ARM64_RELOC_POINTER_TO_GOT;
IsPCRel = 1;
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
- Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
+ MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
return;
} else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
@@ -265,30 +252,25 @@ void AArch64MachObjectWriter::RecordRelocation(
? 0
: Writer->getSymbolAddress(B_Base, Layout));
+ Index = A_Base->getIndex();
+ IsExtern = 1;
Type = MachO::ARM64_RELOC_UNSIGNED;
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
- Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
+ MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
- RelSymbol = B_Base;
+ Index = B_Base->getIndex();
+ IsExtern = 1;
Type = MachO::ARM64_RELOC_SUBTRACTOR;
} else { // A + constant
const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
- const MCSectionMachO &Section = static_cast<const MCSectionMachO &>(
- Fragment->getParent()->getSection());
-
- bool CanUseLocalRelocation =
- canUseLocalRelocation(Section, *Symbol, Log2Size);
- if (Symbol->isTemporary() && (Value || !CanUseLocalRelocation)) {
- const MCSection &Sec = Symbol->getSection();
- if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
- Asm.addLocalUsedInReloc(*Symbol);
- }
-
const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
const MCSymbolData *Base = Asm.getAtom(&SD);
+ const MCSectionMachO &Section = static_cast<const MCSectionMachO &>(
+ Fragment->getParent()->getSection());
// If the symbol is a variable and we weren't able to get a Base for it
// (i.e., it's not in the symbol table associated with a section) resolve
@@ -328,13 +310,16 @@ void AArch64MachObjectWriter::RecordRelocation(
// sections, and for pointer-sized relocations (.quad), we allow section
// relocations. It's code sections that run into trouble.
if (Base) {
- RelSymbol = Base;
+ Index = Base->getIndex();
+ IsExtern = 1;
// Add the local offset, if needed.
if (Base != &SD)
Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
} else if (Symbol->isInSection()) {
- if (!CanUseLocalRelocation)
+ // Pointer-sized relocations can use a local relocation. Otherwise,
+ // we have to be in a debug info section.
+ if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3)
Asm.getContext().FatalError(
Fixup.getLoc(),
"unsupported relocation of local symbol '" + Symbol->getName() +
@@ -344,6 +329,7 @@ void AArch64MachObjectWriter::RecordRelocation(
const MCSectionData &SymSD =
Asm.getSectionData(SD.getSymbol().getSection());
Index = SymSD.getOrdinal() + 1;
+ IsExtern = 0;
Value += Writer->getSymbolAddress(&SD, Layout);
if (IsPCRel)
@@ -376,16 +362,16 @@ void AArch64MachObjectWriter::RecordRelocation(
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 =
- (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
- Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+ MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
// Now set up the Addend relocation.
Type = MachO::ARM64_RELOC_ADDEND;
Index = Value;
- RelSymbol = nullptr;
IsPCRel = 0;
Log2Size = 2;
+ IsExtern = 0;
// Put zero into the instruction itself. The addend is in the relocation.
Value = 0;
@@ -397,9 +383,9 @@ void AArch64MachObjectWriter::RecordRelocation(
// struct relocation_info (8 bytes)
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 =
- (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
- Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+ MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
}
MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_ostream &OS,
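Every r_word1 expression rebuilt in this file encodes the plain (non-scattered) Mach-O relocation_info layout. A minimal standalone sketch of that packing, assuming the <mach-o/reloc.h> field widths (r_symbolnum:24, r_pcrel:1, r_length:2, r_extern:1, r_type:4); illustration only, not LLVM code:

#include <cassert>
#include <cstdint>

// Pack the second word of a Mach-O relocation_info the way the hunks
// above do: index in bits 0-23, PC-rel flag in bit 24, log2 size in
// bits 25-26, extern flag in bit 27, relocation type in bits 28-31.
static uint32_t packRelocWord1(uint32_t Index, bool IsPCRel,
                               unsigned Log2Size, bool IsExtern,
                               unsigned Type) {
  assert(Index < (1u << 24) && Log2Size < 4 && Type < 16);
  return (Index << 0) | (uint32_t(IsPCRel) << 24) | (Log2Size << 25) |
         (uint32_t(IsExtern) << 27) | (Type << 28);
}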
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index c429ac185211..fda3e815624d 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -567,10 +567,21 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
// MOV NewBase, Base
// ADDS NewBase, #imm8.
if (Base != NewBase && Offset >= 8) {
+ const ARMSubtarget &Subtarget = MBB.getParent()->getTarget()
+ .getSubtarget<ARMSubtarget>();
// Need to insert a MOV to the new base first.
- BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)
- .addReg(Base, getKillRegState(BaseKill))
- .addImm(Pred).addReg(PredReg);
+ if (isARMLowRegister(NewBase) && isARMLowRegister(Base) &&
+ !Subtarget.hasV6Ops()) {
+        // Pre-v6 Thumb has no lo->lo copies, and tMOVSr can't be predicated.
+ if (Pred != ARMCC::AL)
+ return false;
+ BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVSr), NewBase)
+ .addReg(Base, getKillRegState(BaseKill));
+ } else
+ BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)
+ .addReg(Base, getKillRegState(BaseKill))
+ .addImm(Pred).addReg(PredReg);
+
// Set up BaseKill and Base correctly to insert the ADDS/SUBS below.
Base = NewBase;
BaseKill = false;
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 6a00b28f37a7..96f3b4e64326 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -9191,27 +9191,39 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
// FIXME: This is duplicated in getARMFPUFeatures() in
// tools/clang/lib/Driver/Tools.cpp
static const struct {
- const unsigned Fpu;
+ const unsigned ID;
const uint64_t Enabled;
const uint64_t Disabled;
-} Fpus[] = {
- {ARM::VFP, ARM::FeatureVFP2, ARM::FeatureNEON},
- {ARM::VFPV2, ARM::FeatureVFP2, ARM::FeatureNEON},
- {ARM::VFPV3, ARM::FeatureVFP3, ARM::FeatureNEON},
- {ARM::VFPV3_D16, ARM::FeatureVFP3 | ARM::FeatureD16, ARM::FeatureNEON},
- {ARM::VFPV4, ARM::FeatureVFP4, ARM::FeatureNEON},
- {ARM::VFPV4_D16, ARM::FeatureVFP4 | ARM::FeatureD16, ARM::FeatureNEON},
- {ARM::FPV5_D16, ARM::FeatureFPARMv8 | ARM::FeatureD16,
- ARM::FeatureNEON | ARM::FeatureCrypto},
- {ARM::FP_ARMV8, ARM::FeatureFPARMv8,
- ARM::FeatureNEON | ARM::FeatureCrypto},
- {ARM::NEON, ARM::FeatureNEON, 0},
- {ARM::NEON_VFPV4, ARM::FeatureVFP4 | ARM::FeatureNEON, 0},
- {ARM::NEON_FP_ARMV8, ARM::FeatureFPARMv8 | ARM::FeatureNEON,
- ARM::FeatureCrypto},
- {ARM::CRYPTO_NEON_FP_ARMV8,
- ARM::FeatureFPARMv8 | ARM::FeatureNEON | ARM::FeatureCrypto, 0},
- {ARM::SOFTVFP, 0, 0},
+} FPUs[] = {
+ {ARM::VFP, ARM::FeatureVFP2, ARM::FeatureNEON},
+ {ARM::VFPV2, ARM::FeatureVFP2, ARM::FeatureNEON},
+ {ARM::VFPV3, ARM::FeatureVFP2 | ARM::FeatureVFP3, ARM::FeatureNEON},
+ {ARM::VFPV3_D16, ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureD16,
+ ARM::FeatureNEON},
+ {ARM::VFPV4, ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureVFP4,
+ ARM::FeatureNEON},
+ {ARM::VFPV4_D16,
+ ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureVFP4 | ARM::FeatureD16,
+ ARM::FeatureNEON},
+ {ARM::FPV5_D16, ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureVFP4 |
+ ARM::FeatureFPARMv8 | ARM::FeatureD16,
+ ARM::FeatureNEON | ARM::FeatureCrypto},
+ {ARM::FP_ARMV8, ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureVFP4 |
+ ARM::FeatureFPARMv8,
+ ARM::FeatureNEON | ARM::FeatureCrypto},
+ {ARM::NEON, ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureNEON, 0},
+ {ARM::NEON_VFPV4,
+ ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureVFP4 | ARM::FeatureNEON,
+ 0},
+ {ARM::NEON_FP_ARMV8,
+ ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureVFP4 |
+ ARM::FeatureFPARMv8 | ARM::FeatureNEON,
+ ARM::FeatureCrypto},
+ {ARM::CRYPTO_NEON_FP_ARMV8,
+ ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureVFP4 |
+ ARM::FeatureFPARMv8 | ARM::FeatureNEON | ARM::FeatureCrypto,
+ 0},
+ {ARM::SOFTVFP, 0, 0},
};
/// parseDirectiveFPU
@@ -9229,14 +9241,14 @@ bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
return false;
}
- for (const auto &Fpu : Fpus) {
- if (Fpu.Fpu != ID)
+ for (const auto &Entry : FPUs) {
+ if (Entry.ID != ID)
continue;
// Need to toggle features that should be on but are off and that
       // should be off but are on.
- uint64_t Toggle = (Fpu.Enabled & ~STI.getFeatureBits()) |
- (Fpu.Disabled & STI.getFeatureBits());
+ uint64_t Toggle = (Entry.Enabled & ~STI.getFeatureBits()) |
+ (Entry.Disabled & STI.getFeatureBits());
setAvailableFeatures(ComputeAvailableFeatures(STI.ToggleFeature(Toggle)));
break;
}
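The Toggle computation above picks out exactly the feature bits that must change state: required bits that are currently clear, plus forbidden bits that are currently set. A self-contained sketch with hypothetical bit values (the real masks live in the generated ARM subtarget tables):

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical feature masks, for illustration only.
  const uint64_t FeatureVFP2 = 1 << 0;
  const uint64_t FeatureVFP3 = 1 << 1;
  const uint64_t FeatureNEON = 1 << 2;

  uint64_t Current  = FeatureVFP2 | FeatureNEON; // subtarget state now
  uint64_t Enabled  = FeatureVFP2 | FeatureVFP3; // what the FPU requires
  uint64_t Disabled = FeatureNEON;               // what the FPU forbids

  // Same expression as parseDirectiveFPU: turn VFP3 on, turn NEON off.
  uint64_t Toggle = (Enabled & ~Current) | (Disabled & Current);
  printf("toggle = %#llx\n", (unsigned long long)Toggle);
  return 0;
}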
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 3187d36f7519..7da500390ed1 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -54,10 +54,10 @@ public:
: MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
/*UseAggressiveSymbolFolding=*/true) {}
- void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
- const MCAsmLayout &Layout, const MCFragment *Fragment,
- const MCFixup &Fixup, MCValue Target,
- uint64_t &FixedValue) override;
+ void RecordRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup,
+ MCValue Target, uint64_t &FixedValue) override;
};
}
@@ -232,7 +232,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
(IsPCRel << 30) |
MachO::R_SCATTERED);
MRE.r_word1 = Value2;
- Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ Writer->addRelocation(Fragment->getParent(), MRE);
}
MachO::any_relocation_info MRE;
@@ -243,7 +243,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
(IsPCRel << 30) |
MachO::R_SCATTERED);
MRE.r_word1 = Value;
- Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ Writer->addRelocation(Fragment->getParent(), MRE);
}
void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
@@ -297,7 +297,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
(IsPCRel << 30) |
MachO::R_SCATTERED);
MRE.r_word1 = Value2;
- Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ Writer->addRelocation(Fragment->getParent(), MRE);
}
MachO::any_relocation_info MRE;
@@ -307,7 +307,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
(IsPCRel << 30) |
MachO::R_SCATTERED);
MRE.r_word1 = Value;
- Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ Writer->addRelocation(Fragment->getParent(), MRE);
}
bool ARMMachObjectWriter::requiresExternRelocation(MachObjectWriter *Writer,
@@ -351,10 +351,11 @@ bool ARMMachObjectWriter::requiresExternRelocation(MachObjectWriter *Writer,
}
void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
- MCAssembler &Asm,
+ const MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCFragment *Fragment,
- const MCFixup &Fixup, MCValue Target,
+ const MCFixup &Fixup,
+ MCValue Target,
uint64_t &FixedValue) {
unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
unsigned Log2Size;
@@ -400,8 +401,8 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
// See <reloc.h>.
uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
unsigned Index = 0;
+ unsigned IsExtern = 0;
unsigned Type = 0;
- const MCSymbolData *RelSymbol = nullptr;
if (Target.isAbsolute()) { // constant
// FIXME!
@@ -421,7 +422,8 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
// Check whether we need an external or internal relocation.
if (requiresExternRelocation(Writer, Asm, *Fragment, RelocType, SD,
FixedValue)) {
- RelSymbol = SD;
+ IsExtern = 1;
+ Index = SD->getIndex();
// For external relocations, make sure to offset the fixup value to
// compensate for the addend of the symbol address, if it was
@@ -445,8 +447,11 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
// struct relocation_info (8 bytes)
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 =
- (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ MRE.r_word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (IsExtern << 27) |
+ (Type << 28));
// Even when it's not a scattered relocation, movw/movt always uses
// a PAIR relocation.
@@ -471,10 +476,10 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
(Log2Size << 25) |
(MachO::ARM_RELOC_PAIR << 28));
- Writer->addRelocation(nullptr, Fragment->getParent(), MREPair);
+ Writer->addRelocation(Fragment->getParent(), MREPair);
}
- Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+ Writer->addRelocation(Fragment->getParent(), MRE);
}
MCObjectWriter *llvm::createARMMachObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
index 2b459a4336b3..0c2407508869 100644
--- a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
+++ b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
@@ -15,6 +15,7 @@
#include "Hexagon.h"
#include "HexagonTargetMachine.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/StackProtector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Pass.h"
@@ -42,7 +43,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineFunctionAnalysis>();
AU.addPreserved<MachineFunctionAnalysis>();
- AU.addPreserved("stack-protector");
+ AU.addPreserved<StackProtector>();
FunctionPass::getAnalysisUsage(AU);
}
};
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index d4852c4ece40..74796954a528 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -497,14 +497,14 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
SDValue JumpTarget = Callee;
// T9 should contain the address of the callee function if
- // -reloction-model=pic or it is an indirect call.
+ // -relocation-model=pic or it is an indirect call.
if (IsPICCall || !GlobalOrExternal) {
unsigned V0Reg = Mips::V0;
if (NeedMips16Helper) {
RegsToPass.push_front(std::make_pair(V0Reg, Callee));
JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction, getPointerTy());
ExternalSymbolSDNode *S = cast<ExternalSymbolSDNode>(JumpTarget);
- JumpTarget = getAddrGlobal(S, JumpTarget.getValueType(), DAG,
+ JumpTarget = getAddrGlobal(S, CLI.DL, JumpTarget.getValueType(), DAG,
MipsII::MO_GOT, Chain,
FuncInfo->callPtrInfo(S->getSymbol()));
} else
diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
index 6d6735b3aae6..185d12ec93fd 100644
--- a/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -756,7 +756,7 @@ def : MipsPat<(setge f32:$lhs, f32:$rhs), (CMP_LT_S f32:$rhs, f32:$lhs)>,
ISA_MIPS32R6;
def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LT_S f32:$lhs, f32:$rhs)>,
ISA_MIPS32R6;
-def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LE_S f32:$lhs, f32:$rhs)>,
+def : MipsPat<(setle f32:$lhs, f32:$rhs), (CMP_LE_S f32:$lhs, f32:$rhs)>,
ISA_MIPS32R6;
def : MipsPat<(setne f32:$lhs, f32:$rhs),
(NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
@@ -776,7 +776,7 @@ def : MipsPat<(setge f64:$lhs, f64:$rhs), (CMP_LT_D f64:$rhs, f64:$lhs)>,
ISA_MIPS32R6;
def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LT_D f64:$lhs, f64:$rhs)>,
ISA_MIPS32R6;
-def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LE_D f64:$lhs, f64:$rhs)>,
+def : MipsPat<(setle f64:$lhs, f64:$rhs), (CMP_LE_D f64:$lhs, f64:$rhs)>,
ISA_MIPS32R6;
def : MipsPat<(setne f64:$lhs, f64:$rhs),
(NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 99fd739c3ed0..d25f5637f57c 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -1613,22 +1613,22 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine()))
// %gp_rel relocation
- return getAddrGPRel(N, Ty, DAG);
+ return getAddrGPRel(N, SDLoc(N), Ty, DAG);
// %hi/%lo relocation
- return getAddrNonPIC(N, Ty, DAG);
+ return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
}
if (GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa<Function>(GV)))
- return getAddrLocal(N, Ty, DAG,
+ return getAddrLocal(N, SDLoc(N), Ty, DAG,
Subtarget.isABI_N32() || Subtarget.isABI_N64());
if (LargeGOT)
- return getAddrGlobalLargeGOT(N, Ty, DAG, MipsII::MO_GOT_HI16,
+ return getAddrGlobalLargeGOT(N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16,
MipsII::MO_GOT_LO16, DAG.getEntryNode(),
MachinePointerInfo::getGOT());
- return getAddrGlobal(N, Ty, DAG,
+ return getAddrGlobal(N, SDLoc(N), Ty, DAG,
(Subtarget.isABI_N32() || Subtarget.isABI_N64())
? MipsII::MO_GOT_DISP
: MipsII::MO_GOT16,
@@ -1642,9 +1642,9 @@ SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
!Subtarget.isABI_N64())
- return getAddrNonPIC(N, Ty, DAG);
+ return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
- return getAddrLocal(N, Ty, DAG,
+ return getAddrLocal(N, SDLoc(N), Ty, DAG,
Subtarget.isABI_N32() || Subtarget.isABI_N64());
}
@@ -1735,9 +1735,9 @@ lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
!Subtarget.isABI_N64())
- return getAddrNonPIC(N, Ty, DAG);
+ return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
- return getAddrLocal(N, Ty, DAG,
+ return getAddrLocal(N, SDLoc(N), Ty, DAG,
Subtarget.isABI_N32() || Subtarget.isABI_N64());
}
@@ -1754,12 +1754,12 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
if (TLOF.IsConstantInSmallSection(N->getConstVal(), getTargetMachine()))
// %gp_rel relocation
- return getAddrGPRel(N, Ty, DAG);
+ return getAddrGPRel(N, SDLoc(N), Ty, DAG);
- return getAddrNonPIC(N, Ty, DAG);
+ return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
}
- return getAddrLocal(N, Ty, DAG,
+ return getAddrLocal(N, SDLoc(N), Ty, DAG,
Subtarget.isABI_N32() || Subtarget.isABI_N64());
}
@@ -2681,15 +2681,15 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InternalLinkage = Val->hasInternalLinkage();
if (InternalLinkage)
- Callee = getAddrLocal(G, Ty, DAG,
+ Callee = getAddrLocal(G, DL, Ty, DAG,
Subtarget.isABI_N32() || Subtarget.isABI_N64());
else if (LargeGOT) {
- Callee = getAddrGlobalLargeGOT(G, Ty, DAG, MipsII::MO_CALL_HI16,
+ Callee = getAddrGlobalLargeGOT(G, DL, Ty, DAG, MipsII::MO_CALL_HI16,
MipsII::MO_CALL_LO16, Chain,
FuncInfo->callPtrInfo(Val));
IsCallReloc = true;
} else {
- Callee = getAddrGlobal(G, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+ Callee = getAddrGlobal(G, DL, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
FuncInfo->callPtrInfo(Val));
IsCallReloc = true;
}
@@ -2702,15 +2702,15 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const char *Sym = S->getSymbol();
if (!Subtarget.isABI_N64() && !IsPIC) // !N64 && static
- Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(),
- MipsII::MO_NO_FLAG);
+ Callee =
+ DAG.getTargetExternalSymbol(Sym, getPointerTy(), MipsII::MO_NO_FLAG);
else if (LargeGOT) {
- Callee = getAddrGlobalLargeGOT(S, Ty, DAG, MipsII::MO_CALL_HI16,
+ Callee = getAddrGlobalLargeGOT(S, DL, Ty, DAG, MipsII::MO_CALL_HI16,
MipsII::MO_CALL_LO16, Chain,
FuncInfo->callPtrInfo(Sym));
IsCallReloc = true;
} else { // N64 || PIC
- Callee = getAddrGlobal(S, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+ Callee = getAddrGlobal(S, DL, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
FuncInfo->callPtrInfo(Sym));
IsCallReloc = true;
}
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 71f140b7a656..4132de6dbcad 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -272,9 +272,8 @@ namespace llvm {
//
// (add (load (wrapper $gp, %got(sym)), %lo(sym))
template <class NodeTy>
- SDValue getAddrLocal(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+ SDValue getAddrLocal(NodeTy *N, SDLoc DL, EVT Ty, SelectionDAG &DAG,
bool IsN32OrN64) const {
- SDLoc DL(N);
unsigned GOTFlag = IsN32OrN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
getTargetNode(N, Ty, DAG, GOTFlag));
@@ -291,11 +290,10 @@ namespace llvm {
// computing a global symbol's address:
//
// (load (wrapper $gp, %got(sym)))
- template<class NodeTy>
- SDValue getAddrGlobal(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+ template <class NodeTy>
+ SDValue getAddrGlobal(NodeTy *N, SDLoc DL, EVT Ty, SelectionDAG &DAG,
unsigned Flag, SDValue Chain,
const MachinePointerInfo &PtrInfo) const {
- SDLoc DL(N);
SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
getTargetNode(N, Ty, DAG, Flag));
return DAG.getLoad(Ty, DL, Chain, Tgt, PtrInfo, false, false, false, 0);
@@ -305,14 +303,13 @@ namespace llvm {
// computing a global symbol's address in large-GOT mode:
//
// (load (wrapper (add %hi(sym), $gp), %lo(sym)))
- template<class NodeTy>
- SDValue getAddrGlobalLargeGOT(NodeTy *N, EVT Ty, SelectionDAG &DAG,
- unsigned HiFlag, unsigned LoFlag,
- SDValue Chain,
+ template <class NodeTy>
+ SDValue getAddrGlobalLargeGOT(NodeTy *N, SDLoc DL, EVT Ty,
+ SelectionDAG &DAG, unsigned HiFlag,
+ unsigned LoFlag, SDValue Chain,
const MachinePointerInfo &PtrInfo) const {
- SDLoc DL(N);
- SDValue Hi = DAG.getNode(MipsISD::Hi, DL, Ty,
- getTargetNode(N, Ty, DAG, HiFlag));
+ SDValue Hi =
+ DAG.getNode(MipsISD::Hi, DL, Ty, getTargetNode(N, Ty, DAG, HiFlag));
Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
getTargetNode(N, Ty, DAG, LoFlag));
@@ -324,9 +321,9 @@ namespace llvm {
// computing a symbol's address in non-PIC mode:
//
// (add %hi(sym), %lo(sym))
- template<class NodeTy>
- SDValue getAddrNonPIC(NodeTy *N, EVT Ty, SelectionDAG &DAG) const {
- SDLoc DL(N);
+ template <class NodeTy>
+ SDValue getAddrNonPIC(NodeTy *N, SDLoc DL, EVT Ty,
+ SelectionDAG &DAG) const {
SDValue Hi = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_HI);
SDValue Lo = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_LO);
return DAG.getNode(ISD::ADD, DL, Ty,
@@ -338,9 +335,8 @@ namespace llvm {
// computing a symbol's address using gp-relative addressing:
//
// (add $gp, %gp_rel(sym))
- template<class NodeTy>
- SDValue getAddrGPRel(NodeTy *N, EVT Ty, SelectionDAG &DAG) const {
- SDLoc DL(N);
+ template <class NodeTy>
+ SDValue getAddrGPRel(NodeTy *N, SDLoc DL, EVT Ty, SelectionDAG &DAG) const {
assert(Ty == MVT::i32);
SDValue GPRel = getTargetNode(N, Ty, DAG, MipsII::MO_GPREL);
return DAG.getNode(ISD::ADD, DL, Ty,
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
index 69fc86e75414..c343980c5014 100644
--- a/lib/Target/NVPTX/NVPTXAllocaHoisting.h
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_NVPTX_NVPTXALLOCAHOISTING_H
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/StackProtector.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Pass.h"
@@ -32,8 +33,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DataLayoutPass>();
- AU.addPreserved("stack-protector");
AU.addPreserved<MachineFunctionAnalysis>();
+ AU.addPreserved<StackProtector>();
}
const char *getPassName() const override {
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
index 8759406a6803..da301d5de62d 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
@@ -16,6 +16,7 @@
#define LLVM_LIB_TARGET_NVPTX_NVPTXLOWERAGGRCOPIES_H
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/StackProtector.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Pass.h"
@@ -29,8 +30,8 @@ struct NVPTXLowerAggrCopies : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DataLayoutPass>();
- AU.addPreserved("stack-protector");
AU.addPreserved<MachineFunctionAnalysis>();
+ AU.addPreserved<StackProtector>();
}
bool runOnFunction(Function &F) override;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index f7259b9a098c..df2f14a91a7e 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -41,7 +41,7 @@ public:
: MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
/*UseAggressiveSymbolFolding=*/Is64Bit) {}
- void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+ void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
const MCAsmLayout &Layout, const MCFragment *Fragment,
const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue) override {
@@ -282,7 +282,7 @@ bool PPCMachObjectWriter::RecordScatteredRelocation(
MachO::any_relocation_info MRE;
makeScatteredRelocationInfo(MRE, other_half, MachO::GENERIC_RELOC_PAIR,
Log2Size, IsPCRel, Value2);
- Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ Writer->addRelocation(Fragment->getParent(), MRE);
} else {
// If the offset is more than 24-bits, it won't fit in a scattered
// relocation offset field, so we fall back to using a non-scattered
@@ -296,7 +296,7 @@ bool PPCMachObjectWriter::RecordScatteredRelocation(
}
MachO::any_relocation_info MRE;
makeScatteredRelocationInfo(MRE, FixupOffset, Type, Log2Size, IsPCRel, Value);
- Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ Writer->addRelocation(Fragment->getParent(), MRE);
return true;
}
@@ -331,9 +331,9 @@ void PPCMachObjectWriter::RecordPPCRelocation(
// See <reloc.h>.
const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
unsigned Index = 0;
+ unsigned IsExtern = 0;
unsigned Type = RelocType;
- const MCSymbolData *RelSymbol = nullptr;
if (Target.isAbsolute()) { // constant
// SymbolNum of 0 indicates the absolute section.
//
@@ -355,7 +355,8 @@ void PPCMachObjectWriter::RecordPPCRelocation(
// Check whether we need an external or internal relocation.
if (Writer->doesSymbolRequireExternRelocation(SD)) {
- RelSymbol = SD;
+ IsExtern = 1;
+ Index = SD->getIndex();
// For external relocations, make sure to offset the fixup value to
// compensate for the addend of the symbol address, if it was
// undefined. This occurs with weak definitions, for example.
@@ -374,8 +375,9 @@ void PPCMachObjectWriter::RecordPPCRelocation(
// struct relocation_info (8 bytes)
MachO::any_relocation_info MRE;
- makeRelocationInfo(MRE, FixupOffset, Index, IsPCRel, Log2Size, false, Type);
- Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+ makeRelocationInfo(MRE, FixupOffset, Index, IsPCRel, Log2Size, IsExtern,
+ Type);
+ Writer->addRelocation(Fragment->getParent(), MRE);
}
MCObjectWriter *llvm::createPPCMachObjectWriter(raw_ostream &OS, bool Is64Bit,
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index fcf9eca80e96..c6600550126e 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -77,7 +77,11 @@ extern Target TheGCNTarget;
namespace AMDGPU {
enum TargetIndex {
- TI_CONSTDATA_START
+ TI_CONSTDATA_START,
+ TI_SCRATCH_RSRC_DWORD0,
+ TI_SCRATCH_RSRC_DWORD1,
+ TI_SCRATCH_RSRC_DWORD2,
+ TI_SCRATCH_RSRC_DWORD3
};
}
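The four new indices name the four 32-bit words of the buffer resource descriptor through which scratch memory is addressed; on SI-class hardware that descriptor is 128 bits wide. A schematic sketch, with field meanings approximated for illustration:

#include <cstdint>

// Schematic layout of the descriptor behind TI_SCRATCH_RSRC_DWORD0..3.
// The exact bitfields are hardware-defined; this only shows why one
// target index per dword is needed.
struct ScratchRsrcDesc {
  uint32_t Dword0; // base address, low bits
  uint32_t Dword1; // base address, high bits, plus stride
  uint32_t Dword2; // number of records
  uint32_t Dword3; // format and flag bits
};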
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 8a5ca613dc80..1df4448abf05 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -92,6 +92,11 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"true",
"Support flat address space">;
+def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
+ "EnableVGPRSpilling",
+ "true",
+ "Enable spilling of VGPRs to scratch memory">;
+
class SubtargetFeatureFetchLimit <string Value> :
SubtargetFeature <"fetch"#Value,
"TexVTXClauseSize",
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 624f3919b409..6185e367ff50 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -116,7 +116,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
SIProgramInfo KernelInfo;
if (STM.isAmdHsaOS()) {
- OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
getSIProgramInfo(KernelInfo, MF);
EmitAmdKernelCodeT(MF, KernelInfo);
OutStreamer.EmitCodeAlignment(2 << (MF.getAlignment() - 1));
@@ -421,6 +420,7 @@ static unsigned getRsrcReg(unsigned ShaderType) {
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &KernelInfo) {
+ const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned RsrcReg = getRsrcReg(MFI->getShaderType());
@@ -441,6 +441,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer.EmitIntValue(RsrcReg, 4);
OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
+ if (STM.isVGPRSpillingEnabled(MFI)) {
+ OutStreamer.EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
+ OutStreamer.EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+ }
}
if (MFI->getShaderType() == ShaderType::PIXEL) {
@@ -504,6 +508,19 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
header.wavefront_size = STM.getWavefrontSize();
+ const MCSectionELF *VersionSection = OutContext.getELFSection(".hsa.version",
+ ELF::SHT_PROGBITS, 0, SectionKind::getReadOnly());
+ OutStreamer.SwitchSection(VersionSection);
+ OutStreamer.EmitBytes(Twine("HSA Code Unit:" +
+ Twine(header.hsail_version_major) + "." +
+ Twine(header.hsail_version_minor) + ":" +
+ "AMD:" +
+ Twine(header.amd_code_version_major) + "." +
+ Twine(header.amd_code_version_minor) + ":" +
+ "GFX8.1:0").str());
+
+ OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+
if (isVerbose()) {
OutStreamer.emitRawComment("amd_code_version_major = " +
Twine(header.amd_code_version_major), false);
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index eaa506db96c3..15112c7e54d4 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -417,6 +417,28 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
N->getValueType(0), Ops);
}
+ case ISD::LOAD: {
+    // To simplify the TableGen patterns, we replace all i64 loads with
+    // v2i32 loads. We could instead promote i64 loads to v2i32 during DAG
+    // legalization; however, some places in the DAG legalizer (such as
+    // ExpandUnalignedLoad) assume that i64 loads stay i64 when the type is
+    // legal, so doing this promotion early can cause problems.
+ EVT VT = N->getValueType(0);
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
+ break;
+
+ SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
+ LD->getBasePtr(), LD->getMemOperand());
+ SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
+ MVT::i64, NewLoad);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
+ SelectCode(NewLoad.getNode());
+ N = BitCast.getNode();
+ break;
+ }
+
case AMDGPUISD::REGISTER_LOAD: {
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
@@ -962,16 +984,27 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
const SITargetLowering& Lowering =
*static_cast<const SITargetLowering*>(getTargetLowering());
- unsigned ScratchPtrReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
unsigned ScratchOffsetReg =
TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass,
ScratchOffsetReg, MVT::i32);
+ SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32);
+ SDValue ScratchRsrcDword0 =
+ SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0);
- SDValue ScratchPtr =
- CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
- MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64);
+ SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32);
+ SDValue ScratchRsrcDword1 =
+ SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0);
+
+ const SDValue RsrcOps[] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
+ ScratchRsrcDword0,
+ CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+ ScratchRsrcDword1,
+ CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
+ };
+ SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::v2i32, RsrcOps), 0);
Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0);
SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
@@ -988,22 +1021,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
}
}
- // (add FI, n0)
- if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
- isa<FrameIndexSDNode>(Addr.getOperand(0))) {
- VAddr = Addr.getOperand(1);
- ImmOffset = Addr.getOperand(0);
- return true;
- }
-
- // (FI)
- if (isa<FrameIndexSDNode>(Addr)) {
- VAddr = SDValue(CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
- CurDAG->getConstant(0, MVT::i32)), 0);
- ImmOffset = Addr;
- return true;
- }
-
// (node)
VAddr = Addr;
ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
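The i64-to-v2i32 load rewrite above is sound because both forms read the same 64 bits from the same address; the bitcast merely reinterprets them. A host-side analogue of that equivalence (AMDGPU is little-endian, so the low dword comes first):

#include <cstdint>
#include <cstring>

// Load a 64-bit value as two 32-bit halves and reassemble it; this is
// what the v2i32 load plus BITCAST in the hunk above amounts to.
static uint64_t loadAsTwoDwords(const void *P) {
  uint32_t Lo, Hi;
  std::memcpy(&Lo, P, 4);
  std::memcpy(&Hi, static_cast<const char *>(P) + 4, 4);
  return (uint64_t(Hi) << 32) | Lo;
}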
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 206050d54a02..2adcdf1c299e 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -187,9 +187,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
- setOperationAction(ISD::LOAD, MVT::i64, Promote);
- AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
-
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index 5beaa6841c94..e34a7b7345f1 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -341,8 +341,39 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
// instead.
namespace llvm {
namespace AMDGPU {
-int getMCOpcode(uint16_t Opcode, unsigned Gen) {
+static int getMCOpcode(uint16_t Opcode, unsigned Gen) {
return getMCOpcodeGen(Opcode, (enum Subtarget)Gen);
}
}
}
+
+// This must be kept in sync with the SISubtarget class in SIInstrInfo.td
+enum SISubtarget {
+ SI = 0,
+ VI = 1
+};
+
+enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) {
+ switch (Gen) {
+ default:
+ return SI;
+ case AMDGPUSubtarget::VOLCANIC_ISLANDS:
+ return VI;
+ }
+}
+
+int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
+ int MCOp = AMDGPU::getMCOpcode(Opcode,
+ AMDGPUSubtargetToSISubtarget(RI.ST.getGeneration()));
+
+ // -1 means that Opcode is already a native instruction.
+ if (MCOp == -1)
+ return Opcode;
+
+ // (uint16_t)-1 means that Opcode is a pseudo instruction that has
+ // no encoding in the given subtarget generation.
+ if (MCOp == (uint16_t)-1)
+ return -1;
+
+ return MCOp;
+}
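pseudoToMCOpcode has to distinguish two different "missing" answers from the generated table: -1 (Opcode was never a pseudo) and the uint16_t value 0xFFFF (a pseudo with no encoding on this generation). A compact sketch of that sentinel handling, mirroring the function above:

#include <cstdint>

// MCOp is what AMDGPU::getMCOpcode returned for Opcode.
static int resolveOpcode(int Opcode, int MCOp) {
  if (MCOp == -1)           // already a native instruction
    return Opcode;
  if (MCOp == (uint16_t)-1) // 0xFFFF: pseudo with no encoding here
    return -1;
  return MCOp;              // the real target-specific opcode
}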
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index da9833d25a52..e28ce0f03acc 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -135,6 +135,11 @@ public:
bool isRegisterStore(const MachineInstr &MI) const;
bool isRegisterLoad(const MachineInstr &MI) const;
+  /// \brief Return a target-specific MC opcode if Opcode is a pseudo
+  /// instruction. Return -1 if the target-specific opcode for the pseudo
+  /// instruction does not exist. If Opcode is not a pseudo instruction,
+  /// Opcode itself is returned.
+ int pseudoToMCOpcode(int Opcode) const;
+
//===---------------------------------------------------------------------===//
// Pure virtual functions to be implemented by sub-classes.
//===---------------------------------------------------------------------===//
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index 1995ef2b0c9e..03aa32d05ef0 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
@@ -39,29 +40,17 @@ AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st):
Ctx(ctx), ST(st)
{ }
-enum AMDGPUMCInstLower::SISubtarget
-AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned Gen) const {
- switch (Gen) {
- default:
- return AMDGPUMCInstLower::SI;
- case AMDGPUSubtarget::VOLCANIC_ISLANDS:
- return AMDGPUMCInstLower::VI;
- }
-}
-
-unsigned AMDGPUMCInstLower::getMCOpcode(unsigned MIOpcode) const {
-
- int MCOpcode = AMDGPU::getMCOpcode(MIOpcode,
- AMDGPUSubtargetToSISubtarget(ST.getGeneration()));
- if (MCOpcode == -1)
- MCOpcode = MIOpcode;
+void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
- return MCOpcode;
-}
+ int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
-void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+ if (MCOpcode == -1) {
+ LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
+ C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
+ "a target-specific version: " + Twine(MI->getOpcode()));
+ }
- OutMI.setOpcode(getMCOpcode(MI->getOpcode()));
+ OutMI.setOpcode(MCOpcode);
for (const MachineOperand &MO : MI->explicit_operands()) {
MCOperand MCOp;
@@ -91,6 +80,12 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
MCOp = MCOperand::CreateExpr(Expr);
break;
}
+ case MachineOperand::MO_ExternalSymbol: {
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(MO.getSymbolName()));
+ const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+ MCOp = MCOperand::CreateExpr(Expr);
+ break;
+ }
}
OutMI.addOperand(MCOp);
}
diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h
index 0ae4d11bf1d1..d322fe072b2b 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.h
+++ b/lib/Target/R600/AMDGPUMCInstLower.h
@@ -19,23 +19,9 @@ class MCContext;
class MCInst;
class AMDGPUMCInstLower {
-
- // This must be kept in sync with the SISubtarget class in SIInstrInfo.td
- enum SISubtarget {
- SI = 0,
- VI = 1
- };
-
MCContext &Ctx;
const AMDGPUSubtarget &ST;
- /// Convert a member of the AMDGPUSubtarget::Generation enum to the
- /// SISubtarget enum.
- enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) const;
-
- /// Get the MC opcode for this MachineInstr.
- unsigned getMCOpcode(unsigned MIOpcode) const;
-
public:
AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST);
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index 597e558e6634..b1c7498fc142 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -18,7 +18,9 @@
#include "R600MachineScheduler.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;
@@ -78,6 +80,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS,
FlatAddressSpace(false), EnableIRStructurizer(true),
EnablePromoteAlloca(false), EnableIfCvt(true),
EnableLoadStoreOpt(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
+ EnableVGPRSpilling(false),
DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))),
FrameLowering(TargetFrameLowering::StackGrowsUp,
64 * 16, // Maximum stack alignment (long16)
@@ -113,3 +116,26 @@ unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const {
case SEA_ISLANDS: return 12;
}
}
+
+bool AMDGPUSubtarget::isVGPRSpillingEnabled(
+ const SIMachineFunctionInfo *MFI) const {
+ return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling;
+}
+
+void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+ MachineInstr *begin,
+ MachineInstr *end,
+ unsigned NumRegionInstrs) const {
+ if (getGeneration() >= SOUTHERN_ISLANDS) {
+
+ // Track register pressure so the scheduler can try to decrease
+ // pressure once register usage is above the threshold defined by
+ // SIRegisterInfo::getRegPressureSetLimit()
+ Policy.ShouldTrackPressure = true;
+
+    // Enabling both top down and bottom up scheduling seems to give us fewer
+ // register spills than just using one of these approaches on its own.
+ Policy.OnlyTopDown = false;
+ Policy.OnlyBottomUp = false;
+ }
+}
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 90179d79d25d..566b45c1dccc 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -30,6 +30,8 @@
namespace llvm {
+class SIMachineFunctionInfo;
+
class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
public:
@@ -63,6 +65,7 @@ private:
unsigned WavefrontSize;
bool CFALUBug;
int LocalMemorySize;
+ bool EnableVGPRSpilling;
const DataLayout DL;
AMDGPUFrameLowering FrameLowering;
@@ -206,6 +209,10 @@ public:
return getGeneration() <= NORTHERN_ISLANDS;
}
+ void overrideSchedPolicy(MachineSchedPolicy &Policy,
+ MachineInstr *begin, MachineInstr *end,
+ unsigned NumRegionInstrs) const override;
+
// Helper functions to simplify if statements
bool isTargetELF() const {
return false;
@@ -224,6 +231,15 @@ public:
bool isAmdHsaOS() const {
return TargetTriple.getOS() == Triple::AMDHSA;
}
+ bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const;
+
+ unsigned getMaxWavesPerCU() const {
+ if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ return 10;
+
+    // FIXME: Not sure what this is for other subtargets.
+ llvm_unreachable("do not know max waves per CU for this subtarget.");
+ }
};
} // End namespace llvm
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index d0c634fb7e42..5fb311b3016b 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -29,7 +29,7 @@ public:
const MCAsmLayout &Layout) override {
//XXX: Implement if necessary.
}
- void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
+ void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup,
MCValue Target, bool &IsPCRel,
uint64_t &FixedValue) override {
diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
index 73a9c73d8e7b..7601794beab8 100644
--- a/lib/Target/R600/SIDefines.h
+++ b/lib/Target/R600/SIDefines.h
@@ -163,4 +163,8 @@ namespace SIOutMods {
#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
+#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+
+
#endif
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 0a3fa2f930d7..6b2ea0682a43 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -588,6 +588,12 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
+
+ if (Info->getShaderType() != ShaderType::COMPUTE) {
+ unsigned ScratchIdx = CCInfo.getFirstUnallocated(
+ AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs());
+ Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
+ }
return Chain;
}
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 99a1df36c1f4..09c0cbe8f5c3 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -85,49 +85,41 @@ class Enc64 {
let Uses = [EXEC] in {
-class VOPCCommon <dag ins, string asm, list<dag> pattern> :
- InstSI <(outs VCCReg:$dst), ins, asm, pattern> {
+class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
- let DisableEncoding = "$dst";
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
- let VOPC = 1;
let VALU = 1;
+}
+
+class VOPCCommon <dag ins, string asm, list<dag> pattern> :
+ VOPAnyCommon <(outs VCCReg:$dst), ins, asm, pattern> {
+
+ let DisableEncoding = "$dst";
+ let VOPC = 1;
let Size = 4;
}
class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern> {
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
+ VOPAnyCommon <outs, ins, asm, pattern> {
+
let VOP1 = 1;
- let VALU = 1;
let Size = 4;
}
class VOP2Common <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern> {
+ VOPAnyCommon <outs, ins, asm, pattern> {
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
let VOP2 = 1;
- let VALU = 1;
let Size = 4;
}
class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern> {
+ VOPAnyCommon <outs, ins, asm, pattern> {
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
// Using complex patterns gives VOP3 patterns a very high complexity rating,
  // but standalone patterns are almost always preferred, so we need to adjust the
// priority lower. The goal is to use a high number to reduce complexity to
@@ -135,8 +127,6 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
let AddedComplexity = -1000;
let VOP3 = 1;
- let VALU = 1;
-
int Size = 8;
}
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 1a4c0d4e57b0..80b560eb65ae 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -430,15 +430,6 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
return AMDGPU::COPY;
}
-static bool shouldTryToSpillVGPRs(MachineFunction *MF) {
-
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
-
- // FIXME: Implement spilling for other shader types.
- return MFI->getShaderType() == ShaderType::COMPUTE;
-
-}
-
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned SrcReg, bool isKill,
@@ -462,7 +453,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
}
- } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) {
+ } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
MFI->setHasSpilledVGPRs();
switch(RC->getSize() * 8) {
@@ -482,7 +473,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addFrameIndex(FrameIndex)
          // Place-holder registers; these will be filled in by
// SIPrepareScratchRegs.
- .addReg(AMDGPU::SGPR0_SGPR1, RegState::Undef)
+ .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
.addReg(AMDGPU::SGPR0, RegState::Undef);
} else {
LLVMContext &Ctx = MF->getFunction()->getContext();
@@ -499,6 +490,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo *FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
int Opcode = -1;
@@ -511,7 +503,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
}
- } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) {
+ } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
switch(RC->getSize() * 8) {
case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
@@ -528,7 +520,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
.addFrameIndex(FrameIndex)
          // Place-holder registers; these will be filled in by
// SIPrepareScratchRegs.
- .addReg(AMDGPU::SGPR0_SGPR1, RegState::Undef)
+ .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
.addReg(AMDGPU::SGPR0, RegState::Undef);
} else {
@@ -615,7 +607,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
.addImm(-1)
.addImm(0);
- BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32),
+ BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
TIDReg)
.addImm(-1)
.addReg(TIDReg);
@@ -1053,7 +1045,11 @@ bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) const {
}
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
- return AMDGPU::getVOPe32(Opcode) != -1;
+ int Op32 = AMDGPU::getVOPe32(Opcode);
+ if (Op32 == -1)
+ return false;
+
+ return pseudoToMCOpcode(Op32) != -1;
}
bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
@@ -1126,12 +1122,18 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
}
switch (Desc.OpInfo[i].OperandType) {
- case MCOI::OPERAND_REGISTER: {
- if (MI->getOperand(i).isImm() &&
- !isImmOperandLegal(MI, i, MI->getOperand(i))) {
- ErrInfo = "Illegal immediate value for operand.";
- return false;
- }
+ case MCOI::OPERAND_REGISTER:
+ if (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm()) {
+ ErrInfo = "Illegal immediate value for operand.";
+ return false;
+ }
+ break;
+ case AMDGPU::OPERAND_REG_IMM32:
+ break;
+ case AMDGPU::OPERAND_REG_INLINE_C:
+ if (MI->getOperand(i).isImm() && !isInlineConstant(MI->getOperand(i))) {
+ ErrInfo = "Illegal immediate value for operand.";
+ return false;
}
break;
case MCOI::OPERAND_IMMEDIATE:
@@ -1287,7 +1289,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
- case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e32;
+ case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
}
@@ -2278,7 +2280,7 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist
MachineOperand &Dest = Inst->getOperand(0);
MachineOperand &Src = Inst->getOperand(1);
- const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32);
+ const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
const TargetRegisterClass *SrcRC = Src.isReg() ?
MRI.getRegClass(Src.getReg()) :
&AMDGPU::SGPR_32RegClass;
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index f766dc85e86a..28cd27dd8962 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -325,7 +325,6 @@ namespace AMDGPU {
int getVOPe32(uint16_t Opcode);
int getCommuteRev(uint16_t Opcode);
int getCommuteOrig(uint16_t Opcode);
- int getMCOpcode(uint16_t Opcode, unsigned Gen);
int getAddr64Inst(uint16_t Opcode);
int getAtomicRetOp(uint16_t Opcode);
int getAtomicNoRetOp(uint16_t Opcode);
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 7cc9588c8e4b..175e11d709cf 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -36,6 +36,12 @@ class vop2 <bits<6> si, bits<6> vi = si> : vop {
field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}};
}
+// Specifies a VOP2 opcode for SI and a VOP3 opcode for VI, for an
+// instruction that has no VOP2 encoding on VI.
+class vop23 <bits<6> si, bits<10> vi> : vop2 <si> {
+ let VI3 = vi;
+}
+
class vop3 <bits<9> si, bits<10> vi = {0, si}> : vop {
let SI3 = si;
let VI3 = vi;
@@ -57,7 +63,7 @@ class sopk <bits<5> si, bits<5> vi = si> {
}
// Except for the NONE field, this must be kept in sync with the SISubtarget enum
-// in AMDGPUMCInstLower.h
+// in AMDGPUInstrInfo.cpp
def SISubtarget {
int NONE = -1;
int SI = 0;
@@ -731,7 +737,7 @@ class getAsm32 <int NumSrcArgs> {
// Returns the assembly string for the inputs and outputs of a VOP3
// instruction.
class getAsm64 <int NumSrcArgs, bit HasModifiers> {
- string src0 = "$src0_modifiers,";
+ string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
string src1 = !if(!eq(NumSrcArgs, 1), "",
!if(!eq(NumSrcArgs, 2), " $src1_modifiers",
" $src1_modifiers,"));
@@ -848,6 +854,16 @@ class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
let isPseudo = 1;
}
+multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern,
+ string opName, string revOpSI> {
+ def "" : VOP2_Pseudo <outs, ins, pattern, opName>,
+ VOP2_REV<revOpSI#"_e32", !eq(revOpSI, opName)>;
+
+ def _si : VOP2 <op.SI, outs, ins, opName#asm, []>,
+ VOP2_REV<revOpSI#"_e32_si", !eq(revOpSI, opName)>,
+ SIMCInstr <opName#"_e32", SISubtarget.SI>;
+}
+
multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern,
string opName, string revOpSI, string revOpVI> {
def "" : VOP2_Pseudo <outs, ins, pattern, opName>,
@@ -889,16 +905,6 @@ class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> :
VOP3e_vi <op>,
SIMCInstr <opName#"_e64", SISubtarget.VI>;
-// VI only instruction
-class VOP3_vi <bits<10> op, string opName, dag outs, dag ins, string asm,
- list<dag> pattern, int NumSrcArgs, bit HasMods = 1> :
- VOP3Common <outs, ins, asm, pattern>,
- VOP <opName>,
- VOP3e_vi <op>,
- VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
- !if(!eq(NumSrcArgs, 2), 0, 1),
- HasMods>;
-
multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern,
string opName, int NumSrcArgs, bit HasMods = 1> {
@@ -998,6 +1004,23 @@ multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm,
}
}
+// An instruction that is VOP2 on SI and VOP3 on VI, no modifiers.
+multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins,
+ string asm, list<dag> pattern = []> {
+ let isPseudo = 1 in {
+ def "" : VOPAnyCommon <outs, ins, "", pattern>,
+ SIMCInstr<opName, SISubtarget.NONE>;
+ }
+
+ def _si : VOP2 <op.SI3{5-0}, outs, ins, asm, []>,
+ SIMCInstr <opName, SISubtarget.SI>;
+
+ def _vi : VOP3Common <outs, ins, asm, []>,
+ VOP3e_vi <op.VI3>,
+ VOP3DisableFields <1, 0, 0>,
+ SIMCInstr <opName, SISubtarget.VI>;
+}
+
multiclass VOP1_Helper <vop1 op, string opName, dag outs,
dag ins32, string asm32, list<dag> pat32,
dag ins64, string asm64, list<dag> pat64,
@@ -1089,6 +1112,33 @@ multiclass VOP2bInst <vop2 op, string opName, VOPProfile P,
revOp, P.HasModifiers
>;
+// A VOP2 instruction that is VOP3-only on VI.
+multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs,
+ dag ins32, string asm32, list<dag> pat32,
+ dag ins64, string asm64, list<dag> pat64,
+ string revOpSI, string revOpVI, bit HasMods> {
+ defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOpSI>;
+
+ defm _e64 : VOP3_2_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName,
+ revOpSI, revOpVI, HasMods>;
+}
+
+multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOpSI = opName, string revOpVI = revOpSI>
+ : VOP2_VI3_Helper <
+ op, opName, P.Outs,
+ P.Ins32, P.Asm32, [],
+ P.Ins64, P.Asm64,
+ !if(P.HasModifiers,
+ [(set P.DstVT:$dst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ revOpSI, revOpVI, P.HasModifiers
+>;
+
class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
VOPCCommon <ins, "", pattern>,
VOP <opName>,
@@ -1224,34 +1274,6 @@ multiclass VOP3Inst <vop3 op, string opName, VOPProfile P,
P.NumSrcArgs, P.HasModifiers
>;
-class VOP3InstVI <bits<10> op, string opName, VOPProfile P,
- SDPatternOperator node = null_frag> : VOP3_vi <
- op, opName#"_vi", P.Outs, P.Ins64, opName#P.Asm64,
- !if(!eq(P.NumSrcArgs, 3),
- !if(P.HasModifiers,
- [(set P.DstVT:$dst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
- (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1,
- P.Src2VT:$src2))]),
- !if(!eq(P.NumSrcArgs, 2),
- !if(P.HasModifiers,
- [(set P.DstVT:$dst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))])
- /* P.NumSrcArgs == 1 */,
- !if(P.HasModifiers,
- [(set P.DstVT:$dst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))),
- P.NumSrcArgs, P.HasModifiers
->;
-
multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc,
string opName, list<dag> pattern> :
VOP3b_2_m <
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index e05b6bb7d0f1..4b1a84662cb5 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1525,25 +1525,25 @@ defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32",
} // End Uses = [VCC]
} // End isCommutable = 1, Defs = [VCC]
-// These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI in {
-
-def V_READLANE_B32 : VOP2 <
- 0x00000001,
+defm V_READLANE_B32 : VOP2SI_3VI_m <
+ vop3 <0x001, 0x289>,
+ "v_readlane_b32",
(outs SReg_32:$vdst),
(ins VGPR_32:$src0, SSrc_32:$vsrc1),
- "v_readlane_b32 $vdst, $src0, $vsrc1",
- []
+ "v_readlane_b32 $vdst, $src0, $vsrc1"
>;
-def V_WRITELANE_B32 : VOP2 <
- 0x00000002,
+defm V_WRITELANE_B32 : VOP2SI_3VI_m <
+ vop3 <0x002, 0x28a>,
+ "v_writelane_b32",
(outs VGPR_32:$vdst),
(ins SReg_32:$src0, SSrc_32:$vsrc1),
- "v_writelane_b32 $vdst, $src0, $vsrc1",
- []
+ "v_writelane_b32 $vdst, $src0, $vsrc1"
>;
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+
let isCommutable = 1 in {
defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32",
VOP_F32_F32_F32
@@ -1568,30 +1568,33 @@ defm V_LSHL_B32 : VOP2Inst <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32, shl>;
}
} // End isCommutable = 1
+} // End SubtargetPredicate = isSICI
-defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "v_bfm_b32", VOP_I32_I32_I32,
- AMDGPUbfm>;
-defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "v_bcnt_u32_b32", VOP_I32_I32_I32>;
-defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "v_mbcnt_lo_u32_b32",
+defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32", VOP_I32_I32_I32,
+ AMDGPUbfm
+>;
+defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32",
+ VOP_I32_I32_I32
+>;
+defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32",
VOP_I32_I32_I32
>;
-defm V_MBCNT_HI_U32_B32 : VOP2Inst <vop2<0x24>, "v_mbcnt_hi_u32_b32",
+defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32",
VOP_I32_I32_I32
>;
-defm V_LDEXP_F32 : VOP2Inst <vop2<0x2b>, "v_ldexp_f32",
+defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32",
VOP_F32_F32_I32, AMDGPUldexp
>;
////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "v_cvt_pkaccum_u8_f32", []>;
////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "v_cvt_pknorm_i16_f32", []>;
////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "v_cvt_pknorm_u16_f32", []>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <vop2<0x2f>, "v_cvt_pkrtz_f16_f32",
+defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst <vop23<0x2f, 0x296>, "v_cvt_pkrtz_f16_f32",
VOP_I32_F32_F32, int_SI_packf16
>;
////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "v_cvt_pk_u16_u32", []>;
////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "v_cvt_pk_i16_i32", []>;
-} // End let SubtargetPredicate = SICI
//===----------------------------------------------------------------------===//
// VOP3 Instructions
//===----------------------------------------------------------------------===//
@@ -1656,9 +1659,6 @@ defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f, 0x1cf>, "v_alignbyte_b32",
VOP_I32_I32_I32_I32
>;
-// Only on SI
-defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32",
- VOP_F32_F32_F32_F32>;
defm V_MIN3_F32 : VOP3Inst <vop3<0x151>, "v_min3_f32",
VOP_F32_F32_F32_F32, AMDGPUfmin3>;
@@ -1699,20 +1699,6 @@ defm V_DIV_FIXUP_F64 : VOP3Inst <
} // let SchedRW = [WriteDouble]
-defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64",
- VOP_I64_I64_I32, shl
->;
-
-// Only on SI
-defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64",
- VOP_I64_I64_I32, srl
->;
-
-// Only on SI
-defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64",
- VOP_I64_I64_I32, sra
->;
-
let SchedRW = [WriteDouble] in {
let isCommutable = 1 in {
@@ -1785,6 +1771,26 @@ defm V_TRIG_PREOP_F64 : VOP3Inst <
} // let SchedRW = [WriteDouble]
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+
+defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64",
+ VOP_I64_I64_I32, shl
+>;
+
+defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64",
+ VOP_I64_I64_I32, srl
+>;
+
+defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64",
+ VOP_I64_I64_I32, sra
+>;
+
+defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32",
+ VOP_F32_F32_F32_F32>;
+
+} // End SubtargetPredicate = isSICI
+
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
@@ -1943,14 +1949,14 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
let UseNamedOperandTable = 1 in {
def _SAVE : InstSI <
(outs),
- (ins sgpr_class:$src, i32imm:$frame_idx, SReg_64:$scratch_ptr,
+ (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
SReg_32:$scratch_offset),
"", []
>;
def _RESTORE : InstSI <
(outs sgpr_class:$dst),
- (ins i32imm:$frame_idx, SReg_64:$scratch_ptr, SReg_32:$scratch_offset),
+ (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
"", []
>;
} // End UseNamedOperandTable = 1
@@ -1966,14 +1972,14 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let UseNamedOperandTable = 1 in {
def _SAVE : InstSI <
(outs),
- (ins vgpr_class:$src, i32imm:$frame_idx, SReg_64:$scratch_ptr,
+ (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
SReg_32:$scratch_offset),
"", []
>;
def _RESTORE : InstSI <
(outs vgpr_class:$dst),
- (ins i32imm:$frame_idx, SReg_64:$scratch_ptr, SReg_32:$scratch_offset),
+ (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
"", []
>;
} // End UseNamedOperandTable = 1
@@ -2728,16 +2734,12 @@ def : Pat <
(V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;
-let Predicates = [isSICI] in {
-
def : Pat <
(int_SI_tid),
- (V_MBCNT_HI_U32_B32_e32 0xffffffff,
+ (V_MBCNT_HI_U32_B32_e64 0xffffffff,
(V_MBCNT_LO_U32_B32_e64 0xffffffff, 0))
>;
-}
-
//===----------------------------------------------------------------------===//
// VOP3 Patterns
//===----------------------------------------------------------------------===//
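A note on the int_SI_tid pattern above: v_mbcnt_lo_u32_b32 and v_mbcnt_hi_u32_b32 count the set bits of a 64-bit mask that fall below the current lane, so chaining them with an all-ones mask yields each lane's index within the wave. A minimal host-side sketch of that computation, assuming a 64-lane wave (the helper name is invented for illustration):

    #include <bit>      // std::popcount (C++20)
    #include <cstdint>

    // What the chained mbcnt pair computes for one lane: the number of
    // mask bits strictly below the lane's position.
    unsigned waveLaneId(uint64_t mask, unsigned lane) {
      uint64_t below = (lane == 0) ? 0 : (mask << (64 - lane)) >> (64 - lane);
      return std::popcount(below);
    }

    // With mask == ~0ull, waveLaneId(~0ull, l) == l, i.e. the thread id.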
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 71852717d7e6..667da4c8af61 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -50,6 +50,7 @@ public:
unsigned NumUserSGPRs;
std::map<unsigned, unsigned> LaneVGPRs;
unsigned LDSWaveSpillSize;
+ unsigned ScratchOffsetReg;
bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; };
unsigned getTIDReg() const { return TIDReg; };
void setTIDReg(unsigned Reg) { TIDReg = Reg; }
diff --git a/lib/Target/R600/SIPrepareScratchRegs.cpp b/lib/Target/R600/SIPrepareScratchRegs.cpp
index f0e7edec6b48..0a57a5bc201a 100644
--- a/lib/Target/R600/SIPrepareScratchRegs.cpp
+++ b/lib/Target/R600/SIPrepareScratchRegs.cpp
@@ -84,28 +84,10 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
if (!Entry->isLiveIn(ScratchOffsetPreloadReg))
Entry->addLiveIn(ScratchOffsetPreloadReg);
- // Load the scratch pointer
- unsigned ScratchPtrReg =
- TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass);
- int ScratchPtrFI = -1;
-
- if (ScratchPtrReg != AMDGPU::NoRegister) {
- // Found an SGPR to use.
- MRI.setPhysRegUsed(ScratchPtrReg);
- BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B64), ScratchPtrReg)
- .addReg(ScratchPtrPreloadReg);
- } else {
- // No SGPR is available, we must spill.
- ScratchPtrFI = FrameInfo->CreateSpillStackObject(8, 4);
- BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S64_SAVE))
- .addReg(ScratchPtrPreloadReg)
- .addFrameIndex(ScratchPtrFI);
- }
-
// Load the scratch offset.
unsigned ScratchOffsetReg =
TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass);
- int ScratchOffsetFI = ~0;
+ int ScratchOffsetFI = -1;
if (ScratchOffsetReg != AMDGPU::NoRegister) {
// Found an SGPR to use
@@ -117,7 +99,9 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4);
BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE))
.addReg(ScratchOffsetPreloadReg)
- .addFrameIndex(ScratchOffsetFI);
+ .addFrameIndex(ScratchOffsetFI)
+ .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
+ .addReg(AMDGPU::SGPR0, RegState::Undef);
}
@@ -125,22 +109,27 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
// add them to all the SI_SPILL_V* instructions.
RegScavenger RS;
- bool UseRegScavenger =
- (ScratchPtrReg == AMDGPU::NoRegister ||
- ScratchOffsetReg == AMDGPU::NoRegister);
+ unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4);
+ RS.addScavengingFrameIndex(ScratchRsrcFI);
+
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;
- if (UseRegScavenger)
- RS.enterBasicBlock(&MBB);
+ // Add the scratch offset reg as a live-in so that the register scavenger
+ // doesn't re-use it.
+ if (!MBB.isLiveIn(ScratchOffsetReg) &&
+ ScratchOffsetReg != AMDGPU::NoRegister)
+ MBB.addLiveIn(ScratchOffsetReg);
+ RS.enterBasicBlock(&MBB);
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
MachineInstr &MI = *I;
+ RS.forward(I);
DebugLoc DL = MI.getDebugLoc();
switch(MI.getOpcode()) {
- default: break;;
+ default: break;
case AMDGPU::SI_SPILL_V512_SAVE:
case AMDGPU::SI_SPILL_V256_SAVE:
case AMDGPU::SI_SPILL_V128_SAVE:
@@ -153,43 +142,66 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_V512_RESTORE:
- // Scratch Pointer
- if (ScratchPtrReg == AMDGPU::NoRegister) {
- ScratchPtrReg = RS.scavengeRegister(&AMDGPU::SGPR_64RegClass, 0);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S64_RESTORE),
- ScratchPtrReg)
- .addFrameIndex(ScratchPtrFI)
- .addReg(AMDGPU::NoRegister)
- .addReg(AMDGPU::NoRegister);
- } else if (!MBB.isLiveIn(ScratchPtrReg)) {
- MBB.addLiveIn(ScratchPtrReg);
- }
+ // Scratch resource
+ unsigned ScratchRsrcReg =
+ RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0);
+
+ uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
+ 0xffffffff; // Size
+
+ unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+ unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+ unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
+ .addImm(Rsrc & 0xffffffff)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
+ .addImm(Rsrc >> 32)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ // Scratch Offset
if (ScratchOffsetReg == AMDGPU::NoRegister) {
ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE),
ScratchOffsetReg)
.addFrameIndex(ScratchOffsetFI)
- .addReg(AMDGPU::NoRegister)
- .addReg(AMDGPU::NoRegister);
+ .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
+ .addReg(AMDGPU::SGPR0, RegState::Undef);
} else if (!MBB.isLiveIn(ScratchOffsetReg)) {
MBB.addLiveIn(ScratchOffsetReg);
}
- if (ScratchPtrReg == AMDGPU::NoRegister ||
+ if (ScratchRsrcReg == AMDGPU::NoRegister ||
ScratchOffsetReg == AMDGPU::NoRegister) {
LLVMContext &Ctx = MF.getFunction()->getContext();
Ctx.emitError("ran out of SGPRs for spilling VGPRs");
- ScratchPtrReg = AMDGPU::SGPR0;
+ ScratchRsrcReg = AMDGPU::SGPR0;
ScratchOffsetReg = AMDGPU::SGPR0;
}
- MI.getOperand(2).setReg(ScratchPtrReg);
+ MI.getOperand(2).setReg(ScratchRsrcReg);
+ MI.getOperand(2).setIsKill(true);
+ MI.getOperand(2).setIsUndef(false);
MI.getOperand(3).setReg(ScratchOffsetReg);
+ MI.getOperand(3).setIsUndef(false);
+ MI.getOperand(3).setIsKill(false);
+ MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true));
+ MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true));
+ MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true));
+ MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true));
break;
}
- if (UseRegScavenger)
- RS.forward();
}
}
return true;
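For context on the four S_MOV_B32s above: they materialize a 128-bit buffer resource descriptor for scratch access, where dwords 0-1 hold the base address (patched in later via the SCRATCH_RSRC_DWORD0/1 symbols), dword 2 holds the size, and dword 3 carries the data format plus the TID-enable bit. A hedged sketch of how the 64-bit Rsrc constant is split, with struct and field names invented for illustration:

    #include <cstdint>

    // Sketch only: the real bit positions of RSRC_DATA_FORMAT and
    // RSRC_TID_ENABLE are defined in SIDefines.h.
    struct ScratchRsrcWords {
      uint32_t Size;   // dword 2: low half of Rsrc, 0xffffffff here
      uint32_t Flags;  // dword 3: high half, data format | TID enable
    };

    ScratchRsrcWords splitRsrc(uint64_t dataFormat, uint64_t tidEnable) {
      uint64_t Rsrc = dataFormat | tidEnable | 0xffffffffull;
      return {uint32_t(Rsrc & 0xffffffff), uint32_t(Rsrc >> 32)};
    }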
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index f9feea470f15..0396bf384066 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -23,7 +23,6 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
-#include "llvm/Support/Debug.h"
using namespace llvm;
SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st)
@@ -51,9 +50,32 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
-unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
- MachineFunction &MF) const {
- return RC->getNumRegs();
+unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const {
+
+ // FIXME: We should adjust the max number of waves based on LDS size.
+ unsigned SGPRLimit = getNumSGPRsAllowed(ST.getMaxWavesPerCU());
+ unsigned VGPRLimit = getNumVGPRsAllowed(ST.getMaxWavesPerCU());
+
+ for (regclass_iterator I = regclass_begin(), E = regclass_end();
+ I != E; ++I) {
+
+ unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1);
+ unsigned Limit;
+
+ if (isSGPRClass(*I)) {
+ Limit = SGPRLimit / NumSubRegs;
+ } else {
+ Limit = VGPRLimit / NumSubRegs;
+ }
+
+ const int *Sets = getRegClassPressureSets(*I);
+ assert(Sets);
+ for (unsigned i = 0; Sets[i] != -1; ++i) {
+ if (Sets[i] == (int)Idx)
+ return Limit;
+ }
+ }
+ return 256;
}
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
@@ -98,7 +120,7 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
unsigned Value,
- unsigned ScratchPtr,
+ unsigned ScratchRsrcReg,
unsigned ScratchOffset,
int64_t Offset,
RegScavenger *RS) const {
@@ -113,33 +135,9 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
bool RanOutOfSGPRs = false;
unsigned SOffset = ScratchOffset;
- unsigned RsrcReg = RS->scavengeRegister(&AMDGPU::SReg_128RegClass, MI, 0);
- if (RsrcReg == AMDGPU::NoRegister) {
- RanOutOfSGPRs = true;
- RsrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
- }
-
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
unsigned Size = NumSubRegs * 4;
- uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
- 0xffffffff; // Size
-
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B64),
- getSubReg(RsrcReg, AMDGPU::sub0_sub1))
- .addReg(ScratchPtr)
- .addReg(RsrcReg, RegState::ImplicitDefine);
-
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32),
- getSubReg(RsrcReg, AMDGPU::sub2))
- .addImm(Rsrc & 0xffffffff)
- .addReg(RsrcReg, RegState::ImplicitDefine);
-
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32),
- getSubReg(RsrcReg, AMDGPU::sub3))
- .addImm(Rsrc >> 32)
- .addReg(RsrcReg, RegState::ImplicitDefine);
-
if (!isUInt<12>(Offset + Size)) {
SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
if (SOffset == AMDGPU::NoRegister) {
@@ -163,9 +161,9 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
.addReg(SubReg, getDefRegState(IsLoad))
- .addReg(RsrcReg, getKillRegState(IsKill))
+ .addReg(ScratchRsrcReg, getKillRegState(IsKill))
.addImm(Offset)
- .addReg(SOffset, getKillRegState(IsKill))
+ .addReg(SOffset)
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // tfe
@@ -235,9 +233,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
Ctx.emitError("Ran out of VGPRs for spilling SGPR");
}
- if (isM0) {
+ if (isM0)
SubReg = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
- }
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
.addReg(Spill.VGPR)
@@ -262,7 +259,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V32_SAVE:
buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(),
- TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_ptr)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
FrameInfo->getObjectOffset(Index), RS);
MI->eraseFromParent();
@@ -274,7 +271,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V512_RESTORE: {
buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(),
- TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_ptr)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
FrameInfo->getObjectOffset(Index), RS);
MI->eraseFromParent();
@@ -289,7 +286,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
.addImm(Offset);
- FIOp.ChangeToRegister(TmpReg, false);
+ FIOp.ChangeToRegister(TmpReg, false, false, true);
}
}
}
@@ -446,6 +443,8 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
case SIRegisterInfo::TGID_Z:
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
case SIRegisterInfo::SCRATCH_WAVE_OFFSET:
+ if (MFI->getShaderType() != ShaderType::COMPUTE)
+ return MFI->ScratchOffsetReg;
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
case SIRegisterInfo::SCRATCH_PTR:
return AMDGPU::SGPR2_SGPR3;
@@ -475,3 +474,29 @@ unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
return AMDGPU::NoRegister;
}
+unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
+ switch(WaveCount) {
+ case 10: return 24;
+ case 9: return 28;
+ case 8: return 32;
+ case 7: return 36;
+ case 6: return 40;
+ case 5: return 48;
+ case 4: return 64;
+ case 3: return 84;
+ case 2: return 128;
+ default: return 256;
+ }
+}
+
+unsigned SIRegisterInfo::getNumSGPRsAllowed(unsigned WaveCount) const {
+ switch(WaveCount) {
+ case 10: return 48;
+ case 9: return 56;
+ case 8: return 64;
+ case 7: return 72;
+ case 6: return 80;
+ case 5: return 96;
+ default: return 103;
+ }
+}
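The two tables above encode the occupancy trade-off: the more waves resident per compute unit, the fewer VGPRs/SGPRs each wave may use, and getRegPressureSetLimit divides that per-wave budget by the width of a register class (its size in dwords) to obtain a per-class limit. A small sketch of the arithmetic, reusing the VGPR table values from above:

    // Per-class pressure limit: the per-wave register budget divided by
    // how many 32-bit registers one value of the class occupies.
    unsigned vgprsAllowed(unsigned waves) {
      switch (waves) {
      case 10: return 24;  case 9: return 28;  case 8: return 32;
      case 7:  return 36;  case 6: return 40;  case 5: return 48;
      case 4:  return 64;  case 3: return 84;  case 2: return 128;
      default: return 256;
      }
    }

    unsigned classPressureLimit(unsigned waves, unsigned classSizeBytes) {
      unsigned numSubRegs = classSizeBytes >= 4 ? classSizeBytes / 4 : 1;
      return vgprsAllowed(waves) / numSubRegs;
    }
    // e.g. at 10 waves a 16-byte class (4 subregs) gets 24 / 4 == 6.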
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index d14212c2b104..d908ffd12d2c 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -17,6 +17,7 @@
#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
#include "AMDGPURegisterInfo.h"
+#include "llvm/Support/Debug.h"
namespace llvm {
@@ -26,8 +27,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
BitVector getReservedRegs(const MachineFunction &MF) const override;
- unsigned getRegPressureLimit(const TargetRegisterClass *RC,
- MachineFunction &MF) const override;
+ unsigned getRegPressureSetLimit(unsigned Idx) const override;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
@@ -105,13 +105,21 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
unsigned getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const;
+ /// \brief Return the maximum number of VGPRs that can be used by \p WaveCount
+ /// concurrent waves.
+ unsigned getNumVGPRsAllowed(unsigned WaveCount) const;
+
+ /// \brief Return the maximum number of SGPRs that can be used by \p WaveCount
+ /// concurrent waves.
+ unsigned getNumSGPRsAllowed(unsigned WaveCount) const;
+
unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC) const;
private:
void buildScratchLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp, unsigned Value,
- unsigned ScratchPtr, unsigned ScratchOffset,
+ unsigned ScratchRsrcReg, unsigned ScratchOffset,
int64_t Offset, RegScavenger *RS) const;
};
diff --git a/lib/Target/R600/SIShrinkInstructions.cpp b/lib/Target/R600/SIShrinkInstructions.cpp
index f91d1177bbae..6a3410688fe7 100644
--- a/lib/Target/R600/SIShrinkInstructions.cpp
+++ b/lib/Target/R600/SIShrinkInstructions.cpp
@@ -10,6 +10,7 @@
//
#include "AMDGPU.h"
+#include "AMDGPUMCInstLower.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/Statistic.h"
@@ -206,13 +207,13 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
-
- // Op32 could be -1 here if we started with an instruction that had a
+ // getVOPe32 could return -1 here if we started with an instruction that had
// a 32-bit encoding and then commuted it to an instruction that did not.
- if (Op32 == -1)
+ if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
continue;
+ int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
+
if (TII->isVOPC(Op32)) {
unsigned DstReg = MI.getOperand(0).getReg();
if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
diff --git a/lib/Target/R600/VIInstructions.td b/lib/Target/R600/VIInstructions.td
index 07cfa29ae12b..24e66cea6277 100644
--- a/lib/Target/R600/VIInstructions.td
+++ b/lib/Target/R600/VIInstructions.td
@@ -11,22 +11,6 @@
let SubtargetPredicate = isVI in {
-def V_LDEXP_F32 : VOP3InstVI <0x288, "v_ldexp_f32", VOP_F32_F32_I32,
- AMDGPUldexp
->;
-def V_BFM_B32 : VOP3InstVI <0x293, "v_bfm_b32", VOP_I32_I32_I32, AMDGPUbfm>;
-def V_BCNT_U32_B32 : VOP3InstVI <0x28b, "v_bcnt_u32_b32", VOP_I32_I32_I32>;
-def V_MBCNT_LO_U32_B32 : VOP3InstVI <0x28c, "v_mbcnt_lo_u32_b32",
- VOP_I32_I32_I32
->;
-def V_MBCNT_HI_U32_B32 : VOP3InstVI <0x28d, "v_mbcnt_hi_u32_b32",
- VOP_I32_I32_I32
->;
-
-def V_CVT_PKRTZ_F16_F32 : VOP3InstVI <0x296, "v_cvt_pkrtz_f16_f32",
- VOP_I32_F32_F32, int_SI_packf16
->;
-
defm BUFFER_LOAD_DWORD_VI : MUBUF_Load_Helper_vi <
0x14, "buffer_load_dword", VGPR_32, i32, global_load
>;
@@ -37,22 +21,13 @@ defm BUFFER_LOAD_FORMAT_XYZW_VI : MUBUF_Load_Helper_vi <
} // End SubtargetPredicate = isVI
-//===----------------------------------------------------------------------===//
-// VOP2 Patterns
-//===----------------------------------------------------------------------===//
-
-let Predicates = [isVI] in {
-
-def : Pat <
- (int_SI_tid),
- (V_MBCNT_HI_U32_B32 0xffffffff,
- (V_MBCNT_LO_U32_B32 0xffffffff, 0))
->;
//===----------------------------------------------------------------------===//
// SMEM Patterns
//===----------------------------------------------------------------------===//
+let Predicates = [isVI] in {
+
// 1. Offset as 8bit DWORD immediate
def : Pat <
(SIload_constant v4i32:$sbase, IMM20bit:$offset),
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 6767e4b224f8..42690206e8c7 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -42,7 +42,8 @@ SparcELFMCAsmInfo::SparcELFMCAsmInfo(StringRef TT) {
SunStyleELFSectionSwitchSyntax = true;
UsesELFSectionDirectiveForBSS = true;
- UseIntegratedAssembler = true;
+ if (TheTriple.isOSSolaris() || TheTriple.isOSOpenBSD())
+ UseIntegratedAssembler = true;
}
const MCExpr*
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 719b761084f9..164b4192ae66 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -777,6 +777,19 @@ public:
MachO::CPU_TYPE_X86_64, Subtype);
}
+ bool doesSectionRequireSymbols(const MCSection &Section) const override {
+ // Temporary labels in the string literals sections require symbols. The
+ // issue is that the x86_64 relocation format does not allow symbol +
+ // offset, and so the linker does not have enough information to resolve the
+ // access to the appropriate atom unless an external relocation is used. For
+ // non-cstring sections, we expect the compiler to use a non-temporary label
+ // for anything that could have an addend pointing outside the symbol.
+ //
+ // See <rdar://problem/4765733>.
+ const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
+ return SMO.getType() == MachO::S_CSTRING_LITERALS;
+ }
+
/// \brief Generate the compact unwind encoding for the CFI instructions.
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index be6a8e4a43eb..e8b0b4c5826f 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -222,6 +222,9 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
case MCSymbolRefExpr::VK_GOT:
Type = ELF::R_386_GOT32;
break;
+ case MCSymbolRefExpr::VK_PLT:
+ Type = ELF::R_386_PLT32;
+ break;
case MCSymbolRefExpr::VK_GOTOFF:
Type = ELF::R_386_GOTOFF;
break;
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 7a83f4c64e6d..67b0c8900850 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -10,7 +10,6 @@
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "MCTargetDesc/X86FixupKinds.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -48,21 +47,23 @@ class X86MachObjectWriter : public MCMachObjectTargetWriter {
const MCFixup &Fixup,
MCValue Target,
uint64_t &FixedValue);
- void RecordX86_64Relocation(MachObjectWriter *Writer, MCAssembler &Asm,
+ void RecordX86_64Relocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
const MCAsmLayout &Layout,
- const MCFragment *Fragment, const MCFixup &Fixup,
- MCValue Target, uint64_t &FixedValue);
-
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue);
public:
X86MachObjectWriter(bool Is64Bit, uint32_t CPUType,
uint32_t CPUSubtype)
: MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
/*UseAggressiveSymbolFolding=*/Is64Bit) {}
- void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
- const MCAsmLayout &Layout, const MCFragment *Fragment,
- const MCFixup &Fixup, MCValue Target,
- uint64_t &FixedValue) override {
+ void RecordRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup,
+ MCValue Target, uint64_t &FixedValue) override {
if (Writer->is64Bit())
RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
FixedValue);
@@ -96,10 +97,13 @@ static unsigned getFixupKindLog2Size(unsigned Kind) {
}
}
-void X86MachObjectWriter::RecordX86_64Relocation(
- MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
- uint64_t &FixedValue) {
+void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue) {
unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind());
unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
@@ -113,7 +117,6 @@ void X86MachObjectWriter::RecordX86_64Relocation(
unsigned Index = 0;
unsigned IsExtern = 0;
unsigned Type = 0;
- const MCSymbolData *RelSymbol = nullptr;
Value = Target.getConstant();
@@ -129,6 +132,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
if (Target.isAbsolute()) { // constant
// SymbolNum of 0 indicates the absolute section.
Type = MachO::X86_64_RELOC_UNSIGNED;
+ Index = 0;
// FIXME: I believe this is broken, I don't think the linker can understand
// it. I think it would require a local relocation, but I'm not sure if that
@@ -189,30 +193,36 @@ void X86MachObjectWriter::RecordX86_64Relocation(
Value -= Writer->getSymbolAddress(&B_SD, Layout) -
(!B_Base ? 0 : Writer->getSymbolAddress(B_Base, Layout));
- if (!A_Base)
+ if (A_Base) {
+ Index = A_Base->getIndex();
+ IsExtern = 1;
+ } else {
Index = A_SD.getFragment()->getParent()->getOrdinal() + 1;
+ IsExtern = 0;
+ }
Type = MachO::X86_64_RELOC_UNSIGNED;
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 =
- (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
- Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
-
- if (B_Base)
- RelSymbol = B_Base;
- else
+ MRE.r_word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (IsExtern << 27) |
+ (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
+
+ if (B_Base) {
+ Index = B_Base->getIndex();
+ IsExtern = 1;
+ } else {
Index = B_SD.getFragment()->getParent()->getOrdinal() + 1;
+ IsExtern = 0;
+ }
Type = MachO::X86_64_RELOC_SUBTRACTOR;
} else {
const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
- if (Symbol->isTemporary() && Value) {
- const MCSection &Sec = Symbol->getSection();
- if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
- Asm.addLocalUsedInReloc(*Symbol);
- }
const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
- RelSymbol = Asm.getAtom(&SD);
+ const MCSymbolData *Base = Asm.getAtom(&SD);
// Relocations inside debug sections always use local relocations when
// possible. This seems to be done because the debugger doesn't fully
@@ -222,20 +232,23 @@ void X86MachObjectWriter::RecordX86_64Relocation(
const MCSectionMachO &Section = static_cast<const MCSectionMachO&>(
Fragment->getParent()->getSection());
if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
- RelSymbol = nullptr;
+ Base = nullptr;
}
// x86_64 almost always uses external relocations, except when there is no
// symbol to use as a base address (a local symbol with no preceding
// non-local symbol).
- if (RelSymbol) {
+ if (Base) {
+ Index = Base->getIndex();
+ IsExtern = 1;
+
// Add the local offset, if needed.
- if (RelSymbol != &SD)
- Value +=
- Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(RelSymbol);
+ if (Base != &SD)
+ Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
} else if (Symbol->isInSection() && !Symbol->isVariable()) {
// The index is the section ordinal (1-based).
Index = SD.getFragment()->getParent()->getOrdinal() + 1;
+ IsExtern = 0;
Value += Writer->getSymbolAddress(&SD, Layout);
if (IsPCRel)
@@ -334,9 +347,12 @@ void X86MachObjectWriter::RecordX86_64Relocation(
// struct relocation_info (8 bytes)
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 = (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
- (IsExtern << 27) | (Type << 28);
- Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+ MRE.r_word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (IsExtern << 27) |
+ (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
}
bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
@@ -408,7 +424,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
(IsPCRel << 30) |
MachO::R_SCATTERED);
MRE.r_word1 = Value2;
- Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ Writer->addRelocation(Fragment->getParent(), MRE);
} else {
// If the offset is more than 24-bits, it won't fit in a scattered
// relocation offset field, so we fall back to using a non-scattered
@@ -430,7 +446,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
(IsPCRel << 30) |
MachO::R_SCATTERED);
MRE.r_word1 = Value;
- Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ Writer->addRelocation(Fragment->getParent(), MRE);
return true;
}
@@ -451,6 +467,7 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
// Get the symbol data.
const MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+ unsigned Index = SD_A->getIndex();
// We're only going to have a second symbol in pic mode and it'll be a
// subtraction from the picbase. For 32-bit pic the addend is the difference
@@ -473,9 +490,12 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
// struct relocation_info (8 bytes)
MachO::any_relocation_info MRE;
MRE.r_word0 = Value;
- MRE.r_word1 =
- (IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28);
- Writer->addRelocation(SD_A, Fragment->getParent(), MRE);
+ MRE.r_word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (1 << 27) | // r_extern
+ (MachO::GENERIC_RELOC_TLV << 28)); // r_type
+ Writer->addRelocation(Fragment->getParent(), MRE);
}
void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
@@ -526,8 +546,8 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
// See <reloc.h>.
uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
unsigned Index = 0;
+ unsigned IsExtern = 0;
unsigned Type = 0;
- const MCSymbolData *RelSymbol = nullptr;
if (Target.isAbsolute()) { // constant
// SymbolNum of 0 indicates the absolute section.
@@ -548,7 +568,8 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
// Check whether we need an external or internal relocation.
if (Writer->doesSymbolRequireExternRelocation(SD)) {
- RelSymbol = SD;
+ IsExtern = 1;
+ Index = SD->getIndex();
// For external relocations, make sure to offset the fixup value to
// compensate for the addend of the symbol address, if it was
// undefined. This occurs with weak definitions, for example.
@@ -570,9 +591,12 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
// struct relocation_info (8 bytes)
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 =
- (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
- Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+ MRE.r_word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (IsExtern << 27) |
+ (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
}
MCObjectWriter *llvm::createX86MachObjectWriter(raw_ostream &OS,
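All of the r_word1 expressions in this file pack the same MachO relocation_info fields: a 24-bit symbol or section index, one PC-relative bit, two length bits (log2 of the access size), an r_extern bit, and a 4-bit relocation type. A standalone sketch of that packing, assuming the field widths from <mach-o/reloc.h>:

    #include <cstdint>

    // r_word1 layout: bits 0-23 index, 24 pcrel, 25-26 log2 size,
    // 27 extern, 28-31 type.
    uint32_t packRWord1(uint32_t index, bool isPCRel, unsigned log2Size,
                        bool isExtern, unsigned type) {
      return (index & 0xffffff) |
             (uint32_t(isPCRel) << 24) |
             ((log2Size & 0x3) << 25) |
             (uint32_t(isExtern) << 27) |
             ((type & 0xf) << 28);
    }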
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index dab2c4b47ad6..83b4b82311f2 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1376,6 +1376,10 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
if (!Callee)
return false;
+ // The prototypes of thunks are a lie; don't try to directly call such
+ // functions.
+ if (Callee->hasFnAttribute("thunk"))
+ return false;
Instruction *Caller = CS.getInstruction();
const AttributeSet &CallerPAL = CS.getAttributes();
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index b814b2525dca..ac13eebf8275 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -2182,9 +2182,16 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
// Handle the floating point versions of equality comparisons too.
if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) ||
- (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE))
- Worklist.push_back(std::make_pair(Op0, Op1));
-
+ (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) {
+ // Floating point -0.0 and 0.0 compare equal, so we can't
+ // propagate a constant based on that comparison.
+ // FIXME: We should do this optimization if 'no signed zeros' is
+ // applicable via an instruction-level fast-math-flag or some other
+ // indicator that relaxed FP semantics are being used.
+ if (!isa<ConstantFP>(Op1) || !cast<ConstantFP>(Op1)->isZero())
+ Worklist.push_back(std::make_pair(Op0, Op1));
+ }
+
// If "A >= B" is known true, replace "A < B" with false everywhere.
CmpInst::Predicate NotPred = Cmp->getInversePredicate();
Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse);
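The guard added above matters because IEEE 754 defines -0.0 == 0.0 as true, so knowing that x compares equal to 0.0 does not pin down x's bit pattern, and substituting the constant could flip the sign of a zero. A tiny illustration of the hazard, assuming IEEE semantics for division by zero:

    #include <cstdio>

    int main() {
      double x = -0.0;
      if (x == 0.0) {
        // A pass that rewrote x to 0.0 here would change this result:
        // 1.0 / -0.0 is -inf, but 1.0 / 0.0 is +inf.
        std::printf("%f\n", 1.0 / x);  // prints -inf
      }
    }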
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 1ac38e0f52a5..d664f85c773d 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
@@ -47,7 +48,7 @@ namespace {
AU.addRequiredID(LoopSimplifyID);
AU.addPreservedID(LoopSimplifyID);
AU.addPreservedID(LCSSAID);
- AU.addPreserved("scalar-evolution");
+ AU.addPreserved<ScalarEvolution>();
AU.addRequired<TargetLibraryInfo>();
}
};
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index 35cd917330ab..04b91306d3cb 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -46,7 +46,6 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
// This is a cluster of orthogonal Transforms
AU.addPreserved<UnifyFunctionExitNodes>();
- AU.addPreserved("mem2reg");
AU.addPreservedID(LowerInvokePassID);
}
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index f8aa1d3eec12..6cb91a154f06 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -278,9 +278,8 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
Value *IVOperand) {
// Currently we only handle instructions of the form "add <indvar> <value>"
- // and "sub <indvar> <value>".
unsigned Op = BO->getOpcode();
- if (!(Op == Instruction::Add || Op == Instruction::Sub))
+ if (Op != Instruction::Add)
return false;
// If BO is already both nuw and nsw then there is nothing left to do
@@ -304,15 +303,6 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
if (OtherOpSCEV == SE->getCouldNotCompute())
return false;
- if (Op == Instruction::Sub) {
- // If the subtraction is of the form "sub <indvar>, <op>", then pretend it
- // is "add <indvar>, -<op>" and continue, else bail out.
- if (OtherOperandIdx != 1)
- return false;
-
- OtherOpSCEV = SE->getNegativeSCEV(OtherOpSCEV);
- }
-
const SCEV *IVOpSCEV = SE->getSCEV(IVOperand);
const SCEV *ZeroSCEV = SE->getConstant(IVOpSCEV->getType(), 0);
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 5b4647ddcb5e..5a0d52e04f9f 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1968,8 +1968,12 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
// Try to further simplify the result.
CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI);
if (SimplifiedCI && SimplifiedCI->getCalledFunction())
- if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder))
+ if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) {
+ // If we were able to further simplify, remove the now redundant call.
+ SimplifiedCI->replaceAllUsesWith(V);
+ SimplifiedCI->eraseFromParent();
return V;
+ }
return SimplifiedFortifiedCI;
}
@@ -2218,11 +2222,11 @@ Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, IRBuilder<> &
return nullptr;
}
-Value *FortifiedLibCallSimplifier::optimizeStrCpyChk(CallInst *CI, IRBuilder<> &B) {
+Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
+ IRBuilder<> &B,
+ LibFunc::Func Func) {
Function *Callee = CI->getCalledFunction();
StringRef Name = Callee->getName();
- LibFunc::Func Func =
- Name.startswith("str") ? LibFunc::strcpy_chk : LibFunc::stpcpy_chk;
if (!checkStringCopyLibFuncSignature(Callee, Func, DL))
return nullptr;
@@ -2231,7 +2235,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrCpyChk(CallInst *CI, IRBuilder<> &
*ObjSize = CI->getArgOperand(2);
// __stpcpy_chk(x,x,...) -> x+strlen(x)
- if (!OnlyLowerUnknownSize && Dst == Src) {
+ if (Func == LibFunc::stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) {
Value *StrLen = EmitStrLen(Src, B, DL, TLI);
return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr;
}
@@ -2266,11 +2270,11 @@ Value *FortifiedLibCallSimplifier::optimizeStrCpyChk(CallInst *CI, IRBuilder<> &
return nullptr;
}
-Value *FortifiedLibCallSimplifier::optimizeStrNCpyChk(CallInst *CI, IRBuilder<> &B) {
+Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
+ IRBuilder<> &B,
+ LibFunc::Func Func) {
Function *Callee = CI->getCalledFunction();
StringRef Name = Callee->getName();
- LibFunc::Func Func =
- Name.startswith("str") ? LibFunc::strncpy_chk : LibFunc::stpncpy_chk;
if (!checkStringCopyLibFuncSignature(Callee, Func, DL))
return nullptr;
@@ -2310,10 +2314,10 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
return optimizeMemSetChk(CI, Builder);
case LibFunc::stpcpy_chk:
case LibFunc::strcpy_chk:
- return optimizeStrCpyChk(CI, Builder);
+ return optimizeStrpCpyChk(CI, Builder, Func);
case LibFunc::stpncpy_chk:
case LibFunc::strncpy_chk:
- return optimizeStrNCpyChk(CI, Builder);
+ return optimizeStrpNCpyChk(CI, Builder, Func);
default:
break;
}
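The __stpcpy_chk(x, x, ...) -> x + strlen(x) fold above relies on stpcpy's contract: unlike strcpy, it returns a pointer to the terminating NUL of the destination, so a self-copy's only observable result is dst + strlen(dst). That is also why the fold is now keyed on Func == LibFunc::stpcpy_chk rather than applied to both variants. A short illustration of the contract, assuming a POSIX stpcpy:

    #include <cassert>
    #include <cstring>

    int main() {
      char buf[16] = "hello";
      // stpcpy returns the end of the copied string, dst + strlen(src)...
      char *end = stpcpy(buf, "hello");
      assert(end == buf + strlen(buf));
      // ...so a self-copy can be folded to x + strlen(x).
    }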
diff --git a/lib/Transforms/Utils/SymbolRewriter.cpp b/lib/Transforms/Utils/SymbolRewriter.cpp
index b35a662f17b5..d36283ea3ae1 100644
--- a/lib/Transforms/Utils/SymbolRewriter.cpp
+++ b/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -79,6 +79,19 @@ static cl::list<std::string> RewriteMapFiles("rewrite-map-file",
namespace llvm {
namespace SymbolRewriter {
+void rewriteComdat(Module &M, GlobalObject *GO, const std::string &Source,
+ const std::string &Target) {
+ if (Comdat *CD = GO->getComdat()) {
+ auto &Comdats = M.getComdatSymbolTable();
+
+ Comdat *C = M.getOrInsertComdat(Target);
+ C->setSelectionKind(CD->getSelectionKind());
+ GO->setComdat(C);
+
+ Comdats.erase(Comdats.find(Source));
+ }
+}
+
template <RewriteDescriptor::Type DT, typename ValueType,
ValueType *(llvm::Module::*Get)(StringRef) const>
class ExplicitRewriteDescriptor : public RewriteDescriptor {
@@ -102,10 +115,14 @@ template <RewriteDescriptor::Type DT, typename ValueType,
bool ExplicitRewriteDescriptor<DT, ValueType, Get>::performOnModule(Module &M) {
bool Changed = false;
if (ValueType *S = (M.*Get)(Source)) {
+ if (GlobalObject *GO = dyn_cast<GlobalObject>(S))
+ rewriteComdat(M, GO, Source, Target);
+
if (Value *T = (M.*Get)(Target))
S->setValueName(T->getValueName());
else
S->setName(Target);
+
Changed = true;
}
return Changed;
@@ -145,6 +162,12 @@ performOnModule(Module &M) {
report_fatal_error("unable to transforn " + C.getName() + " in " +
M.getModuleIdentifier() + ": " + Error);
+ if (C.getName() == Name)
+ continue;
+
+ if (GlobalObject *GO = dyn_cast<GlobalObject>(&C))
+ rewriteComdat(M, GO, C.getName(), Name);
+
if (Value *V = (M.*Get)(Name))
C.setValueName(V->getValueName());
else
diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
index 0c2fc0a972b5..7e00a80989dc 100644
--- a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
+++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -35,7 +35,6 @@ void UnifyFunctionExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
// We preserve the non-critical-edgeness property
AU.addPreservedID(BreakCriticalEdgesID);
// This is a cluster of orthogonal Transforms
- AU.addPreserved("mem2reg");
AU.addPreservedID(LowerSwitchID);
}
diff --git a/test/Analysis/ScalarEvolution/nw-sub-is-not-nw-add.ll b/test/Analysis/ScalarEvolution/nw-sub-is-not-nw-add.ll
new file mode 100644
index 000000000000..41b07d5cd537
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/nw-sub-is-not-nw-add.ll
@@ -0,0 +1,41 @@
+; RUN: opt -S -indvars < %s | FileCheck %s
+
+; Check that SCEV does not assume sub nuw X, Y == add nuw X, -Y
+define void @f(i32* %loc) {
+; CHECK-LABEL: @f
+ entry:
+ br label %loop
+
+ loop:
+ %idx = phi i32 [ 6, %entry ], [ %idx.dec, %loop ]
+ store i32 %idx, i32* %loc
+ %idx.dec = sub nuw i32 %idx, 1
+ %cond = icmp uge i32 %idx.dec, 5
+ br i1 %cond, label %loop, label %exit
+; CHECK-NOT: br i1 true, label %loop, label %exit
+
+ exit:
+ ret void
+}
+
+declare void @use_i1(i1)
+
+; Check that SCEV does not assume sub nsw X, Y == add nsw X, -Y
+define void @g(i32 %lim) {
+; CHECK-LABEL: @g
+ entry:
+ br label %loop
+
+ loop:
+ %idx = phi i32 [ -1, %entry ], [ %idx.dec, %loop ]
+ %t = icmp sgt i32 %idx, 0
+; CHECK-NOT: call void @use_i1(i1 false)
+; CHECK: call void @use_i1(i1 %t)
+ call void @use_i1(i1 %t)
+ %idx.dec = sub nsw i32 %idx, -2147483648
+ %cond = icmp eq i32 %idx.dec, %lim
+ br i1 %cond, label %loop, label %exit
+
+ exit:
+ ret void
+}
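The two tests above pin down why SCEV must not rewrite sub nuw/nsw X, Y as add nuw/nsw X, -Y: the wrap flags do not survive negating the operand. In @f, sub nuw i32 6, 1 never wraps, but the rewritten add nuw 6, -1 would assert that 6 + 0xffffffff has no unsigned wrap, which is false; in @g, negating -2147483648 itself overflows i32. A worked instance with the constants from @f:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t idx = 6;
      uint32_t viaSub = idx - 1;             // 5; no unsigned wrap occurred
      uint32_t viaAdd = idx + uint32_t(-1);  // also 5, but only because
                                             // 6 + 4294967295 wrapped mod 2^32
      std::printf("%u %u\n", viaSub, viaAdd);
      // The values match, yet only the subtraction was genuinely wrap-free,
      // so transferring the nuw flag to the addition would be unsound.
    }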
diff --git a/test/Bindings/llvm-c/add_named_metadata_operand.ll b/test/Bindings/llvm-c/add_named_metadata_operand.ll
new file mode 100644
index 000000000000..fcc833d6af03
--- /dev/null
+++ b/test/Bindings/llvm-c/add_named_metadata_operand.ll
@@ -0,0 +1,2 @@
+; RUN: llvm-c-test --add-named-metadata-operand < /dev/null
+; This used to trigger an assertion
diff --git a/test/Bindings/llvm-c/set_metadata.ll b/test/Bindings/llvm-c/set_metadata.ll
new file mode 100644
index 000000000000..0a65b8c47dee
--- /dev/null
+++ b/test/Bindings/llvm-c/set_metadata.ll
@@ -0,0 +1,2 @@
+; RUN: llvm-c-test --set-metadata < /dev/null
+; This used to trigger an assertion
diff --git a/test/CodeGen/AArch64/arm64-platform-reg.ll b/test/CodeGen/AArch64/arm64-platform-reg.ll
index 651c793f73a4..b0d3ee0ff8a3 100644
--- a/test/CodeGen/AArch64/arm64-platform-reg.ll
+++ b/test/CodeGen/AArch64/arm64-platform-reg.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-DARWIN
+; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18
+; RUN: llc -mtriple=arm64-freebsd-gnu -aarch64-reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18
; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
; x18 is reserved as a platform register on Darwin but not on other
@@ -16,11 +17,11 @@ define void @keep_live() {
; CHECK: ldr x18
; CHECK: str x18
-; CHECK-DARWIN-NOT: ldr fp
-; CHECK-DARWIN-NOT: ldr x18
-; CHECK-DARWIN: Spill
-; CHECK-DARWIN-NOT: ldr fp
-; CHECK-DARWIN-NOT: ldr x18
-; CHECK-DARWIN: ret
+; CHECK-RESERVE-X18-NOT: ldr fp
+; CHECK-RESERVE-X18-NOT: ldr x18
+; CHECK-RESERVE-X18: Spill
+; CHECK-RESERVE-X18-NOT: ldr fp
+; CHECK-RESERVE-X18-NOT: ldr x18
+; CHECK-RESERVE-X18: ret
ret void
}
diff --git a/test/CodeGen/AArch64/ghc-cc.ll b/test/CodeGen/AArch64/ghc-cc.ll
new file mode 100644
index 000000000000..505bd5fca66d
--- /dev/null
+++ b/test/CodeGen/AArch64/ghc-cc.ll
@@ -0,0 +1,89 @@
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; Check that the GHC calling convention works (aarch64)
+
+@base = external global i64 ; assigned to register: r19
+@sp = external global i64 ; assigned to register: r20
+@hp = external global i64 ; assigned to register: r21
+@r1 = external global i64 ; assigned to register: r22
+@r2 = external global i64 ; assigned to register: r23
+@r3 = external global i64 ; assigned to register: r24
+@r4 = external global i64 ; assigned to register: r25
+@r5 = external global i64 ; assigned to register: r26
+@r6 = external global i64 ; assigned to register: r27
+@splim = external global i64 ; assigned to register: r28
+
+@f1 = external global float ; assigned to register: s8
+@f2 = external global float ; assigned to register: s9
+@f3 = external global float ; assigned to register: s10
+@f4 = external global float ; assigned to register: s11
+
+@d1 = external global double ; assigned to register: d12
+@d2 = external global double ; assigned to register: d13
+@d3 = external global double ; assigned to register: d14
+@d4 = external global double ; assigned to register: d15
+
+define ghccc i64 @addtwo(i64 %x, i64 %y) nounwind {
+entry:
+ ; CHECK-LABEL: addtwo
+ ; CHECK: add x0, x19, x20
+ ; CHECK-NEXT: ret
+ %0 = add i64 %x, %y
+ ret i64 %0
+}
+
+define void @zap(i64 %a, i64 %b) nounwind {
+entry:
+ ; CHECK-LABEL: zap
+ ; CHECK-NOT: mov {{x[0-9]+}}, sp
+ ; CHECK: bl addtwo
+ ; CHECK-NEXT: bl foo
+ %0 = call ghccc i64 @addtwo(i64 %a, i64 %b)
+ call void @foo() nounwind
+ ret void
+}
+
+define ghccc void @foo_i64 () nounwind {
+entry:
+ ; CHECK-LABEL: foo_i64
+ ; CHECK: adrp {{x[0-9]+}}, base
+ ; CHECK-NEXT: ldr x19, [{{x[0-9]+}}, :lo12:base]
+ ; CHECK-NEXT: bl bar_i64
+ ; CHECK-NEXT: ret
+
+ %0 = load i64* @base
+ tail call ghccc void @bar_i64( i64 %0 ) nounwind
+ ret void
+}
+
+define ghccc void @foo_float () nounwind {
+entry:
+ ; CHECK-LABEL: foo_float
+ ; CHECK: adrp {{x[0-9]+}}, f1
+ ; CHECK-NEXT: ldr s8, [{{x[0-9]+}}, :lo12:f1]
+ ; CHECK-NEXT: bl bar_float
+ ; CHECK-NEXT: ret
+
+ %0 = load float* @f1
+ tail call ghccc void @bar_float( float %0 ) nounwind
+ ret void
+}
+
+define ghccc void @foo_double () nounwind {
+entry:
+ ; CHECK-LABEL: foo_double
+ ; CHECK: adrp {{x[0-9]+}}, d1
+ ; CHECK-NEXT: ldr d12, [{{x[0-9]+}}, :lo12:d1]
+ ; CHECK-NEXT: bl bar_double
+ ; CHECK-NEXT: ret
+
+ %0 = load double* @d1
+ tail call ghccc void @bar_double( double %0 ) nounwind
+ ret void
+}
+
+declare ghccc void @foo ()
+
+declare ghccc void @bar_i64 (i64)
+declare ghccc void @bar_float (float)
+declare ghccc void @bar_double (double)
diff --git a/test/CodeGen/ARM/2015-01-21-thumbv4t-ldstr-opt.ll b/test/CodeGen/ARM/2015-01-21-thumbv4t-ldstr-opt.ll
new file mode 100644
index 000000000000..f3cc3d82121f
--- /dev/null
+++ b/test/CodeGen/ARM/2015-01-21-thumbv4t-ldstr-opt.ll
@@ -0,0 +1,55 @@
+; RUN: llc -mtriple=thumbv4t-none--eabi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V4T
+; RUN: llc -mtriple=thumbv6m-none--eabi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V6M
+
+; CHECK-LABEL: foo
+define i32 @foo(i32 %z, ...) #0 {
+entry:
+ %a = alloca i32, align 4
+ %b = alloca i32, align 4
+ %c = alloca i32, align 4
+ %d = alloca i32, align 4
+ %e = alloca i32, align 4
+ %f = alloca i32, align 4
+ %g = alloca i32, align 4
+ %h = alloca i32, align 4
+
+ store i32 1, i32* %a, align 4
+ store i32 2, i32* %b, align 4
+ store i32 3, i32* %c, align 4
+ store i32 4, i32* %d, align 4
+ store i32 5, i32* %e, align 4
+ store i32 6, i32* %f, align 4
+ store i32 7, i32* %g, align 4
+ store i32 8, i32* %h, align 4
+
+ %0 = load i32* %a, align 4
+ %1 = load i32* %b, align 4
+ %2 = load i32* %c, align 4
+ %3 = load i32* %d, align 4
+ %4 = load i32* %e, align 4
+ %5 = load i32* %f, align 4
+ %6 = load i32* %g, align 4
+ %7 = load i32* %h, align 4
+
+ %add = add nsw i32 %0, %1
+ %add4 = add nsw i32 %add, %2
+ %add5 = add nsw i32 %add4, %3
+ %add6 = add nsw i32 %add5, %4
+ %add7 = add nsw i32 %add6, %5
+ %add8 = add nsw i32 %add7, %6
+ %add9 = add nsw i32 %add8, %7
+
+ %addz = add nsw i32 %add9, %z
+ call void @llvm.va_start(i8* null)
+ ret i32 %addz
+
+; CHECK: sub sp, #40
+; CHECK-NEXT: add [[BASE:r[0-9]]], sp, #8
+
+; CHECK-V4T: movs [[NEWBASE:r[0-9]]], [[BASE]]
+; CHECK-V6M: mov [[NEWBASE:r[0-9]]], [[BASE]]
+; CHECK-NEXT: adds [[NEWBASE]], #8
+; CHECK-NEXT: ldm [[NEWBASE]],
+}
+
+declare void @llvm.va_start(i8*) nounwind
diff --git a/test/CodeGen/Mips/fcmp.ll b/test/CodeGen/Mips/fcmp.ll
index b7759831c5a2..8e83b0064ed9 100644
--- a/test/CodeGen/Mips/fcmp.ll
+++ b/test/CodeGen/Mips/fcmp.ll
@@ -781,3 +781,93 @@ define i32 @true_f64(double %a, double %b) nounwind {
%2 = zext i1 %1 to i32
ret i32 %2
}
+
+; The optimizers sometimes produce setlt instead of setolt/setult.
+define float @bug1_f32(float %angle, float %at) #0 {
+entry:
+; ALL-LABEL: bug1_f32:
+
+; 32-C-DAG: add.s $[[T0:f[0-9]+]], $f14, $f12
+; 32-C-DAG: lwc1 $[[T1:f[0-9]+]], %lo($CPI32_0)(
+; 32-C-DAG: c.ole.s $[[T0]], $[[T1]]
+; 32-C-DAG: bc1t
+
+; 32-CMP-DAG: add.s $[[T0:f[0-9]+]], $f14, $f12
+; 32-CMP-DAG: lwc1 $[[T1:f[0-9]+]], %lo($CPI32_0)(
+; 32-CMP-DAG: cmp.le.s $[[T2:f[0-9]+]], $[[T0]], $[[T1]]
+; 32-CMP-DAG: mfc1 $[[T3:[0-9]+]], $[[T2]]
+; FIXME: This instruction is redundant.
+; 32-CMP-DAG: andi $[[T4:[0-9]+]], $[[T3]], 1
+; 32-CMP-DAG: bnez $[[T4]],
+
+; 64-C-DAG: add.s $[[T0:f[0-9]+]], $f13, $f12
+; 64-C-DAG: lwc1 $[[T1:f[0-9]+]], %got_ofst($CPI32_0)(
+; 64-C-DAG: c.ole.s $[[T0]], $[[T1]]
+; 64-C-DAG: bc1t
+
+; 64-CMP-DAG: add.s $[[T0:f[0-9]+]], $f13, $f12
+; 64-CMP-DAG: lwc1 $[[T1:f[0-9]+]], %got_ofst($CPI32_0)(
+; 64-CMP-DAG: cmp.le.s $[[T2:f[0-9]+]], $[[T0]], $[[T1]]
+; 64-CMP-DAG: mfc1 $[[T3:[0-9]+]], $[[T2]]
+; FIXME: This instruction is redundant.
+; 64-CMP-DAG: andi $[[T4:[0-9]+]], $[[T3]], 1
+; 64-CMP-DAG: bnez $[[T4]],
+
+ %add = fadd fast float %at, %angle
+ %cmp = fcmp ogt float %add, 1.000000e+00
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %sub = fadd fast float %add, -1.000000e+00
+ br label %if.end
+
+if.end:
+ %theta.0 = phi float [ %sub, %if.then ], [ %add, %entry ]
+ ret float %theta.0
+}
+
+; The optimizers sometimes produce setlt instead of setolt/setult.
+define double @bug1_f64(double %angle, double %at) #0 {
+entry:
+; ALL-LABEL: bug1_f64:
+
+; 32-C-DAG: add.d $[[T0:f[0-9]+]], $f14, $f12
+; 32-C-DAG: ldc1 $[[T1:f[0-9]+]], %lo($CPI33_0)(
+; 32-C-DAG: c.ole.d $[[T0]], $[[T1]]
+; 32-C-DAG: bc1t
+
+; 32-CMP-DAG: add.d $[[T0:f[0-9]+]], $f14, $f12
+; 32-CMP-DAG: ldc1 $[[T1:f[0-9]+]], %lo($CPI33_0)(
+; 32-CMP-DAG: cmp.le.d $[[T2:f[0-9]+]], $[[T0]], $[[T1]]
+; 32-CMP-DAG: mfc1 $[[T3:[0-9]+]], $[[T2]]
+; FIXME: This instruction is redundant.
+; 32-CMP-DAG: andi $[[T4:[0-9]+]], $[[T3]], 1
+; 32-CMP-DAG: bnez $[[T4]],
+
+; 64-C-DAG: add.d $[[T0:f[0-9]+]], $f13, $f12
+; 64-C-DAG: ldc1 $[[T1:f[0-9]+]], %got_ofst($CPI33_0)(
+; 64-C-DAG: c.ole.d $[[T0]], $[[T1]]
+; 64-C-DAG: bc1t
+
+; 64-CMP-DAG: add.d $[[T0:f[0-9]+]], $f13, $f12
+; 64-CMP-DAG: ldc1 $[[T1:f[0-9]+]], %got_ofst($CPI33_0)(
+; 64-CMP-DAG: cmp.le.d $[[T2:f[0-9]+]], $[[T0]], $[[T1]]
+; 64-CMP-DAG: mfc1 $[[T3:[0-9]+]], $[[T2]]
+; FIXME: This instruction is redundant.
+; 64-CMP-DAG: andi $[[T4:[0-9]+]], $[[T3]], 1
+; 64-CMP-DAG: bnez $[[T4]],
+
+ %add = fadd fast double %at, %angle
+ %cmp = fcmp ogt double %add, 1.000000e+00
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %sub = fadd fast double %add, -1.000000e+00
+ br label %if.end
+
+if.end:
+ %theta.0 = phi double [ %sub, %if.then ], [ %add, %entry ]
+ ret double %theta.0
+}
+
+attributes #0 = { nounwind readnone "no-nans-fp-math"="true" }
diff --git a/test/CodeGen/R600/basic-loop.ll b/test/CodeGen/R600/basic-loop.ll
index 72737ae273e6..9d0509b38d8a 100644
--- a/test/CodeGen/R600/basic-loop.ll
+++ b/test/CodeGen/R600/basic-loop.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s
; CHECK-LABEL: {{^}}test_loop:
diff --git a/test/CodeGen/R600/ctpop.ll b/test/CodeGen/R600/ctpop.ll
index a47bc876cb96..c64f443ad697 100644
--- a/test/CodeGen/R600/ctpop.ll
+++ b/test/CodeGen/R600/ctpop.ll
@@ -24,8 +24,7 @@ define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; XXX - Why 0 in register?
; FUNC-LABEL: {{^}}v_ctpop_i32:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_mov_b32_e32 [[VZERO:v[0-9]+]], 0
-; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VZERO]]
+; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
@@ -40,8 +39,7 @@ define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noali
; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
; SI: buffer_load_dword [[VAL0:v[0-9]+]],
; SI: buffer_load_dword [[VAL1:v[0-9]+]],
-; SI: v_mov_b32_e32 [[VZERO:v[0-9]+]], 0
-; SI: v_bcnt_u32_b32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]]
+; SI: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
@@ -73,8 +71,8 @@ define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(
}
; FUNC-LABEL: {{^}}v_ctpop_v2i32:
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
; SI: s_endpgm
; EG: BCNT_INT
@@ -87,10 +85,10 @@ define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrs
}
; FUNC-LABEL: {{^}}v_ctpop_v4i32:
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
; SI: s_endpgm
; EG: BCNT_INT
@@ -105,14 +103,14 @@ define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrs
}
; FUNC-LABEL: {{^}}v_ctpop_v8i32:
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
; SI: s_endpgm
; EG: BCNT_INT
@@ -131,22 +129,22 @@ define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrs
}
; FUNC-LABEL: {{^}}v_ctpop_v16i32:
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
+; SI: v_bcnt_u32_b32_e64
; SI: s_endpgm
; EG: BCNT_INT
diff --git a/test/CodeGen/R600/ctpop64.ll b/test/CodeGen/R600/ctpop64.ll
index 8dfe571d3477..9758ac96ea9b 100644
--- a/test/CodeGen/R600/ctpop64.ll
+++ b/test/CodeGen/R600/ctpop64.ll
@@ -21,8 +21,7 @@ define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
; FUNC-LABEL: {{^}}v_ctpop_i64:
; SI: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
-; SI: v_mov_b32_e32 [[VZERO:v[0-9]+]], 0
-; SI: v_bcnt_u32_b32_e32 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], [[VZERO]]
+; SI: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
diff --git a/test/CodeGen/R600/ds_read2st64.ll b/test/CodeGen/R600/ds_read2st64.ll
index 24834af20404..efd875e93176 100644
--- a/test/CodeGen/R600/ds_read2st64.ll
+++ b/test/CodeGen/R600/ds_read2st64.ll
@@ -65,8 +65,8 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add
; SI-LABEL: @simple_read2st64_f32_over_max_offset
; SI-NOT: ds_read2st64_b32
-; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
; SI: s_endpgm
define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
@@ -197,8 +197,8 @@ define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double a
; SI-LABEL: @simple_read2st64_f64_over_max_offset
; SI-NOT: ds_read2st64_b64
-; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
; SI: s_endpgm
define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll
index 35cfb03d39b4..d76e8a341c6f 100644
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll
@@ -1,16 +1,27 @@
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+declare float @llvm.fabs.f32(float) #0
+
; FUNC-LABEL: {{^}}fp_to_sint_i32:
; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
; SI: v_cvt_i32_f32_e32
; SI: s_endpgm
-define void @fp_to_sint_i32 (i32 addrspace(1)* %out, float %in) {
+define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) {
%conv = fptosi float %in to i32
store i32 %conv, i32 addrspace(1)* %out
ret void
}
+; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs:
+; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
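+; (The fabs folds into the |...| source modifier of the e64 form instead of
+; being emitted as a separate instruction.)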
+define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) {
+ %in.fabs = call float @llvm.fabs.f32(float %in) #0
+ %conv = fptosi float %in.fabs to i32
+ store i32 %conv, i32 addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}fp_to_sint_v2i32:
; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
@@ -214,3 +225,5 @@ define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
ret void
}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/R600/hsa.ll b/test/CodeGen/R600/hsa.ll
index 2e79866362ac..5ce3beaa16c0 100644
--- a/test/CodeGen/R600/hsa.ll
+++ b/test/CodeGen/R600/hsa.ll
@@ -1,6 +1,8 @@
; RUN: llc < %s -mtriple=r600--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
; HSA: {{^}}simple:
+; HSA: .section .hsa.version
+; HSA-NEXT: .ascii "HSA Code Unit:0.0:AMD:0.1:GFX8.1:0"
; Make sure we are setting the ATC bit:
; HSA: s_mov_b32 s[[HI:[0-9]]], 0x100f000
; HSA: buffer_store_dword v{{[0-9]+}}, s[0:[[HI]]], 0
diff --git a/test/CodeGen/R600/misaligned-load.ll b/test/CodeGen/R600/misaligned-load.ll
new file mode 100644
index 000000000000..6290ca09d502
--- /dev/null
+++ b/test/CodeGen/R600/misaligned-load.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
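+; An align-1 i64 load from local memory cannot use a single ds_read_b64, so
+; it is legalized into eight individual byte loads.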
+; SI: @byte_aligned_load64
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: s_endpgm
+define void @byte_aligned_load64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) {
+entry:
+ %0 = load i64 addrspace(3)* %in, align 1
+ store i64 %0, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/scratch-buffer.ll b/test/CodeGen/R600/scratch-buffer.ll
new file mode 100644
index 000000000000..740328a495da
--- /dev/null
+++ b/test/CodeGen/R600/scratch-buffer.ll
@@ -0,0 +1,86 @@
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s
+
+; When a frame index offset is more than 12 bits, make sure we don't store
+; it in mubuf's offset field.
+
+; Also, make sure we use the same register for storing the scratch buffer address
+; for both stores. This register is allocated by the register scavenger, so we
+; should be able to reuse the same register for each scratch buffer access.
+
+; CHECK-LABEL: {{^}}legal_offset_fi:
+; CHECK: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0{{$}}
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen
+; CHECK: v_mov_b32_e32 [[OFFSET]], 0x8000
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
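+; Each alloca is [8192 x i32] = 32768 bytes, so the second buffer sits at
+; offset 0x8000, which does not fit in the 12-bit offset field and has to be
+; materialized in [[OFFSET]] instead.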
+
+define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) {
+entry:
+ %scratch0 = alloca [8192 x i32]
+ %scratch1 = alloca [8192 x i32]
+
+ %scratchptr0 = getelementptr [8192 x i32]* %scratch0, i32 0, i32 0
+ store i32 1, i32* %scratchptr0
+
+ %scratchptr1 = getelementptr [8192 x i32]* %scratch1, i32 0, i32 0
+ store i32 2, i32* %scratchptr1
+
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %if, label %else
+
+if:
+ %if_ptr = getelementptr [8192 x i32]* %scratch0, i32 0, i32 %if_offset
+ %if_value = load i32* %if_ptr
+ br label %done
+
+else:
+ %else_ptr = getelementptr [8192 x i32]* %scratch1, i32 0, i32 %else_offset
+ %else_value = load i32* %else_ptr
+ br label %done
+
+done:
+ %value = phi i32 [%if_value, %if], [%else_value, %else]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}legal_offset_fi_offset:
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen
+; CHECK: v_add_i32_e32 [[OFFSET:v[0-9]+]], 0x8000
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+
+define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) {
+entry:
+ %scratch0 = alloca [8192 x i32]
+ %scratch1 = alloca [8192 x i32]
+
+ %offset0 = load i32 addrspace(1)* %offsets
+ %scratchptr0 = getelementptr [8192 x i32]* %scratch0, i32 0, i32 %offset0
+ store i32 %offset0, i32* %scratchptr0
+
+ %offsetptr1 = getelementptr i32 addrspace(1)* %offsets, i32 1
+ %offset1 = load i32 addrspace(1)* %offsetptr1
+ %scratchptr1 = getelementptr [8192 x i32]* %scratch1, i32 0, i32 %offset1
+ store i32 %offset1, i32* %scratchptr1
+
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %if, label %else
+
+if:
+ %if_ptr = getelementptr [8192 x i32]* %scratch0, i32 0, i32 %if_offset
+ %if_value = load i32* %if_ptr
+ br label %done
+
+else:
+ %else_ptr = getelementptr [8192 x i32]* %scratch1, i32 0, i32 %else_offset
+ %else_value = load i32* %else_ptr
+ br label %done
+
+done:
+ %value = phi i32 [%if_value, %if], [%else_value, %else]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
diff --git a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
index b2f4a9ff05e1..f6dcb388248a 100644
--- a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
@@ -51,8 +51,8 @@ define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspac
; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
-; CI: buffer_store_dword
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: buffer_store_dword
define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
diff --git a/test/CodeGen/SPARC/2008-10-10-InlineAsmMemoryOperand.ll b/test/CodeGen/SPARC/2008-10-10-InlineAsmMemoryOperand.ll
index 373a1967307a..e8315f17ebb6 100644
--- a/test/CodeGen/SPARC/2008-10-10-InlineAsmMemoryOperand.ll
+++ b/test/CodeGen/SPARC/2008-10-10-InlineAsmMemoryOperand.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=sparc -no-integrated-as
+; RUN: llc < %s -march=sparc
; PR 1557
target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-f128:128:128"
diff --git a/test/CodeGen/SPARC/inlineasm.ll b/test/CodeGen/SPARC/inlineasm.ll
index 526cde8de8b4..2650533b7fec 100644
--- a/test/CodeGen/SPARC/inlineasm.ll
+++ b/test/CodeGen/SPARC/inlineasm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=sparc -no-integrated-as <%s | FileCheck %s
+; RUN: llc -march=sparc <%s | FileCheck %s
; CHECK-LABEL: test_constraint_r
; CHECK: add %o1, %o0, %o0
diff --git a/test/CodeGen/SPARC/mult-alt-generic-sparc.ll b/test/CodeGen/SPARC/mult-alt-generic-sparc.ll
index 6a67616d53be..6013b17d9372 100644
--- a/test/CodeGen/SPARC/mult-alt-generic-sparc.ll
+++ b/test/CodeGen/SPARC/mult-alt-generic-sparc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=sparc -no-integrated-as
+; RUN: llc < %s -march=sparc
; ModuleID = 'mult-alt-generic.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32"
target triple = "sparc"
diff --git a/test/DebugInfo/Mips/fn-call-line.ll b/test/DebugInfo/Mips/fn-call-line.ll
new file mode 100644
index 000000000000..14cd8c9d987f
--- /dev/null
+++ b/test/DebugInfo/Mips/fn-call-line.ll
@@ -0,0 +1,84 @@
+; RUN: llc -mtriple=mips-linux-gnu -filetype=asm -asm-verbose=0 -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=mips-linux-gnu -filetype=obj -O0 < %s | llvm-dwarfdump -debug-dump=line - | FileCheck %s --check-prefix=INT
+
+; Mips used to generate 'jumpy' debug line info around calls. The address
+; calculation for each call to f1() would share the same line info so it would
+; emit output of the form:
+; .loc $first_call_location
+; .. address calculation ..
+; .. function call ..
+; .. address calculation ..
+; .loc $second_call_location
+; .. function call ..
+; .loc $first_call_location
+; .. address calculation ..
+; .loc $third_call_location
+; .. function call ..
+; ...
+; which would cause confusing stepping behaviour for the end user.
+;
+; This test checks that we emit more user-friendly debug line info of the form:
+; .loc $first_call_location
+; .. address calculation ..
+; .. function call ..
+; .loc $second_call_location
+; .. address calculation ..
+; .. function call ..
+; .loc $third_call_location
+; .. address calculation ..
+; .. function call ..
+; ...
+;
+; Generated with clang from fn-call-line.c:
+; void f1();
+; void f2() {
+; f1();
+; f1();
+; }
+
+; CHECK: .loc 1 3 3
+; CHECK-NOT: .loc
+; CHECK: %call16(f1)
+; CHECK-NOT: .loc
+; CHECK: .loc 1 4 3
+; CHECK-NOT: .loc
+; CHECK: %call16(f1)
+
+; INT: {{^}}Address
+; INT: -----
+; INT-NEXT: 2 0 1 0 0 is_stmt{{$}}
+; INT-NEXT: 3 3 1 0 0 is_stmt prologue_end{{$}}
+; INT-NEXT: 4 3 1 0 0 is_stmt{{$}}
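+; (Line 2 is the function-open line, line 3 is the first call and carries
+; prologue_end, and line 4 is the second call, so each call site gets its own
+; line-table row.)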
+
+
+; Function Attrs: nounwind uwtable
+define void @f2() #0 {
+entry:
+ call void (...)* @f1(), !dbg !11
+ call void (...)* @f1(), !dbg !12
+ ret void, !dbg !13
+}
+
+declare void @f1(...) #1
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+!0 = !{!"0x11\0012\00clang version 3.7.0 (trunk 226641)\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/fn-call-line.c] [DW_LANG_C99]
+!1 = !{!"fn-call-line.c", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00f2\00f2\00\002\000\001\000\000\000\000\002", !1, !5, !6, null, void ()* @f2, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [f2]
+!5 = !{!"0x29", !1} ; [ DW_TAG_file_type ] [/tmp/dbginfo/fn-call-line.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.7.0 (trunk 226641)"}
+!11 = !MDLocation(line: 3, column: 3, scope: !4)
+!12 = !MDLocation(line: 4, column: 3, scope: !4)
+!13 = !MDLocation(line: 5, column: 1, scope: !4)
diff --git a/test/DebugInfo/X86/decl-derived-member.ll b/test/DebugInfo/X86/decl-derived-member.ll
index 8c15c53e8141..c4f3dd90ffd5 100644
--- a/test/DebugInfo/X86/decl-derived-member.ll
+++ b/test/DebugInfo/X86/decl-derived-member.ll
@@ -7,8 +7,9 @@
; struct base {
; virtual ~base();
; };
+; typedef base base_type;
; struct foo {
-; base b;
+; base_type b;
; };
; foo f;
@@ -20,40 +21,47 @@
%struct.foo = type { %struct.base }
%struct.base = type { i32 (...)** }
+
+$_ZN3fooC2Ev = comdat any
+
+$_ZN3fooD2Ev = comdat any
+
+$_ZN4baseC2Ev = comdat any
+
@f = global %struct.foo zeroinitializer, align 8
@__dso_handle = external global i8
@_ZTV4base = external unnamed_addr constant [4 x i8*]
-@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }]
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_decl_derived_member.cpp, i8* null }]
define internal void @__cxx_global_var_init() section ".text.startup" {
entry:
- call void @_ZN3fooC2Ev(%struct.foo* @f) #2, !dbg !35
- %0 = call i32 @__cxa_atexit(void (i8*)* bitcast (void (%struct.foo*)* @_ZN3fooD2Ev to void (i8*)*), i8* bitcast (%struct.foo* @f to i8*), i8* @__dso_handle) #2, !dbg !35
- ret void, !dbg !35
+ call void @_ZN3fooC2Ev(%struct.foo* @f) #2, !dbg !33
+ %0 = call i32 @__cxa_atexit(void (i8*)* bitcast (void (%struct.foo*)* @_ZN3fooD2Ev to void (i8*)*), i8* bitcast (%struct.foo* @f to i8*), i8* @__dso_handle) #2, !dbg !33
+ ret void, !dbg !33
}
; Function Attrs: inlinehint nounwind uwtable
-define linkonce_odr void @_ZN3fooC2Ev(%struct.foo* %this) unnamed_addr #0 align 2 {
+define linkonce_odr void @_ZN3fooC2Ev(%struct.foo* %this) unnamed_addr #0 comdat align 2 {
entry:
%this.addr = alloca %struct.foo*, align 8
store %struct.foo* %this, %struct.foo** %this.addr, align 8
- call void @llvm.dbg.declare(metadata %struct.foo** %this.addr, metadata !36, metadata !{!"0x102"}), !dbg !38
+ call void @llvm.dbg.declare(metadata %struct.foo** %this.addr, metadata !34, metadata !36), !dbg !37
%this1 = load %struct.foo** %this.addr
- %b = getelementptr inbounds %struct.foo* %this1, i32 0, i32 0, !dbg !39
- call void @_ZN4baseC2Ev(%struct.base* %b) #2, !dbg !39
- ret void, !dbg !39
+ %b = getelementptr inbounds %struct.foo* %this1, i32 0, i32 0, !dbg !38
+ call void @_ZN4baseC2Ev(%struct.base* %b) #2, !dbg !38
+ ret void, !dbg !38
}
; Function Attrs: inlinehint uwtable
-define linkonce_odr void @_ZN3fooD2Ev(%struct.foo* %this) unnamed_addr #1 align 2 {
+define linkonce_odr void @_ZN3fooD2Ev(%struct.foo* %this) unnamed_addr #1 comdat align 2 {
entry:
%this.addr = alloca %struct.foo*, align 8
store %struct.foo* %this, %struct.foo** %this.addr, align 8
- call void @llvm.dbg.declare(metadata %struct.foo** %this.addr, metadata !40, metadata !{!"0x102"}), !dbg !41
+ call void @llvm.dbg.declare(metadata %struct.foo** %this.addr, metadata !39, metadata !36), !dbg !40
%this1 = load %struct.foo** %this.addr
- %b = getelementptr inbounds %struct.foo* %this1, i32 0, i32 0, !dbg !42
- call void @_ZN4baseD1Ev(%struct.base* %b), !dbg !42
- ret void, !dbg !44
+ %b = getelementptr inbounds %struct.foo* %this1, i32 0, i32 0, !dbg !41
+ call void @_ZN4baseD1Ev(%struct.base* %b), !dbg !41
+ ret void, !dbg !43
}
; Function Attrs: nounwind
@@ -62,24 +70,24 @@ declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #2
; Function Attrs: nounwind readnone
declare void @llvm.dbg.declare(metadata, metadata, metadata) #3
-declare void @_ZN4baseD1Ev(%struct.base*) #4
-
; Function Attrs: inlinehint nounwind uwtable
-define linkonce_odr void @_ZN4baseC2Ev(%struct.base* %this) unnamed_addr #0 align 2 {
+define linkonce_odr void @_ZN4baseC2Ev(%struct.base* %this) unnamed_addr #0 comdat align 2 {
entry:
%this.addr = alloca %struct.base*, align 8
store %struct.base* %this, %struct.base** %this.addr, align 8
- call void @llvm.dbg.declare(metadata %struct.base** %this.addr, metadata !45, metadata !{!"0x102"}), !dbg !47
+ call void @llvm.dbg.declare(metadata %struct.base** %this.addr, metadata !44, metadata !36), !dbg !46
%this1 = load %struct.base** %this.addr
- %0 = bitcast %struct.base* %this1 to i8***, !dbg !48
- store i8** getelementptr inbounds ([4 x i8*]* @_ZTV4base, i64 0, i64 2), i8*** %0, !dbg !48
- ret void, !dbg !48
+ %0 = bitcast %struct.base* %this1 to i32 (...)***, !dbg !47
+ store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*]* @_ZTV4base, i64 0, i64 2) to i32 (...)**), i32 (...)*** %0, !dbg !47
+ ret void, !dbg !47
}
-define internal void @_GLOBAL__I_a() section ".text.startup" {
+declare void @_ZN4baseD1Ev(%struct.base*) #4
+
+define internal void @_GLOBAL__sub_I_decl_derived_member.cpp() section ".text.startup" {
entry:
- call void @__cxx_global_var_init(), !dbg !49
- ret void, !dbg !49
+ call void @__cxx_global_var_init(), !dbg !48
+ ret void
}
attributes #0 = { inlinehint nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -89,56 +97,55 @@ attributes #3 = { nounwind readnone }
attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!32, !33}
-!llvm.ident = !{!34}
+!llvm.module.flags = !{!30, !31}
+!llvm.ident = !{!32}
-!0 = !{!"0x11\004\00clang version 3.5.0 (trunk 203673) (llvm/trunk 203681)\000\00\000\00\001", !1, !2, !3, !8, !30, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.cc] [DW_LANG_C_plus_plus]
-!1 = !{!"foo.cc", !"/usr/local/google/home/echristo"}
+!0 = !{!"0x11\004\00clang version 3.7.0 (trunk 227104) (llvm/trunk 227103)\000\00\000\00\001", !1, !2, !3, !9, !28, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/decl-derived-member.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"decl-derived-member.cpp", !"/tmp/dbginfo"}
!2 = !{}
-!3 = !{!4, !7}
+!3 = !{!4, !8}
!4 = !{!"0x13\00foo\005\0064\0064\000\000\000", !1, null, null, !5, null, null, !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 5, size 64, align 64, offset 0] [def] [from ]
!5 = !{!6}
-!6 = !{!"0xd\00b\006\0064\0064\000\000", !1, !"_ZTS3foo", !"_ZTS4base"} ; [ DW_TAG_member ] [b] [line 6, size 64, align 64, offset 0] [from _ZTS4base]
-!7 = !{!"0x13\00base\001\000\000\000\004\000", !1, null, null, null, null, null, !"_ZTS4base"} ; [ DW_TAG_structure_type ] [base] [line 1, size 0, align 0, offset 0] [decl] [from ]
-!8 = !{!9, !13, !19, !22, !28}
-!9 = !{!"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\009\001\001\000\006\00256\000\009", !1, !10, !11, null, void ()* @__cxx_global_var_init, null, null, !2} ; [ DW_TAG_subprogram ] [line 9] [local] [def] [__cxx_global_var_init]
-!10 = !{!"0x29", !1} ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/foo.cc]
-!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = !{null}
-!13 = !{!"0x2e\00~foo\00~foo\00_ZN3fooD2Ev\005\000\001\000\006\00320\000\005", !1, !"_ZTS3foo", !14, null, void (%struct.foo*)* @_ZN3fooD2Ev, null, !17, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [~foo]
-!14 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = !{null, !16}
-!16 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
-!17 = !{!"0x2e\00~foo\00~foo\00\000\000\000\000\006\00320\000\000", null, !"_ZTS3foo", !14, null, null, null, i32 0, !18} ; [ DW_TAG_subprogram ] [line 0] [~foo]
-!18 = !{i32 786468}
-!19 = !{!"0x2e\00foo\00foo\00_ZN3fooC2Ev\005\000\001\000\006\00320\000\005", !1, !"_ZTS3foo", !14, null, void (%struct.foo*)* @_ZN3fooC2Ev, null, !20, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
-!20 = !{!"0x2e\00foo\00foo\00\000\000\000\000\006\00320\000\000", null, !"_ZTS3foo", !14, null, null, null, i32 0, !21} ; [ DW_TAG_subprogram ] [line 0] [foo]
-!21 = !{i32 786468}
-!22 = !{!"0x2e\00base\00base\00_ZN4baseC2Ev\001\000\001\000\006\00320\000\001", !1, !"_ZTS4base", !23, null, void (%struct.base*)* @_ZN4baseC2Ev, null, !26, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [base]
-!23 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !24, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!24 = !{null, !25}
-!25 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS4base"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4base]
-!26 = !{!"0x2e\00base\00base\00\000\000\000\000\006\00320\000\000", null, !"_ZTS4base", !23, null, null, null, i32 0, !27} ; [ DW_TAG_subprogram ] [line 0] [base]
-!27 = !{i32 786468}
-!28 = !{!"0x2e\00\00\00_GLOBAL__I_a\001\001\001\000\006\0064\000\001", !1, !10, !29, null, void ()* @_GLOBAL__I_a, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [local] [def]
-!29 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!30 = !{!31}
-!31 = !{!"0x34\00f\00f\00\009\000\001", null, !10, !4, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 9] [def]
-!32 = !{i32 2, !"Dwarf Version", i32 4}
-!33 = !{i32 1, !"Debug Info Version", i32 2}
-!34 = !{!"clang version 3.5.0 (trunk 203673) (llvm/trunk 203681)"}
-!35 = !MDLocation(line: 9, scope: !9)
-!36 = !{!"0x101\00this\0016777216\001088", !19, null, !37} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!37 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
-!38 = !MDLocation(line: 0, scope: !19)
-!39 = !MDLocation(line: 5, scope: !19)
-!40 = !{!"0x101\00this\0016777216\001088", !13, null, !37} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!41 = !MDLocation(line: 0, scope: !13)
-!42 = !MDLocation(line: 5, scope: !43)
-!43 = !{!"0xb\005\000\000", !1, !13} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cc]
-!44 = !MDLocation(line: 5, scope: !13)
-!45 = !{!"0x101\00this\0016777216\001088", !22, null, !46} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!46 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS4base"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4base]
-!47 = !MDLocation(line: 0, scope: !22)
-!48 = !MDLocation(line: 1, scope: !22)
-!49 = !MDLocation(line: 1, scope: !28)
+!6 = !{!"0xd\00b\006\0064\0064\000\000", !1, !"_ZTS3foo", !7} ; [ DW_TAG_member ] [b] [line 6, size 64, align 64, offset 0] [from base_type]
+!7 = !{!"0x16\00base_type\004\000\000\000\000", !1, null, !"_ZTS4base"} ; [ DW_TAG_typedef ] [base_type] [line 4, size 0, align 0, offset 0] [from _ZTS4base]
+!8 = !{!"0x13\00base\001\000\000\000\004\000", !1, null, null, null, null, null, !"_ZTS4base"} ; [ DW_TAG_structure_type ] [base] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!9 = !{!10, !14, !19, !24, !26}
+!10 = !{!"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\008\001\001\000\000\00256\000\008", !1, !11, !12, null, void ()* @__cxx_global_var_init, null, null, !2} ; [ DW_TAG_subprogram ] [line 8] [local] [def] [__cxx_global_var_init]
+!11 = !{!"0x29", !1} ; [ DW_TAG_file_type ] [/tmp/dbginfo/decl-derived-member.cpp]
+!12 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = !{null}
+!14 = !{!"0x2e\00foo\00foo\00_ZN3fooC2Ev\005\000\001\000\000\00320\000\005", !1, !"_ZTS3foo", !15, null, void (%struct.foo*)* @_ZN3fooC2Ev, null, !18, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
+!15 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null, !17}
+!17 = !{!"0xf\00\000\0064\0064\000\001088\00", null, null, !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
+!18 = !{!"0x2e\00foo\00foo\00\000\000\000\000\000\00320\000\000", null, !"_ZTS3foo", !15, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 0] [foo]
+!19 = !{!"0x2e\00base\00base\00_ZN4baseC2Ev\001\000\001\000\000\00320\000\001", !1, !"_ZTS4base", !20, null, void (%struct.base*)* @_ZN4baseC2Ev, null, !23, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [base]
+!20 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!21 = !{null, !22}
+!22 = !{!"0xf\00\000\0064\0064\000\001088\00", null, null, !"_ZTS4base"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4base]
+!23 = !{!"0x2e\00base\00base\00\000\000\000\000\000\00320\000\000", null, !"_ZTS4base", !20, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 0] [base]
+!24 = !{!"0x2e\00~foo\00~foo\00_ZN3fooD2Ev\005\000\001\000\000\00320\000\005", !1, !"_ZTS3foo", !15, null, void (%struct.foo*)* @_ZN3fooD2Ev, null, !25, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [~foo]
+!25 = !{!"0x2e\00~foo\00~foo\00\000\000\000\000\000\00320\000\000", null, !"_ZTS3foo", !15, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 0] [~foo]
+!26 = !{!"0x2e\00\00\00_GLOBAL__sub_I_decl_derived_member.cpp\000\001\001\000\000\0064\000\000", !1, !11, !27, null, void ()* @_GLOBAL__sub_I_decl_derived_member.cpp, null, null, !2} ; [ DW_TAG_subprogram ] [line 0] [local] [def]
+!27 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = !{!29}
+!29 = !{!"0x34\00f\00f\00\008\000\001", null, !11, !"_ZTS3foo", %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 8] [def]
+!30 = !{i32 2, !"Dwarf Version", i32 4}
+!31 = !{i32 2, !"Debug Info Version", i32 2}
+!32 = !{!"clang version 3.7.0 (trunk 227104) (llvm/trunk 227103)"}
+!33 = !MDLocation(line: 8, column: 5, scope: !10)
+!34 = !{!"0x101\00this\0016777216\001088", !14, null, !35} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!35 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
+!36 = !{!"0x102"} ; [ DW_TAG_expression ]
+!37 = !MDLocation(line: 0, scope: !14)
+!38 = !MDLocation(line: 5, column: 8, scope: !14)
+!39 = !{!"0x101\00this\0016777216\001088", !24, null, !35} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!40 = !MDLocation(line: 0, scope: !24)
+!41 = !MDLocation(line: 5, column: 8, scope: !42)
+!42 = !{!"0xb\005\008\002", !1, !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/decl-derived-member.cpp]
+!43 = !MDLocation(line: 5, column: 8, scope: !24)
+!44 = !{!"0x101\00this\0016777216\001088", !19, null, !45} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!45 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS4base"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4base]
+!46 = !MDLocation(line: 0, scope: !19)
+!47 = !MDLocation(line: 1, column: 8, scope: !19)
+!48 = !MDLocation(line: 0, scope: !26)
diff --git a/test/MC/ARM/pr22395.s b/test/MC/ARM/pr22395.s
new file mode 100644
index 000000000000..5da5d964298c
--- /dev/null
+++ b/test/MC/ARM/pr22395.s
@@ -0,0 +1,63 @@
+@ RUN: llvm-mc -triple armv4t-eabi -filetype asm -o - %s 2>&1 | FileCheck %s
+
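+@ PR22395: each .fpu directive should switch the available floating-point
+@ instruction set, so code valid for the newly selected FPU must assemble
+@ without a spurious "instruction requires" diagnostic.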
+ .text
+ .thumb
+
+ .p2align 2
+
+ .fpu neon
+ vldmia r0, {d16-d31}
+
+@ CHECK: vldmia r0, {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
+@ CHECK-NOT: error: instruction requires: VFP2
+
+ .fpu vfpv3
+ vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VFP2
+
+ .fpu vfpv3-d16
+ vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VFP2
+
+ .fpu vfpv4
+ vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VFP2
+
+ .fpu vfpv4-d16
+ vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VFP2
+
+ .fpu fpv5-d16
+ vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VFP2
+
+ .fpu fp-armv8
+ vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VFP2
+
+ .fpu neon
+ vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VFP2
+
+ .fpu neon-vfpv4
+ vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VFP2
+
+ .fpu crypto-neon-fp-armv8
+ vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VFP2
+
diff --git a/test/MC/ELF/relocation-386.s b/test/MC/ELF/relocation-386.s
index ba12df0d3b63..83c524b9d07b 100644
--- a/test/MC/ELF/relocation-386.s
+++ b/test/MC/ELF/relocation-386.s
@@ -63,6 +63,8 @@
// Relocation 28 (und_symbol-bar2) is of type R_386_PC8
// CHECK-NEXT: 0xA0 R_386_PC8 und_symbol 0x0
// CHECK-NEXT: 0xA3 R_386_GOTOFF und_symbol 0x0
+// Relocation 29 (zed@PLT) is of type R_386_PLT32 and uses the symbol
+// CHECK-NEXT: 0xA9 R_386_PLT32 zed 0x0
// CHECK-NEXT: }
// CHECK-NEXT: ]
@@ -129,6 +131,7 @@ bar2:
.byte und_symbol-bar2
leal 1 + und_symbol@GOTOFF, %edi
+ movl zed@PLT(%eax), %eax
.section zedsec,"awT",@progbits
zed:
diff --git a/test/MC/MachO/AArch64/mergeable.s b/test/MC/MachO/AArch64/mergeable.s
deleted file mode 100644
index fc6ec04f37bf..000000000000
--- a/test/MC/MachO/AArch64/mergeable.s
+++ /dev/null
@@ -1,59 +0,0 @@
-// RUN: llvm-mc -triple aarch64-apple-darwin14 %s -filetype=obj -o - | llvm-readobj -r --expand-relocs | FileCheck %s
-
-// Test that we "S + K" produce a relocation with a symbol, but just S produces
-// a relocation with the section.
-
- .section __TEXT,__literal4,4byte_literals
-L0:
- .long 42
-
- .section __TEXT,__cstring,cstring_literals
-L1:
- .asciz "42"
-
- .section __DATA,__data
- .quad L0
- .quad L0 + 1
- .quad L1
- .quad L1 + 1
-
-// CHECK: Relocations [
-// CHECK-NEXT: Section __data {
-// CHECK-NEXT: Relocation {
-// CHECK-NEXT: Offset: 0x18
-// CHECK-NEXT: PCRel: 0
-// CHECK-NEXT: Length: 3
-// CHECK-NEXT: Extern: 1
-// CHECK-NEXT: Type: ARM64_RELOC_UNSIGNED (0)
-// CHECK-NEXT: Symbol: L1
-// CHECK-NEXT: Scattered: 0
-// CHECK-NEXT: }
-// CHECK-NEXT: Relocation {
-// CHECK-NEXT: Offset: 0x10
-// CHECK-NEXT: PCRel: 0
-// CHECK-NEXT: Length: 3
-// CHECK-NEXT: Extern: 1
-// CHECK-NEXT: Type: ARM64_RELOC_UNSIGNED (0)
-// CHECK-NEXT: Symbol: L1
-// CHECK-NEXT: Scattered: 0
-// CHECK-NEXT: }
-// CHECK-NEXT: Relocation {
-// CHECK-NEXT: Offset: 0x8
-// CHECK-NEXT: PCRel: 0
-// CHECK-NEXT: Length: 3
-// CHECK-NEXT: Extern: 1
-// CHECK-NEXT: Type: ARM64_RELOC_UNSIGNED (0)
-// CHECK-NEXT: Symbol: L0
-// CHECK-NEXT: Scattered: 0
-// CHECK-NEXT: }
-// CHECK-NEXT: Relocation {
-// CHECK-NEXT: Offset: 0x0
-// CHECK-NEXT: PCRel: 0
-// CHECK-NEXT: Length: 3
-// CHECK-NEXT: Extern: 0
-// CHECK-NEXT: Type: ARM64_RELOC_UNSIGNED (0)
-// CHECK-NEXT: Symbol: 0x2
-// CHECK-NEXT: Scattered: 0
-// CHECK-NEXT: }
-// CHECK-NEXT: }
-// CHECK-NEXT: ]
diff --git a/test/MC/MachO/x86_64-mergeable.s b/test/MC/MachO/x86_64-mergeable.s
deleted file mode 100644
index 972477693ed2..000000000000
--- a/test/MC/MachO/x86_64-mergeable.s
+++ /dev/null
@@ -1,59 +0,0 @@
-// RUN: llvm-mc -triple x86_64-apple-darwin14 %s -filetype=obj -o - | llvm-readobj -r --expand-relocs | FileCheck %s
-
-// Test that we "S + K" produce a relocation with a symbol, but just S produces
-// a relocation with the section.
-
- .section __TEXT,__literal4,4byte_literals
-L0:
- .long 42
-
- .section __TEXT,__cstring,cstring_literals
-L1:
- .asciz "42"
-
- .section __DATA,__data
- .quad L0
- .quad L0 + 1
- .quad L1
- .quad L1 + 1
-
-// CHECK: Relocations [
-// CHECK-NEXT: Section __data {
-// CHECK-NEXT: Relocation {
-// CHECK-NEXT: Offset: 0x18
-// CHECK-NEXT: PCRel: 0
-// CHECK-NEXT: Length: 3
-// CHECK-NEXT: Extern: 1
-// CHECK-NEXT: Type: X86_64_RELOC_UNSIGNED (0)
-// CHECK-NEXT: Symbol: L1
-// CHECK-NEXT: Scattered: 0
-// CHECK-NEXT: }
-// CHECK-NEXT: Relocation {
-// CHECK-NEXT: Offset: 0x10
-// CHECK-NEXT: PCRel: 0
-// CHECK-NEXT: Length: 3
-// CHECK-NEXT: Extern: 0
-// CHECK-NEXT: Type: X86_64_RELOC_UNSIGNED (0)
-// CHECK-NEXT: Symbol: 0x3
-// CHECK-NEXT: Scattered: 0
-// CHECK-NEXT: }
-// CHECK-NEXT: Relocation {
-// CHECK-NEXT: Offset: 0x8
-// CHECK-NEXT: PCRel: 0
-// CHECK-NEXT: Length: 3
-// CHECK-NEXT: Extern: 1
-// CHECK-NEXT: Type: X86_64_RELOC_UNSIGNED (0)
-// CHECK-NEXT: Symbol: L0
-// CHECK-NEXT: Scattered: 0
-// CHECK-NEXT: }
-// CHECK-NEXT: Relocation {
-// CHECK-NEXT: Offset: 0x0
-// CHECK-NEXT: PCRel: 0
-// CHECK-NEXT: Length: 3
-// CHECK-NEXT: Extern: 0
-// CHECK-NEXT: Type: X86_64_RELOC_UNSIGNED (0)
-// CHECK-NEXT: Symbol: 0x2
-// CHECK-NEXT: Scattered: 0
-// CHECK-NEXT: }
-// CHECK-NEXT: }
-// CHECK-NEXT: ]
diff --git a/test/MC/MachO/x86_64-symbols.s b/test/MC/MachO/x86_64-symbols.s
index f40183df853c..e2dcc440fb5f 100644
--- a/test/MC/MachO/x86_64-symbols.s
+++ b/test/MC/MachO/x86_64-symbols.s
@@ -121,12 +121,6 @@ D38:
//L39:
//D39:
- .section foo, bar
- .long L4 + 1
- .long L35 + 1
- .long L36 + 1
- .long L37 + 1
- .long L38 + 1
// CHECK: Symbols [
// CHECK-NEXT: Symbol {
diff --git a/test/SymbolRewriter/rewrite.ll b/test/SymbolRewriter/rewrite.ll
index 716fff9f284c..e8a0db6d606c 100644
--- a/test/SymbolRewriter/rewrite.ll
+++ b/test/SymbolRewriter/rewrite.ll
@@ -28,12 +28,40 @@ entry:
ret void
}
+$source_comdat_function = comdat any
+define dllexport void @source_comdat_function() comdat($source_comdat_function) {
+entry:
+ ret void
+}
+
+$source_comdat_function_1 = comdat exactmatch
+define dllexport void @source_comdat_function_1() comdat($source_comdat_function_1) {
+entry:
+ ret void
+}
+
+$source_comdat_variable = comdat largest
+@source_comdat_variable = global i32 32, comdat($source_comdat_variable)
+
+$source_comdat_variable_1 = comdat noduplicates
+@source_comdat_variable_1 = global i32 64, comdat($source_comdat_variable_1)
+
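+; The rewriter has to rename comdat groups together with the symbols that
+; reference them; the checks below cover both the $... comdat declarations
+; and their uses.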
+; CHECK: $target_comdat_function = comdat any
+; CHECK: $target_comdat_function_1 = comdat exactmatch
+; CHECK: $target_comdat_variable = comdat largest
+; CHECK: $target_comdat_variable_1 = comdat noduplicates
+
; CHECK: @target_variable = external global i32
; CHECK-NOT: @source_variable = external global i32
; CHECK: @target_pattern_variable = external global i32
; CHECK-NOT: @source_pattern_variable = external global i32
; CHECK: @target_pattern_multiple_variable_matches = external global i32
; CHECK-NOT: @source_pattern_multiple_variable_matches = external global i32
+; CHECK: @target_comdat_variable = global i32 32, comdat
+; CHECK-NOT: @source_comdat_variable = global i32 32, comdat
+; CHECK: @target_comdat_variable_1 = global i32 64, comdat
+; CHECK-NOT: @source_comdat_variable_1 = global i32 64, comdat
+
; CHECK: declare void @target_function()
; CHECK-NOT: declare void @source_function()
; CHECK: declare void @target_pattern_function()
@@ -57,3 +85,8 @@ entry:
; CHECK: ret i32 %res
; CHECK: }
+; CHECK: define dllexport void @target_comdat_function() comdat
+; CHECK-NOT: define dllexport void @source_comdat_function() comdat
+; CHECK: define dllexport void @target_comdat_function_1() comdat
+; CHECK-NOT: define dllexport void @source_comdat_function_1() comdat
+
diff --git a/test/SymbolRewriter/rewrite.map b/test/SymbolRewriter/rewrite.map
index ef6dfc8ca2b9..8094939d088d 100644
--- a/test/SymbolRewriter/rewrite.map
+++ b/test/SymbolRewriter/rewrite.map
@@ -44,3 +44,23 @@ global alias: {
target: _ZN1SD1Ev,
}
+function: {
+ source: source_comdat_function,
+ target: target_comdat_function,
+}
+
+function: {
+ source: source_comdat_function_(.*),
+ transform: target_comdat_function_\1,
+}
+
+global variable: {
+ source: source_comdat_variable,
+ target: target_comdat_variable,
+}
+
+global variable: {
+ source: source_comdat_variable_(.*),
+ transform: target_comdat_variable_\1,
+}
+
diff --git a/test/Transforms/GVN/edge.ll b/test/Transforms/GVN/edge.ll
index 1dc285ec3d7c..f28a76bb42d6 100644
--- a/test/Transforms/GVN/edge.ll
+++ b/test/Transforms/GVN/edge.ll
@@ -69,11 +69,11 @@ if:
br label %return
return:
- %retval.0 = phi double [ %div, %if ], [ %x, %entry ]
- ret double %retval.0
+ %retval = phi double [ %div, %if ], [ %x, %entry ]
+ ret double %retval
; CHECK-LABEL: define double @fcmp_oeq(
-; CHECK: %div = fdiv double %x, 2.000000e+00
+; CHECK: %div = fdiv double %x, 2.0
}
define double @fcmp_une(double %x, double %y) {
@@ -86,10 +86,46 @@ else:
br label %return
return:
- %retval.0 = phi double [ %div, %else ], [ %x, %entry ]
- ret double %retval.0
+ %retval = phi double [ %div, %else ], [ %x, %entry ]
+ ret double %retval
; CHECK-LABEL: define double @fcmp_une(
-; CHECK: %div = fdiv double %x, 2.000000e+00
+; CHECK: %div = fdiv double %x, 2.0
}
+; PR22376 - We can't propagate zero constants because -0.0
+; compares equal to 0.0. If %y is -0.0 in this test case,
+; we would produce the wrong sign on the infinity return value.
+define double @fcmp_oeq_zero(double %x, double %y) {
+entry:
+ %cmp = fcmp oeq double %y, 0.0
+ br i1 %cmp, label %if, label %return
+
+if:
+ %div = fdiv double %x, %y
+ br label %return
+
+return:
+ %retval = phi double [ %div, %if ], [ %x, %entry ]
+ ret double %retval
+
+; CHECK-LABEL: define double @fcmp_oeq_zero(
+; CHECK: %div = fdiv double %x, %y
+}
+
+define double @fcmp_une_zero(double %x, double %y) {
+entry:
+ %cmp = fcmp une double %y, -0.0
+ br i1 %cmp, label %return, label %else
+
+else:
+ %div = fdiv double %x, %y
+ br label %return
+
+return:
+ %retval = phi double [ %div, %else ], [ %x, %entry ]
+ ret double %retval
+
+; CHECK-LABEL: define double @fcmp_une_zero(
+; CHECK: %div = fdiv double %x, %y
+}
diff --git a/test/Transforms/IndVarSimplify/pr22222.ll b/test/Transforms/IndVarSimplify/pr22222.ll
new file mode 100644
index 000000000000..ccdfe538dfa5
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/pr22222.ll
@@ -0,0 +1,46 @@
+; RUN: opt -indvars -S < %s | FileCheck %s
+
+@b = common global i32 0, align 4
+@c = common global i32 0, align 4
+@a = common global i32 0, align 4
+
+declare void @abort() #1
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @main() {
+entry:
+ %a.promoted13 = load i32* @a, align 4
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.end
+ %or.lcssa14 = phi i32 [ %a.promoted13, %entry ], [ %or.lcssa, %for.end ]
+ %d.010 = phi i32 [ 1, %entry ], [ 0, %for.end ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %inc12 = phi i32 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %or11 = phi i32 [ %or.lcssa14, %for.cond1.preheader ], [ %or, %for.body3 ]
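+; %inc12 is 0 on the first iteration while %d.010 can be 1, so the
+; subtraction below can wrap in the unsigned domain and must not be
+; strengthened to 'sub nuw'.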
+; CHECK-NOT: sub nuw i32 %inc12, %d.010
+; CHECK: sub i32 %inc12, %d.010
+ %add = sub i32 %inc12, %d.010
+ %or = or i32 %or11, %add
+ %inc = add i32 %inc12, 1
+ br i1 false, label %for.body3, label %for.end
+
+for.end: ; preds = %for.body3
+ %or.lcssa = phi i32 [ %or, %for.body3 ]
+ br i1 false, label %for.cond1.preheader, label %for.end6
+
+for.end6: ; preds = %for.end
+ %or.lcssa.lcssa = phi i32 [ %or.lcssa, %for.end ]
+ store i32 %or.lcssa.lcssa, i32* @a, align 4
+ %cmp7 = icmp eq i32 %or.lcssa.lcssa, -1
+ br i1 %cmp7, label %if.end, label %if.then
+
+if.then: ; preds = %for.end6
+ tail call void @abort() #2
+ unreachable
+
+if.end: ; preds = %for.end6
+ ret i32 0
+}
diff --git a/test/Transforms/IndVarSimplify/strengthen-overflow.ll b/test/Transforms/IndVarSimplify/strengthen-overflow.ll
index 07e489e03382..2bafe96e1ccc 100644
--- a/test/Transforms/IndVarSimplify/strengthen-overflow.ll
+++ b/test/Transforms/IndVarSimplify/strengthen-overflow.ll
@@ -52,58 +52,6 @@ define i32 @test.signed.add.1(i32* %array, i32 %length, i32 %init) {
ret i32 42
}
-define i32 @test.signed.sub.0(i32* %array, i32 %length, i32 %init) {
-; CHECK-LABEL: @test.signed.sub.0
- entry:
- %upper = icmp sgt i32 %init, %length
- br i1 %upper, label %loop, label %exit
-
- loop:
-; CHECK-LABEL: loop
- %civ = phi i32 [ %init, %entry ], [ %civ.inc, %latch ]
- %civ.inc = sub i32 %civ, 1
-; CHECK: %civ.inc = sub nsw i32 %civ, 1
- %cmp = icmp slt i32 %civ.inc, %length
- br i1 %cmp, label %latch, label %break
-
- latch:
- store i32 0, i32* %array
- %check = icmp sgt i32 %civ.inc, %length
- br i1 %check, label %loop, label %break
-
- break:
- ret i32 %civ.inc
-
- exit:
- ret i32 42
-}
-
-define i32 @test.signed.sub.1(i32* %array, i32 %length, i32 %init) {
-; CHECK-LABEL: @test.signed.sub.1
- entry:
- %upper = icmp sgt i32 %init, %length
- br i1 %upper, label %loop, label %exit
-
- loop:
-; CHECK-LABEL: loop
- %civ = phi i32 [ %init, %entry ], [ %civ.inc, %latch ]
- %civ.inc = sub i32 %civ, 1
-; CHECK: %civ.inc = sub i32 %civ, 1
- %cmp = icmp slt i32 %civ.inc, %length
- br i1 %cmp, label %latch, label %break
-
- latch:
- store i32 0, i32* %array
- %check = icmp sge i32 %civ.inc, %length
- br i1 %check, label %loop, label %break
-
- break:
- ret i32 %civ.inc
-
- exit:
- ret i32 42
-}
-
define i32 @test.unsigned.add.0(i32* %array, i32 %length, i32 %init) {
; CHECK-LABEL: @test.unsigned.add.0
entry:
@@ -156,59 +104,5 @@ define i32 @test.unsigned.add.1(i32* %array, i32 %length, i32 %init) {
ret i32 42
}
-define i32 @test.unsigned.sub.0(i32* %array, i32* %length_ptr, i32 %init) {
-; CHECK-LABEL: @test.unsigned.sub.0
- entry:
- %length = load i32* %length_ptr, !range !0
- %upper = icmp ult i32 %init, %length
- br i1 %upper, label %loop, label %exit
-
- loop:
-; CHECK-LABEL: loop
- %civ = phi i32 [ %init, %entry ], [ %civ.inc, %latch ]
- %civ.inc = sub i32 %civ, 2
-; CHECK: %civ.inc = sub nuw i32 %civ, 2
- %cmp = icmp slt i32 %civ.inc, %length
- br i1 %cmp, label %latch, label %break
-
- latch:
- store i32 0, i32* %array
- %check = icmp ult i32 %civ.inc, %length
- br i1 %check, label %loop, label %break
-
- break:
- ret i32 %civ.inc
-
- exit:
- ret i32 42
-}
-
-define i32 @test.unsigned.sub.1(i32* %array, i32* %length_ptr, i32 %init) {
-; CHECK-LABEL: @test.unsigned.sub.1
- entry:
- %length = load i32* %length_ptr, !range !1
- %upper = icmp ult i32 %init, %length
- br i1 %upper, label %loop, label %exit
-
- loop:
-; CHECK-LABEL: loop
- %civ = phi i32 [ %init, %entry ], [ %civ.inc, %latch ]
- %civ.inc = sub i32 %civ, 2
-; CHECK: %civ.inc = sub i32 %civ, 2
- %cmp = icmp slt i32 %civ.inc, %length
- br i1 %cmp, label %latch, label %break
-
- latch:
- store i32 0, i32* %array
- %check = icmp ult i32 %civ.inc, %length
- br i1 %check, label %loop, label %break
-
- break:
- ret i32 %civ.inc
-
- exit:
- ret i32 42
-}
-
!0 = !{i32 0, i32 2}
!1 = !{i32 0, i32 42}
diff --git a/test/Transforms/InstCombine/call-cast-target.ll b/test/Transforms/InstCombine/call-cast-target.ll
index b82dd99db36f..4a5c94961e29 100644
--- a/test/Transforms/InstCombine/call-cast-target.ll
+++ b/test/Transforms/InstCombine/call-cast-target.ll
@@ -61,3 +61,14 @@ entry:
%call = tail call i32 bitcast (i32 (i64)* @fn3 to i32 (i32*)*)(i32* %a)
ret i32 %call
}
+
+declare i32 @fn4(i32) "thunk"
+
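+; The "thunk" attribute says @fn4 may forward its arguments to another
+; function with a different prototype, so the mismatched call below has to be
+; left as-is rather than have its signature rewritten.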
+define i32 @test4(i32* %a) {
+; CHECK-LABEL: @test4
+; CHECK: %[[call:.*]] = tail call i32 bitcast (i32 (i32)* @fn4 to i32 (i32*)*)(i32* %a)
+; CHECK-NEXT: ret i32 %[[call]]
+entry:
+ %call = tail call i32 bitcast (i32 (i32)* @fn4 to i32 (i32*)*)(i32* %a)
+ ret i32 %call
+}
diff --git a/test/Transforms/InstCombine/memcpy_chk-1.ll b/test/Transforms/InstCombine/memcpy_chk-1.ll
index 008b838201ed..ddaaf82a8e2d 100644
--- a/test/Transforms/InstCombine/memcpy_chk-1.ll
+++ b/test/Transforms/InstCombine/memcpy_chk-1.ll
@@ -15,46 +15,50 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; Check cases where dstlen >= len.
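+; These tests return the result of the _chk call so that, when the call is
+; simplified to the memcpy intrinsic, the folded return value (the
+; destination pointer) is verified as well.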
-define void @test_simplify1() {
+define i8* @test_simplify1() {
; CHECK-LABEL: @test_simplify1(
%dst = bitcast %struct.T1* @t1 to i8*
%src = bitcast %struct.T2* @t2 to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64
- call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 1824)
- ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.T1* @t1 to i8*), i8* bitcast (%struct.T2* @t2 to i8*), i64 1824, i32 4, i1 false)
+; CHECK-NEXT: ret i8* bitcast (%struct.T1* @t1 to i8*)
+ %ret = call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 1824)
+ ret i8* %ret
}
-define void @test_simplify2() {
+define i8* @test_simplify2() {
; CHECK-LABEL: @test_simplify2(
%dst = bitcast %struct.T1* @t1 to i8*
%src = bitcast %struct.T3* @t3 to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64
- call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 2848)
- ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.T1* @t1 to i8*), i8* bitcast (%struct.T3* @t3 to i8*), i64 1824, i32 4, i1 false)
+; CHECK-NEXT: ret i8* bitcast (%struct.T1* @t1 to i8*)
+ %ret = call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 2848)
+ ret i8* %ret
}
; Check cases where dstlen < len.
-define void @test_no_simplify1() {
+define i8* @test_no_simplify1() {
; CHECK-LABEL: @test_no_simplify1(
%dst = bitcast %struct.T3* @t3 to i8*
%src = bitcast %struct.T1* @t1 to i8*
-; CHECK-NEXT: call i8* @__memcpy_chk
- call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 2848, i64 1824)
- ret void
+; CHECK-NEXT: %ret = call i8* @__memcpy_chk(i8* bitcast (%struct.T3* @t3 to i8*), i8* bitcast (%struct.T1* @t1 to i8*), i64 2848, i64 1824)
+; CHECK-NEXT: ret i8* %ret
+ %ret = call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 2848, i64 1824)
+ ret i8* %ret
}
-define void @test_no_simplify2() {
+define i8* @test_no_simplify2() {
; CHECK-LABEL: @test_no_simplify2(
%dst = bitcast %struct.T1* @t1 to i8*
%src = bitcast %struct.T2* @t2 to i8*
-; CHECK-NEXT: call i8* @__memcpy_chk
- call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1024, i64 0)
- ret void
+; CHECK-NEXT: %ret = call i8* @__memcpy_chk(i8* bitcast (%struct.T1* @t1 to i8*), i8* bitcast (%struct.T2* @t2 to i8*), i64 1024, i64 0)
+; CHECK-NEXT: ret i8* %ret
+ %ret = call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1024, i64 0)
+ ret i8* %ret
}
define i8* @test_simplify_return_indcall(i8* ()* %alloc) {
diff --git a/test/Transforms/InstCombine/memmove_chk-1.ll b/test/Transforms/InstCombine/memmove_chk-1.ll
index 6d93bbbf959e..e4e1f6eedf39 100644
--- a/test/Transforms/InstCombine/memmove_chk-1.ll
+++ b/test/Transforms/InstCombine/memmove_chk-1.ll
@@ -15,46 +15,50 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; Check cases where dstlen >= len.
-define void @test_simplify1() {
+define i8* @test_simplify1() {
; CHECK-LABEL: @test_simplify1(
%dst = bitcast %struct.T1* @t1 to i8*
%src = bitcast %struct.T2* @t2 to i8*
-; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64
- call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824, i64 1824)
- ret void
+; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* bitcast (%struct.T1* @t1 to i8*), i8* bitcast (%struct.T2* @t2 to i8*), i64 1824, i32 4, i1 false)
+; CHECK-NEXT: ret i8* bitcast (%struct.T1* @t1 to i8*)
+ %ret = call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824, i64 1824)
+ ret i8* %ret
}
-define void @test_simplify2() {
+define i8* @test_simplify2() {
; CHECK-LABEL: @test_simplify2(
%dst = bitcast %struct.T1* @t1 to i8*
%src = bitcast %struct.T3* @t3 to i8*
-; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64
- call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824, i64 2848)
- ret void
+; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* bitcast (%struct.T1* @t1 to i8*), i8* bitcast (%struct.T3* @t3 to i8*), i64 1824, i32 4, i1 false)
+; CHECK-NEXT: ret i8* bitcast (%struct.T1* @t1 to i8*)
+ %ret = call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824, i64 2848)
+ ret i8* %ret
}
; Check cases where dstlen < len.
-define void @test_no_simplify1() {
+define i8* @test_no_simplify1() {
; CHECK-LABEL: @test_no_simplify1(
%dst = bitcast %struct.T3* @t3 to i8*
%src = bitcast %struct.T1* @t1 to i8*
-; CHECK-NEXT: call i8* @__memmove_chk
- call i8* @__memmove_chk(i8* %dst, i8* %src, i64 2848, i64 1824)
- ret void
+; CHECK-NEXT: %ret = call i8* @__memmove_chk(i8* bitcast (%struct.T3* @t3 to i8*), i8* bitcast (%struct.T1* @t1 to i8*), i64 2848, i64 1824)
+; CHECK-NEXT: ret i8* %ret
+ %ret = call i8* @__memmove_chk(i8* %dst, i8* %src, i64 2848, i64 1824)
+ ret i8* %ret
}
-define void @test_no_simplify2() {
+define i8* @test_no_simplify2() {
; CHECK-LABEL: @test_no_simplify2(
%dst = bitcast %struct.T1* @t1 to i8*
%src = bitcast %struct.T2* @t2 to i8*
-; CHECK-NEXT: call i8* @__memmove_chk
- call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1024, i64 0)
- ret void
+; CHECK-NEXT: %ret = call i8* @__memmove_chk(i8* bitcast (%struct.T1* @t1 to i8*), i8* bitcast (%struct.T2* @t2 to i8*), i64 1024, i64 0)
+; CHECK-NEXT: ret i8* %ret
+ %ret = call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1024, i64 0)
+ ret i8* %ret
}
declare i8* @__memmove_chk(i8*, i8*, i64, i64)
diff --git a/test/Transforms/InstCombine/memset_chk-1.ll b/test/Transforms/InstCombine/memset_chk-1.ll
index 47cc7db998e4..27f7293a6bce 100644
--- a/test/Transforms/InstCombine/memset_chk-1.ll
+++ b/test/Transforms/InstCombine/memset_chk-1.ll
@@ -11,51 +11,56 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; Check cases where dstlen >= len.
-define void @test_simplify1() {
+define i8* @test_simplify1() {
; CHECK-LABEL: @test_simplify1(
%dst = bitcast %struct.T* @t to i8*
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64
- call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 1824)
- ret void
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* bitcast (%struct.T* @t to i8*), i8 0, i64 1824, i32 4, i1 false)
+; CHECK-NEXT: ret i8* bitcast (%struct.T* @t to i8*)
+ %ret = call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 1824)
+ ret i8* %ret
}
-define void @test_simplify2() {
+define i8* @test_simplify2() {
; CHECK-LABEL: @test_simplify2(
%dst = bitcast %struct.T* @t to i8*
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64
- call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 3648)
- ret void
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* bitcast (%struct.T* @t to i8*), i8 0, i64 1824, i32 4, i1 false)
+; CHECK-NEXT: ret i8* bitcast (%struct.T* @t to i8*)
+ %ret = call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 3648)
+ ret i8* %ret
}
-define void @test_simplify3() {
+define i8* @test_simplify3() {
; CHECK-LABEL: @test_simplify3(
%dst = bitcast %struct.T* @t to i8*
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64
- call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 -1)
- ret void
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* bitcast (%struct.T* @t to i8*), i8 0, i64 1824, i32 4, i1 false)
+; CHECK-NEXT: ret i8* bitcast (%struct.T* @t to i8*)
+ %ret = call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 -1)
+ ret i8* %ret
}
; Check cases where dstlen < len.
-define void @test_no_simplify1() {
+define i8* @test_no_simplify1() {
; CHECK-LABEL: @test_no_simplify1(
%dst = bitcast %struct.T* @t to i8*
-; CHECK-NEXT: call i8* @__memset_chk
- call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 400)
- ret void
+; CHECK-NEXT: %ret = call i8* @__memset_chk(i8* bitcast (%struct.T* @t to i8*), i32 0, i64 1824, i64 400)
+; CHECK-NEXT: ret i8* %ret
+ %ret = call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 400)
+ ret i8* %ret
}
-define void @test_no_simplify2() {
+define i8* @test_no_simplify2() {
; CHECK-LABEL: @test_no_simplify2(
%dst = bitcast %struct.T* @t to i8*
-; CHECK-NEXT: call i8* @__memset_chk
- call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 0)
- ret void
+; CHECK-NEXT: %ret = call i8* @__memset_chk(i8* bitcast (%struct.T* @t to i8*), i32 0, i64 1824, i64 0)
+; CHECK-NEXT: ret i8* %ret
+ %ret = call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 0)
+ ret i8* %ret
}
declare i8* @__memset_chk(i8*, i32, i64, i64)
diff --git a/test/Transforms/InstCombine/stpcpy_chk-1.ll b/test/Transforms/InstCombine/stpcpy_chk-1.ll
index 8a02529c61ca..393c5d96d10b 100644
--- a/test/Transforms/InstCombine/stpcpy_chk-1.ll
+++ b/test/Transforms/InstCombine/stpcpy_chk-1.ll
@@ -11,46 +11,50 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
; Check cases where slen >= strlen (src).
-define void @test_simplify1() {
+define i8* @test_simplify1() {
; CHECK-LABEL: @test_simplify1(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
- call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 60)
- ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11)
+ %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 60)
+ ret i8* %ret
}
-define void @test_simplify2() {
+define i8* @test_simplify2() {
; CHECK-LABEL: @test_simplify2(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
- call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 12)
- ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11)
+ %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 12)
+ ret i8* %ret
}
-define void @test_simplify3() {
+define i8* @test_simplify3() {
; CHECK-LABEL: @test_simplify3(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
- call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 -1)
- ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11)
+ %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 -1)
+ ret i8* %ret
}
; Check cases where there are no string constants.
-define void @test_simplify4() {
+define i8* @test_simplify4() {
; CHECK-LABEL: @test_simplify4(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
-; CHECK-NEXT: call i8* @stpcpy
- call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 -1)
- ret void
+; CHECK-NEXT: %stpcpy = call i8* @stpcpy(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8]* @b, i32 0, i32 0))
+; CHECK-NEXT: ret i8* %stpcpy
+ %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 -1)
+ ret i8* %ret
}
; Check case where the string length is not constant.
@@ -60,10 +64,11 @@ define i8* @test_simplify5() {
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
-; CHECK: @__memcpy_chk
+; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i1 false)
+; CHECK-NEXT: %1 = call i8* @__memcpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 %len)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11)
%len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
%ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 %len)
-; CHECK: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11)
ret i8* %ret
}
@@ -73,8 +78,9 @@ define i8* @test_simplify6() {
; CHECK-LABEL: @test_simplify6(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
-; CHECK: [[LEN:%[a-z]+]] = call i32 @strlen
-; CHECK-NEXT: getelementptr inbounds [60 x i8]* @a, i32 0, i32 [[LEN]]
+; CHECK-NEXT: %strlen = call i32 @strlen(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0))
+; CHECK-NEXT: %1 = getelementptr inbounds [60 x i8]* @a, i32 0, i32 %strlen
+; CHECK-NEXT: ret i8* %1
%len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
%ret = call i8* @__stpcpy_chk(i8* %dst, i8* %dst, i32 %len)
ret i8* %ret
@@ -82,14 +88,15 @@ define i8* @test_simplify6() {
; Check case where slen < strlen (src).
-define void @test_no_simplify1() {
+define i8* @test_no_simplify1() {
; CHECK-LABEL: @test_no_simplify1(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
-; CHECK-NEXT: call i8* @__stpcpy_chk
- call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 8)
- ret void
+; CHECK-NEXT: %ret = call i8* @__stpcpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8]* @b, i32 0, i32 0), i32 8)
+; CHECK-NEXT: ret i8* %ret
+ %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 8)
+ ret i8* %ret
}
declare i8* @__stpcpy_chk(i8*, i8*, i32) nounwind
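The i32 11 in the folded returns above is the point of these CHECK lines: stpcpy returns a pointer to the terminating NUL, so copying the 12-byte constant @.str (11 characters plus the terminator) into @a yields element 11. A sketch of the fold's shape, assuming %dst and %src are the constant GEPs from these tests:

  %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 60)
  ; becomes a fixed-size memcpy plus a constant end pointer:
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 12, i32 1, i1 false)
  ; %ret folds to getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11)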
diff --git a/test/Transforms/InstCombine/strcpy_chk-1.ll b/test/Transforms/InstCombine/strcpy_chk-1.ll
index 8e7fec76ef5c..e3f163fd087d 100644
--- a/test/Transforms/InstCombine/strcpy_chk-1.ll
+++ b/test/Transforms/InstCombine/strcpy_chk-1.ll
@@ -11,59 +11,65 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
; Check cases where slen >= strlen (src).
-define void @test_simplify1() {
+define i8* @test_simplify1() {
; CHECK-LABEL: @test_simplify1(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
- call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 60)
- ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
+ %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 60)
+ ret i8* %ret
}
-define void @test_simplify2() {
+define i8* @test_simplify2() {
; CHECK-LABEL: @test_simplify2(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
- call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 12)
- ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
+ %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 12)
+ ret i8* %ret
}
-define void @test_simplify3() {
+define i8* @test_simplify3() {
; CHECK-LABEL: @test_simplify3(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
- call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1)
- ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
+ %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1)
+ ret i8* %ret
}
; Check cases where there are no string constants.
-define void @test_simplify4() {
+define i8* @test_simplify4() {
; CHECK-LABEL: @test_simplify4(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
-; CHECK-NEXT: call i8* @strcpy
- call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1)
- ret void
+; CHECK-NEXT: %strcpy = call i8* @strcpy(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8]* @b, i32 0, i32 0))
+; CHECK-NEXT: ret i8* %strcpy
+ %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1)
+ ret i8* %ret
}
; Check case where the string length is not constant.
-define void @test_simplify5() {
+define i8* @test_simplify5() {
; CHECK-LABEL: @test_simplify5(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
-; CHECK: @__memcpy_chk
+; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i1 false)
+; CHECK-NEXT: %1 = call i8* @__memcpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 %len)
+; CHECK-NEXT: ret i8* %1
%len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
- call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 %len)
- ret void
+ %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 %len)
+ ret i8* %ret
}
; Check case where the source and destination are the same.
@@ -72,7 +78,9 @@ define i8* @test_simplify6() {
; CHECK-LABEL: @test_simplify6(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
-; CHECK: getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
+; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i1 false)
+; CHECK-NEXT: %ret = call i8* @__strcpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i32 %len)
+; CHECK-NEXT: ret i8* %ret
%len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
%ret = call i8* @__strcpy_chk(i8* %dst, i8* %dst, i32 %len)
ret i8* %ret
@@ -80,14 +88,15 @@ define i8* @test_simplify6() {
; Check case where slen < strlen (src).
-define void @test_no_simplify1() {
+define i8* @test_no_simplify1() {
; CHECK-LABEL: @test_no_simplify1(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
-; CHECK-NEXT: call i8* @__strcpy_chk
- call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 8)
- ret void
+; CHECK-NEXT: %ret = call i8* @__strcpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8]* @b, i32 0, i32 0), i32 8)
+; CHECK-NEXT: ret i8* %ret
+ %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 8)
+ ret i8* %ret
}
declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind
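Note the contrast with the stpcpy_chk tests: strcpy returns its destination argument, so the folded call yields element 0 of @a rather than the end pointer. Under the same assumptions:

  %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 60)
  ; same memcpy, but the result is the destination itself:
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 12, i32 1, i1 false)
  ; %ret folds to getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)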
diff --git a/test/Transforms/InstCombine/strncpy_chk-1.ll b/test/Transforms/InstCombine/strncpy_chk-1.ll
index 90b4173ced77..9242a8acdbbe 100644
--- a/test/Transforms/InstCombine/strncpy_chk-1.ll
+++ b/test/Transforms/InstCombine/strncpy_chk-1.ll
@@ -11,56 +11,61 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
; Check cases where dstlen >= len
-define void @test_simplify1() {
+define i8* @test_simplify1() {
; CHECK-LABEL: @test_simplify1(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
- call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60)
- ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
+ %ret = call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60)
+ ret i8* %ret
}
-define void @test_simplify2() {
+define i8* @test_simplify2() {
; CHECK-LABEL: @test_simplify2(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
- call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 12)
- ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
+ %ret = call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 12)
+ ret i8* %ret
}
-define void @test_simplify3() {
+define i8* @test_simplify3() {
; CHECK-LABEL: @test_simplify3(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
-; CHECK-NEXT: call i8* @strncpy
- call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60)
- ret void
+; CHECK-NEXT: %strncpy = call i8* @strncpy(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8]* @b, i32 0, i32 0), i32 12)
+; CHECK-NEXT: ret i8* %strncpy
+ %ret = call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60)
+ ret i8* %ret
}
; Check cases where dstlen < len
-define void @test_no_simplify1() {
+define i8* @test_no_simplify1() {
; CHECK-LABEL: @test_no_simplify1(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
-; CHECK-NEXT: call i8* @__strncpy_chk
- call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 8, i32 4)
- ret void
+; CHECK-NEXT: %ret = call i8* @__strncpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 8, i32 4)
+; CHECK-NEXT: ret i8* %ret
+ %ret = call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 8, i32 4)
+ ret i8* %ret
}
-define void @test_no_simplify2() {
+define i8* @test_no_simplify2() {
; CHECK-LABEL: @test_no_simplify2(
%dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
%src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
-; CHECK-NEXT: call i8* @__strncpy_chk
- call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 8, i32 0)
- ret void
+; CHECK-NEXT: %ret = call i8* @__strncpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8]* @b, i32 0, i32 0), i32 8, i32 0)
+; CHECK-NEXT: ret i8* %ret
+ %ret = call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 8, i32 0)
+ ret i8* %ret
}
declare i8* @__strncpy_chk(i8*, i8*, i32, i32)
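test_simplify3 above covers the other half of the fold: with a non-constant source the copy cannot become a memcpy, but the fortification can still be stripped when dstlen >= len, leaving a plain strncpy. A sketch, again assuming the GEPs from these tests:

  %ret = call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60)
  ; dstlen (60) >= len (12), so only the check is removed:
  %strncpy = call i8* @strncpy(i8* %dst, i8* %src, i32 12)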
diff --git a/test/tools/gold/emit-llvm.ll b/test/tools/gold/emit-llvm.ll
index cfdc55108c0b..2c43147a2eab 100644
--- a/test/tools/gold/emit-llvm.ll
+++ b/test/tools/gold/emit-llvm.ll
@@ -21,6 +21,11 @@
target triple = "x86_64-unknown-linux-gnu"
+@g7 = extern_weak global i32
+; CHECK-DAG: @g7 = extern_weak global i32
+
+@g8 = external global i32
+
; CHECK: define internal void @f1()
; OPT-NOT: @f1
define hidden void @f1() {
@@ -62,6 +67,13 @@ define linkonce_odr void @f6() unnamed_addr {
}
@g6 = global void()* @f6
+define i32* @f7() {
+ ret i32* @g7
+}
+
+define i32* @f8() {
+ ret i32* @g8
+}
; API: f1 PREVAILING_DEF_IRONLY
; API: f2 PREVAILING_DEF_IRONLY
@@ -69,5 +81,9 @@ define linkonce_odr void @f6() unnamed_addr {
; API: f4 PREVAILING_DEF_IRONLY_EXP
; API: f5 PREVAILING_DEF_IRONLY_EXP
; API: f6 PREVAILING_DEF_IRONLY_EXP
+; API: f7 PREVAILING_DEF_IRONLY_EXP
+; API: f8 PREVAILING_DEF_IRONLY_EXP
+; API: g7 UNDEF
+; API: g8 UNDEF
; API: g5 PREVAILING_DEF_IRONLY_EXP
; API: g6 PREVAILING_DEF_IRONLY_EXP
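The two new globals pin down the gold-plugin.cpp change below: both are declarations from this module's point of view, so the linker legitimately resolves them to UNDEF, and the old assert(GV->hasComdat())-then-drop path must not fire for them. Roughly, the distinction the fix keys on:

@g7 = extern_weak global i32   ; declaration: isDeclarationForLinker() is true
@g8 = external global i32      ; declaration: likewise, never dropped on UNDEF
@def = global i32 0            ; hypothetical definition: UNDEF here still implies a comdat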
diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index bcc91e9d0612..5524bb9922c5 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp
@@ -633,8 +633,10 @@ getModuleForFile(LLVMContext &Context, claimed_file &F, raw_fd_ostream *ApiFile,
break;
case LDPR_UNDEF:
- assert(GV->hasComdat());
- Drop.insert(GV);
+ if (!GV->isDeclarationForLinker()) {
+ assert(GV->hasComdat());
+ Drop.insert(GV);
+ }
break;
case LDPR_PREVAILING_DEF_IRONLY: {
diff --git a/tools/llvm-c-test/CMakeLists.txt b/tools/llvm-c-test/CMakeLists.txt
index baf970a02781..f22dffb30e8a 100644
--- a/tools/llvm-c-test/CMakeLists.txt
+++ b/tools/llvm-c-test/CMakeLists.txt
@@ -41,6 +41,7 @@ add_llvm_tool(llvm-c-test
include-all.c
main.c
module.c
+ metadata.c
object.c
targets.c
)
diff --git a/tools/llvm-c-test/llvm-c-test.h b/tools/llvm-c-test/llvm-c-test.h
index 0a25aa606197..1b4976ae1423 100644
--- a/tools/llvm-c-test/llvm-c-test.h
+++ b/tools/llvm-c-test/llvm-c-test.h
@@ -27,6 +27,10 @@ int calc(void);
// disassemble.c
int disassemble(void);
+// metadata.c
+int add_named_metadata_operand(void);
+int set_metadata(void);
+
// object.c
int object_list_sections(void);
int object_list_symbols(void);
diff --git a/tools/llvm-c-test/main.c b/tools/llvm-c-test/main.c
index 72f8b0428994..59cc749fb155 100644
--- a/tools/llvm-c-test/main.c
+++ b/tools/llvm-c-test/main.c
@@ -65,6 +65,10 @@ int main(int argc, char **argv) {
return disassemble();
} else if (argc == 2 && !strcmp(argv[1], "--calc")) {
return calc();
+ } else if (argc == 2 && !strcmp(argv[1], "--add-named-metadata-operand")) {
+ return add_named_metadata_operand();
+ } else if (argc == 2 && !strcmp(argv[1], "--set-metadata")) {
+ return set_metadata();
} else {
print_usage();
}
diff --git a/tools/llvm-c-test/metadata.c b/tools/llvm-c-test/metadata.c
new file mode 100644
index 000000000000..b64a696c5281
--- /dev/null
+++ b/tools/llvm-c-test/metadata.c
@@ -0,0 +1,43 @@
+/*===-- metadata.c - tool for testing libLLVM and llvm-c API --------------===*\
+|* *|
+|* The LLVM Compiler Infrastructure *|
+|* *|
+|* This file is distributed under the University of Illinois Open Source *|
+|* License. See LICENSE.TXT for details. *|
+|* *|
+|*===----------------------------------------------------------------------===*|
+|* *|
+|* This file implements the --add-named-metadata-operand and --set-metadata *|
+|* commands in llvm-c-test. *|
+|* *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c-test.h"
+#include "llvm-c/Core.h"
+
+int add_named_metadata_operand(void) {
+ LLVMModuleRef m = LLVMModuleCreateWithName("Mod");
+ LLVMValueRef values[] = { LLVMConstInt(LLVMInt32Type(), 0, 0) };
+
+ // This used to trigger an assertion
+ LLVMAddNamedMetadataOperand(m, "name", LLVMMDNode(values, 1));
+
+ LLVMDisposeModule(m);
+
+ return 0;
+}
+
+int set_metadata(void) {
+ LLVMBuilderRef b = LLVMCreateBuilder();
+ LLVMValueRef values[] = { LLVMConstInt(LLVMInt32Type(), 0, 0) };
+
+ // This used to trigger an assertion
+ LLVMSetMetadata(
+ LLVMBuildRetVoid(b),
+ LLVMGetMDKindID("kind", 4),
+ LLVMMDNode(values, 1));
+
+ LLVMDisposeBuilder(b);
+
+ return 0;
+}
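Both commands take no input and are invoked as llvm-c-test --add-named-metadata-operand and llvm-c-test --set-metadata (see the main.c dispatch above). For reference, the module the first command builds would print roughly like this, assuming the metadata syntax of this era:

; ModuleID = 'Mod'
!name = !{!0}
!0 = !{i32 0}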
diff --git a/unittests/ADT/TripleTest.cpp b/unittests/ADT/TripleTest.cpp
index cacbde66f350..1f9aa32aeab0 100644
--- a/unittests/ADT/TripleTest.cpp
+++ b/unittests/ADT/TripleTest.cpp
@@ -665,3 +665,20 @@ TEST(TripleTest, getARMCPUForArch) {
}
}
}
+
+TEST(TripleTest, NormalizeARM) {
+ EXPECT_EQ("armv6--netbsd-eabi", Triple::normalize("armv6-netbsd-eabi"));
+ EXPECT_EQ("armv7--netbsd-eabi", Triple::normalize("armv7-netbsd-eabi"));
+ EXPECT_EQ("armv6eb--netbsd-eabi", Triple::normalize("armv6eb-netbsd-eabi"));
+ EXPECT_EQ("armv7eb--netbsd-eabi", Triple::normalize("armv7eb-netbsd-eabi"));
+ EXPECT_EQ("armv6--netbsd-eabihf", Triple::normalize("armv6-netbsd-eabihf"));
+ EXPECT_EQ("armv7--netbsd-eabihf", Triple::normalize("armv7-netbsd-eabihf"));
+ EXPECT_EQ("armv6eb--netbsd-eabihf", Triple::normalize("armv6eb-netbsd-eabihf"));
+ EXPECT_EQ("armv7eb--netbsd-eabihf", Triple::normalize("armv7eb-netbsd-eabihf"));
+
+ Triple T;
+ T = Triple("armv6--netbsd-eabi");
+ EXPECT_EQ(Triple::arm, T.getArch());
+ T = Triple("armv6eb--netbsd-eabi");
+ EXPECT_EQ(Triple::armeb, T.getArch());
+}
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp
index 62967bdd3270..f2a3000906e3 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp
@@ -359,13 +359,10 @@ TEST_F(MCJITCAPITest, gva) {
buildMCJITEngine();
buildAndRunPasses();
- union {
- uint64_t raw;
- int32_t *usable;
- } valuePointer;
- valuePointer.raw = LLVMGetGlobalValueAddress(Engine, "simple_value");
+ uint64_t raw = LLVMGetGlobalValueAddress(Engine, "simple_value");
+ int32_t *usable = (int32_t *) raw;
- EXPECT_EQ(42, *valuePointer.usable);
+ EXPECT_EQ(42, *usable);
}
TEST_F(MCJITCAPITest, gfa) {
@@ -376,13 +373,10 @@ TEST_F(MCJITCAPITest, gfa) {
buildMCJITEngine();
buildAndRunPasses();
- union {
- uint64_t raw;
- int (*usable)();
- } functionPointer;
- functionPointer.raw = LLVMGetFunctionAddress(Engine, "simple_function");
+ uint64_t raw = LLVMGetFunctionAddress(Engine, "simple_function");
+ int (*usable)() = (int (*)()) raw;
- EXPECT_EQ(42, functionPointer.usable());
+ EXPECT_EQ(42, usable());
}
TEST_F(MCJITCAPITest, custom_memory_manager) {