aboutsummaryrefslogtreecommitdiff
path: root/lib/Target/ARM
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/ARM')
-rw-r--r--lib/Target/ARM/A15SDOptimizer.cpp74
-rw-r--r--lib/Target/ARM/ARM.h4
-rw-r--r--lib/Target/ARM/ARM.td218
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.cpp134
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.h1
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.cpp1309
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.h131
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.cpp41
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.h12
-rw-r--r--lib/Target/ARM/ARMCallingConv.h2
-rw-r--r--lib/Target/ARM/ARMCallingConv.td52
-rw-r--r--lib/Target/ARM/ARMConstantIslandPass.cpp160
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.cpp25
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.h11
-rw-r--r--lib/Target/ARM/ARMExpandPseudoInsts.cpp283
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp143
-rw-r--r--lib/Target/ARM/ARMFrameLowering.cpp162
-rw-r--r--lib/Target/ARM/ARMFrameLowering.h2
-rw-r--r--lib/Target/ARM/ARMHazardRecognizer.cpp3
-rw-r--r--lib/Target/ARM/ARMISelDAGToDAG.cpp1043
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp1484
-rw-r--r--lib/Target/ARM/ARMISelLowering.h99
-rw-r--r--lib/Target/ARM/ARMInstrFormats.td212
-rw-r--r--lib/Target/ARM/ARMInstrInfo.cpp22
-rw-r--r--lib/Target/ARM/ARMInstrInfo.h3
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td355
-rw-r--r--lib/Target/ARM/ARMInstrThumb.td157
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td346
-rw-r--r--lib/Target/ARM/ARMInstrVFP.td430
-rw-r--r--lib/Target/ARM/ARMLoadStoreOptimizer.cpp265
-rw-r--r--lib/Target/ARM/ARMMCInstLower.cpp43
-rw-r--r--lib/Target/ARM/ARMMachineFunctionInfo.cpp2
-rw-r--r--lib/Target/ARM/ARMMachineFunctionInfo.h1
-rw-r--r--lib/Target/ARM/ARMOptimizeBarriersPass.cpp8
-rw-r--r--lib/Target/ARM/ARMSchedule.td14
-rw-r--r--lib/Target/ARM/ARMScheduleA8.td2
-rw-r--r--lib/Target/ARM/ARMScheduleA9.td4
-rw-r--r--lib/Target/ARM/ARMScheduleSwift.td2
-rw-r--r--lib/Target/ARM/ARMSelectionDAGInfo.cpp58
-rw-r--r--lib/Target/ARM/ARMSelectionDAGInfo.h42
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp157
-rw-r--r--lib/Target/ARM/ARMSubtarget.h273
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp54
-rw-r--r--lib/Target/ARM/ARMTargetMachine.h31
-rw-r--r--lib/Target/ARM/ARMTargetObjectFile.h8
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.cpp42
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.h17
-rw-r--r--lib/Target/ARM/AsmParser/ARMAsmParser.cpp309
-rw-r--r--lib/Target/ARM/AsmParser/Makefile15
-rw-r--r--lib/Target/ARM/Disassembler/ARMDisassembler.cpp108
-rw-r--r--lib/Target/ARM/Disassembler/Makefile16
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp120
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.h9
-rw-r--r--lib/Target/ARM/InstPrinter/Makefile15
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h51
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp83
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h11
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h8
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h3
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h1
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h14
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp19
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp6
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h10
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp3
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp120
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp16
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp2
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp9
-rw-r--r--lib/Target/ARM/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/ARM/MLxExpansionPass.cpp6
-rw-r--r--lib/Target/ARM/Makefile24
-rw-r--r--lib/Target/ARM/README.txt18
-rw-r--r--lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp1
-rw-r--r--lib/Target/ARM/TargetInfo/Makefile15
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.cpp56
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.h2
-rw-r--r--lib/Target/ARM/Thumb1InstrInfo.cpp19
-rw-r--r--lib/Target/ARM/Thumb1InstrInfo.h8
-rw-r--r--lib/Target/ARM/Thumb2ITBlockPass.cpp11
-rw-r--r--lib/Target/ARM/Thumb2InstrInfo.cpp55
-rw-r--r--lib/Target/ARM/Thumb2InstrInfo.h12
-rw-r--r--lib/Target/ARM/Thumb2SizeReduction.cpp68
-rw-r--r--lib/Target/ARM/ThumbRegisterInfo.cpp120
-rw-r--r--lib/Target/ARM/ThumbRegisterInfo.h5
85 files changed, 6142 insertions, 3183 deletions
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index 7a1865ce5fd6..9228cc2d7a9c 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -68,34 +68,31 @@ namespace {
//
unsigned createDupLane(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned Reg, unsigned Lane,
- bool QPR=false);
+ const DebugLoc &DL, unsigned Reg, unsigned Lane,
+ bool QPR = false);
unsigned createExtractSubreg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned DReg, unsigned Lane,
- const TargetRegisterClass *TRC);
+ const DebugLoc &DL, unsigned DReg,
+ unsigned Lane, const TargetRegisterClass *TRC);
unsigned createVExt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned Ssub0, unsigned Ssub1);
+ const DebugLoc &DL, unsigned Ssub0, unsigned Ssub1);
unsigned createRegSequence(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned Reg1, unsigned Reg2);
+ const DebugLoc &DL, unsigned Reg1,
+ unsigned Reg2);
unsigned createInsertSubreg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL, unsigned DReg, unsigned Lane,
- unsigned ToInsert);
+ const DebugLoc &DL, unsigned DReg,
+ unsigned Lane, unsigned ToInsert);
unsigned createImplicitDef(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL);
+ const DebugLoc &DL);
//
// Various property checkers
@@ -426,11 +423,10 @@ SmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) {
}
// Creates a DPR register from an SPR one by using a VDUP.
-unsigned
-A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned Reg, unsigned Lane, bool QPR) {
+unsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned Reg,
+ unsigned Lane, bool QPR) {
unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass :
&ARM::DPRRegClass);
AddDefaultPred(BuildMI(MBB,
@@ -445,12 +441,10 @@ A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
}
// Creates a SPR register from a DPR by copying the value in lane 0.
-unsigned
-A15SDOptimizer::createExtractSubreg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned DReg, unsigned Lane,
- const TargetRegisterClass *TRC) {
+unsigned A15SDOptimizer::createExtractSubreg(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned DReg, unsigned Lane,
+ const TargetRegisterClass *TRC) {
unsigned Out = MRI->createVirtualRegister(TRC);
BuildMI(MBB,
InsertBefore,
@@ -462,11 +456,9 @@ A15SDOptimizer::createExtractSubreg(MachineBasicBlock &MBB,
}
// Takes two SPR registers and creates a DPR by using a REG_SEQUENCE.
-unsigned
-A15SDOptimizer::createRegSequence(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned Reg1, unsigned Reg2) {
+unsigned A15SDOptimizer::createRegSequence(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned Reg1, unsigned Reg2) {
unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass);
BuildMI(MBB,
InsertBefore,
@@ -481,11 +473,10 @@ A15SDOptimizer::createRegSequence(MachineBasicBlock &MBB,
// Takes two DPR registers that have previously been VDUPed (Ssub0 and Ssub1)
// and merges them into one DPR register.
-unsigned
-A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned Ssub0, unsigned Ssub1) {
+unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned Ssub0,
+ unsigned Ssub1) {
unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
AddDefaultPred(BuildMI(MBB,
InsertBefore,
@@ -497,11 +488,9 @@ A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
return Out;
}
-unsigned
-A15SDOptimizer::createInsertSubreg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL, unsigned DReg, unsigned Lane,
- unsigned ToInsert) {
+unsigned A15SDOptimizer::createInsertSubreg(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned DReg, unsigned Lane, unsigned ToInsert) {
unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass);
BuildMI(MBB,
InsertBefore,
@@ -517,7 +506,7 @@ A15SDOptimizer::createInsertSubreg(MachineBasicBlock &MBB,
unsigned
A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL) {
+ const DebugLoc &DL) {
unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
BuildMI(MBB,
InsertBefore,
@@ -681,6 +670,9 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
}
bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>();
// Since the A15SDOptimizer pass can insert VDUP instructions, it can only be
// enabled when NEON is available.
@@ -701,7 +693,7 @@ bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
for (MachineBasicBlock::iterator MI = MFI->begin(), ME = MFI->end();
MI != ME;) {
- Modified |= runOnInstruction(MI++);
+ Modified |= runOnInstruction(&*MI++);
}
}
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index cd7540e52410..690ff86a0c86 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -27,6 +27,7 @@ class FunctionPass;
class ImmutablePass;
class MachineInstr;
class MCInst;
+class PassRegistry;
class TargetLowering;
class TargetMachine;
@@ -45,6 +46,9 @@ FunctionPass *createThumb2SizeReductionPass(
void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
ARMAsmPrinter &AP);
+void initializeARMLoadStoreOptPass(PassRegistry &);
+void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
+
} // end namespace llvm;
#endif
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index c171656b48ab..ef626b66a1e7 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -76,6 +76,11 @@ def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true",
"Enable Thumb2 extract and pack instructions">;
def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true",
"Has data barrier (dmb / dsb) instructions">;
+def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true",
+ "Has v7 clrex instruction">;
+def FeatureAcquireRelease : SubtargetFeature<"acquire-release",
+ "HasAcquireRelease", "true",
+ "Has v8 acquire/release (lda/ldaex etc) instructions">;
def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
"FP compare + branch is slow">;
def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
@@ -84,17 +89,98 @@ def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
"Enable support for Performance Monitor extensions">;
def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true",
"Enable support for TrustZone security extensions">;
+def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true",
+ "Enable support for ARMv8-M Security Extensions">;
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
"Enable support for Cryptography extensions",
[FeatureNEON]>;
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable support for CRC instructions">;
+// Not to be confused with FeatureHasRetAddrStack (return address stack)
+def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
+ "Enable Reliability, Availability and Serviceability extensions">;
+
// Cyclone has preferred instructions for zeroing VFP registers, which can
// execute in 0 cycles.
def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
"Has zero-cycle zeroing instructions">;
+// Whether or not it may be profitable to unpredicate certain instructions
+// during if conversion.
+def FeatureProfUnpredicate : SubtargetFeature<"prof-unpr",
+ "IsProfitableToUnpredicate",
+ "true",
+ "Is profitable to unpredicate">;
+
+// Some targets (e.g. Swift) have microcoded VGETLNi32.
+def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32",
+ "HasSlowVGETLNi32", "true",
+ "Has slow VGETLNi32 - prefer VMOV">;
+
+// Some targets (e.g. Swift) have microcoded VDUP32.
+def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", "true",
+ "Has slow VDUP32 - prefer VMOV">;
+
+// Some targets (e.g. Cortex-A9) prefer VMOVSR to VMOVDRR even when using NEON
+// for scalar FP, as this allows more effective execution domain optimization.
+def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR",
+ "true", "Prefer VMOVSR">;
+
+// Swift has ISHST barriers compatible with Atomic Release semantics but weaker
+// than ISH
+def FeaturePrefISHSTBarrier : SubtargetFeature<"prefer-ishst", "PreferISHST",
+ "true", "Prefer ISHST barriers">;
+
+// Some targets (e.g. Cortex-A9) have muxed AGU and NEON/FPU.
+def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", "true",
+ "Has muxed AGU and NEON/FPU">;
+
+// On some targets, a VLDM/VSTM starting with an odd register number needs more
+// microops than single VLDRS.
+def FeatureSlowOddRegister : SubtargetFeature<"slow-odd-reg", "SlowOddRegister",
+ "true", "VLDM/VSTM starting with an odd register is slow">;
+
+// Some targets have a renaming dependency when loading into D subregisters.
+def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg",
+ "SlowLoadDSubregister", "true",
+ "Loading into D subregs is slow">;
+// Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD.
+def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs",
+ "DontWidenVMOVS", "true",
+ "Don't widen VMOVS to VMOVD">;
+
+// Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions.
+def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", "ExpandMLx", "true",
+ "Expand VFP/NEON MLA/MLS instructions">;
+
+// Some targets have special RAW hazards for VFP/NEON VMLA/VMLS.
+def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards",
+ "true", "Has VMLx hazards">;
+
+// Some targets (e.g. Cortex-A9) want to convert VMOVRS, VMOVSR and VMOVS from
+// VFP to NEON, as an execution domain optimization.
+def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", "UseNEONForFPMovs",
+ "true", "Convert VMOVSR, VMOVRS, VMOVS to NEON">;
+
+// Some processors benefit from using NEON instructions for scalar
+// single-precision FP operations. This affects instruction selection and should
+// only be enabled if the handling of denormals is not important.
+def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
+ "true",
+ "Use NEON for single precision FP">;
+
+// On some processors, VLDn instructions that access unaligned data take one
+// extra cycle. Take that into account when computing operand latencies.
+def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAlign",
+ "true",
+ "Check for VLDn unaligned access">;
+
+// Some processors have a nonpipelined VFP coprocessor.
+def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp",
+ "NonpipelinedVFP", "true",
+ "VFP instructions are not pipelined">;
+
// Some processors have FP multiply-accumulate instructions that don't
// play nicely with other VFP / NEON instructions, and it's generally better
// to just not use them.
@@ -106,12 +192,6 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
"HasVMLxForwarding", "true",
"Has multiplier accumulator forwarding">;
-// Some processors benefit from using NEON instructions for scalar
-// single-precision FP operations.
-def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
- "true",
- "Use NEON for single precision FP">;
-
// Disable 32-bit to 16-bit narrowing for experimentation.
def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
"Prefer 32-bit Thumb instrs">;
@@ -130,7 +210,7 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
// Some processors perform return stack prediction. CodeGen should avoid issue
// "normal" call instructions to callees which do not return.
-def FeatureHasRAS : SubtargetFeature<"ras", "HasRAS", "true",
+def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true",
"Has return address stack">;
/// DSP extension.
@@ -200,24 +280,31 @@ def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true",
def HasV6MOps : SubtargetFeature<"v6m", "HasV6MOps", "true",
"Support ARM v6M instructions",
[HasV6Ops]>;
+def HasV8MBaselineOps : SubtargetFeature<"v8m", "HasV8MBaselineOps", "true",
+ "Support ARM v8M Baseline instructions",
+ [HasV6MOps]>;
def HasV6KOps : SubtargetFeature<"v6k", "HasV6KOps", "true",
"Support ARM v6k instructions",
[HasV6Ops]>;
def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
"Support ARM v6t2 instructions",
- [HasV6MOps, HasV6KOps, FeatureThumb2]>;
+ [HasV8MBaselineOps, HasV6KOps, FeatureThumb2]>;
def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true",
"Support ARM v7 instructions",
- [HasV6T2Ops, FeaturePerfMon]>;
+ [HasV6T2Ops, FeaturePerfMon,
+ FeatureV7Clrex]>;
def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true",
"Support ARM v8 instructions",
- [HasV7Ops]>;
+ [HasV7Ops, FeatureAcquireRelease]>;
def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
"Support ARM v8.1a instructions",
[HasV8Ops]>;
def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
"Support ARM v8.2a instructions",
[HasV8_1aOps]>;
+def HasV8MMainlineOps : SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true",
+ "Support ARM v8M Mainline instructions",
+ [HasV7Ops]>;
//===----------------------------------------------------------------------===//
@@ -238,6 +325,8 @@ def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
"Cortex-A15 ARM processors", []>;
def ProcA17 : SubtargetFeature<"a17", "ARMProcFamily", "CortexA17",
"Cortex-A17 ARM processors", []>;
+def ProcA32 : SubtargetFeature<"a32", "ARMProcFamily", "CortexA32",
+ "Cortex-A32 ARM processors", []>;
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
"Cortex-A35 ARM processors", []>;
def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
@@ -246,6 +335,8 @@ def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
"Cortex-A57 ARM processors", []>;
def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
"Cortex-A72 ARM processors", []>;
+def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
+ "Cortex-A73 ARM processors", []>;
def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
"Qualcomm ARM processors", []>;
@@ -256,12 +347,14 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
"Samsung Exynos-M1 processors", []>;
def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4",
- "Cortex-R4 ARM processors", []>;
+ "Cortex-R4 ARM processors", []>;
def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5",
"Cortex-R5 ARM processors", []>;
def ProcR7 : SubtargetFeature<"r7", "ARMProcFamily", "CortexR7",
"Cortex-R7 ARM processors", []>;
+def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
+ "Cortex-M3 ARM processors", []>;
//===----------------------------------------------------------------------===//
// ARM schedules.
@@ -374,7 +467,27 @@ def ARMv82a : Architecture<"armv8.2-a", "ARMv82a", [HasV8_2aOps,
FeatureMP,
FeatureVirtualization,
FeatureCrypto,
- FeatureCRC]>;
+ FeatureCRC,
+ FeatureRAS]>;
+
+def ARMv8mBaseline : Architecture<"armv8-m.base", "ARMv8mBaseline",
+ [HasV8MBaselineOps,
+ FeatureNoARM,
+ FeatureDB,
+ FeatureHWDiv,
+ FeatureV7Clrex,
+ Feature8MSecExt,
+ FeatureAcquireRelease,
+ FeatureMClass]>;
+
+def ARMv8mMainline : Architecture<"armv8-m.main", "ARMv8mMainline",
+ [HasV8MMainlineOps,
+ FeatureNoARM,
+ FeatureDB,
+ FeatureHWDiv,
+ Feature8MSecExt,
+ FeatureAcquireRelease,
+ FeatureMClass]>;
// Aliases
def IWMMXT : Architecture<"iwmmxt", "ARMv5te", [ARMv5te]>;
@@ -452,7 +565,7 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2,
// FIXME: A5 has currently the same Schedule model as A8
def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureTrustZone,
FeatureSlowFPBrcc,
FeatureHasSlowFPVMLx,
@@ -462,9 +575,10 @@ def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5,
FeatureVFP4]>;
def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureTrustZone,
FeatureSlowFPBrcc,
+ FeatureHasVMLxHazards,
FeatureHasSlowFPVMLx,
FeatureVMLxForwarding,
FeatureT2XtPk,
@@ -475,25 +589,33 @@ def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7,
FeatureVirtualization]>;
def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
+ FeatureNonpipelinedVFP,
FeatureTrustZone,
FeatureSlowFPBrcc,
+ FeatureHasVMLxHazards,
FeatureHasSlowFPVMLx,
FeatureVMLxForwarding,
FeatureT2XtPk]>;
def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureTrustZone,
+ FeatureHasVMLxHazards,
FeatureVMLxForwarding,
FeatureT2XtPk,
FeatureFP16,
FeatureAvoidPartialCPSR,
+ FeatureExpandMLx,
+ FeaturePreferVMOVSR,
+ FeatureMuxedUnits,
+ FeatureNEONForFPMovs,
+ FeatureCheckVLDnAlign,
FeatureMP]>;
// FIXME: A12 has currently the same Schedule model as A9
def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureTrustZone,
FeatureVMLxForwarding,
FeatureT2XtPk,
@@ -506,11 +628,14 @@ def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12,
// FIXME: A15 has currently the same Schedule model as A9.
def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15,
- FeatureHasRAS,
+ FeatureDontWidenVMOVS,
+ FeatureHasRetAddrStack,
+ FeatureMuxedUnits,
FeatureTrustZone,
FeatureT2XtPk,
FeatureVFP4,
FeatureMP,
+ FeatureCheckVLDnAlign,
FeatureHWDiv,
FeatureHWDivARM,
FeatureAvoidPartialCPSR,
@@ -518,7 +643,7 @@ def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15,
// FIXME: A17 has currently the same Schedule model as A9
def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureTrustZone,
FeatureMP,
FeatureVMLxForwarding,
@@ -533,7 +658,9 @@ def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17,
// FIXME: krait has currently the same features as A9 plus VFP4 and hardware
// division features.
def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
+ FeatureMuxedUnits,
+ FeatureCheckVLDnAlign,
FeatureVMLxForwarding,
FeatureT2XtPk,
FeatureFP16,
@@ -543,7 +670,7 @@ def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait,
FeatureHWDivARM]>;
def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureNEONForFP,
FeatureT2XtPk,
FeatureVFP4,
@@ -552,17 +679,24 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
FeatureHWDivARM,
FeatureAvoidPartialCPSR,
FeatureAvoidMOVsShOp,
- FeatureHasSlowFPVMLx]>;
+ FeatureHasSlowFPVMLx,
+ FeatureHasVMLxHazards,
+ FeatureProfUnpredicate,
+ FeaturePrefISHSTBarrier,
+ FeatureSlowOddRegister,
+ FeatureSlowLoadDSubreg,
+ FeatureSlowVGETLNi32,
+ FeatureSlowVDUP32]>;
// FIXME: R4 has currently the same ProcessorModel as A8.
def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureAvoidPartialCPSR,
FeatureT2XtPk]>;
// FIXME: R4F has currently the same ProcessorModel as A8.
def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureSlowFPBrcc,
FeatureHasSlowFPVMLx,
FeatureVFP3,
@@ -572,7 +706,7 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4,
// FIXME: R5 has currently the same ProcessorModel as A8.
def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureVFP3,
FeatureD16,
FeatureSlowFPBrcc,
@@ -583,9 +717,20 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5,
// FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5.
def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
+ FeatureVFP3,
+ FeatureD16,
+ FeatureFP16,
+ FeatureMP,
+ FeatureSlowFPBrcc,
+ FeatureHWDivARM,
+ FeatureHasSlowFPVMLx,
+ FeatureAvoidPartialCPSR,
+ FeatureT2XtPk]>;
+
+def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
+ FeatureHasRetAddrStack,
FeatureVFP3,
- FeatureVFPOnlySP,
FeatureD16,
FeatureFP16,
FeatureMP,
@@ -595,8 +740,8 @@ def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
FeatureAvoidPartialCPSR,
FeatureT2XtPk]>;
-def : ProcNoItin<"cortex-m3", [ARMv7m]>;
-def : ProcNoItin<"sc300", [ARMv7m]>;
+def : ProcNoItin<"cortex-m3", [ARMv7m, ProcM3]>;
+def : ProcNoItin<"sc300", [ARMv7m, ProcM3]>;
def : ProcNoItin<"cortex-m4", [ARMv7em,
FeatureVFP4,
@@ -607,6 +752,12 @@ def : ProcNoItin<"cortex-m7", [ARMv7em,
FeatureFPARMv8,
FeatureD16]>;
+def : ProcNoItin<"cortex-a32", [ARMv8a,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureT2XtPk,
+ FeatureCrypto,
+ FeatureCRC]>;
def : ProcNoItin<"cortex-a35", [ARMv8a, ProcA35,
FeatureHWDiv,
@@ -636,9 +787,16 @@ def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72,
FeatureCrypto,
FeatureCRC]>;
+def : ProcNoItin<"cortex-a73", [ARMv8a, ProcA73,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureT2XtPk,
+ FeatureCrypto,
+ FeatureCRC]>;
+
// Cyclone is very similar to swift
def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureNEONForFP,
FeatureT2XtPk,
FeatureVFP4,
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 206db9619a2f..04863a7ecf8f 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -43,12 +43,11 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ARMBuildAttributes.h"
-#include "llvm/Support/TargetParser.h"
#include "llvm/Support/COFF.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -213,8 +212,6 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
GetARMGVSymbol(GV, TF)->print(O, MAI);
printOffset(MO.getOffset(), O);
- if (TF == ARMII::MO_PLT)
- O << "(PLT)";
break;
}
case MachineOperand::MO_ConstantPoolIndex:
@@ -516,9 +513,10 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer->AddBlankLine();
}
- Stubs = MMIMacho.GetHiddenGVStubList();
+ Stubs = MMIMacho.GetThreadLocalGVStubList();
if (!Stubs.empty()) {
- OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
+ // Switch with ".non_lazy_symbol_pointer" directive.
+ OutStreamer->SwitchSection(TLOFMacho.getThreadLocalPointerSection());
EmitAlignment(2);
for (auto &Stub : Stubs)
@@ -536,18 +534,48 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
}
+ if (TT.isOSBinFormatCOFF()) {
+ const auto &TLOF =
+ static_cast<const TargetLoweringObjectFileCOFF &>(getObjFileLowering());
+
+ std::string Flags;
+ raw_string_ostream OS(Flags);
+
+ for (const auto &Function : M)
+ TLOF.emitLinkerFlagsForGlobal(OS, &Function, *Mang);
+ for (const auto &Global : M.globals())
+ TLOF.emitLinkerFlagsForGlobal(OS, &Global, *Mang);
+ for (const auto &Alias : M.aliases())
+ TLOF.emitLinkerFlagsForGlobal(OS, &Alias, *Mang);
+
+ OS.flush();
+
+ // Output collected flags
+ if (!Flags.empty()) {
+ OutStreamer->SwitchSection(TLOF.getDrectveSection());
+ OutStreamer->EmitBytes(Flags);
+ }
+ }
+
// The last attribute to be emitted is ABI_optimization_goals
MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
if (OptimizationGoals > 0 &&
- (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI()))
+ (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
+ Subtarget->isTargetMuslAEABI()))
ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals, OptimizationGoals);
OptimizationGoals = -1;
ATS.finishAttributeSection();
}
+static bool isV8M(const ARMSubtarget *Subtarget) {
+ // Note that v8M Baseline is a subset of v6T2!
+ return (Subtarget->hasV8MBaselineOps() && !Subtarget->hasV6T2Ops()) ||
+ Subtarget->hasV8MMainlineOps();
+}
+
//===----------------------------------------------------------------------===//
// Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile()
// FIXME:
@@ -561,13 +589,17 @@ static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU,
return ARMBuildAttrs::v5TEJ;
if (Subtarget->hasV8Ops())
- return ARMBuildAttrs::v8;
+ return ARMBuildAttrs::v8_A;
+ else if (Subtarget->hasV8MMainlineOps())
+ return ARMBuildAttrs::v8_M_Main;
else if (Subtarget->hasV7Ops()) {
if (Subtarget->isMClass() && Subtarget->hasDSP())
return ARMBuildAttrs::v7E_M;
return ARMBuildAttrs::v7;
} else if (Subtarget->hasV6T2Ops())
return ARMBuildAttrs::v6T2;
+ else if (Subtarget->hasV8MBaselineOps())
+ return ARMBuildAttrs::v8_M_Base;
else if (Subtarget->hasV6MOps())
return ARMBuildAttrs::v6S_M;
else if (Subtarget->hasV6Ops())
@@ -609,9 +641,9 @@ void ARMAsmPrinter::emitAttributes() {
static_cast<const ARMBaseTargetMachine &>(TM);
const ARMSubtarget STI(TT, CPU, ArchFS, ATM, ATM.isLittleEndian());
- std::string CPUString = STI.getCPUString();
+ const std::string &CPUString = STI.getCPUString();
- if (CPUString.find("generic") != 0) { //CPUString doesn't start with "generic"
+ if (!StringRef(CPUString).startswith("generic")) {
// FIXME: remove krait check when GNU tools support krait cpu
if (STI.isKrait()) {
ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a9");
@@ -627,7 +659,7 @@ void ARMAsmPrinter::emitAttributes() {
// Tag_CPU_arch_profile must have the default value of 0 when "Architecture
// profile is not applicable (e.g. pre v7, or cross-profile code)".
- if (STI.hasV7Ops()) {
+ if (STI.hasV7Ops() || isV8M(&STI)) {
if (STI.isAClass()) {
ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
ARMBuildAttrs::ApplicationProfile);
@@ -643,7 +675,10 @@ void ARMAsmPrinter::emitAttributes() {
ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use,
STI.hasARMOps() ? ARMBuildAttrs::Allowed
: ARMBuildAttrs::Not_Allowed);
- if (STI.isThumb1Only()) {
+ if (isV8M(&STI)) {
+ ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+ ARMBuildAttrs::AllowThumbDerived);
+ } else if (STI.isThumb1Only()) {
ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, ARMBuildAttrs::Allowed);
} else if (STI.hasThumb2()) {
ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
@@ -690,7 +725,7 @@ void ARMAsmPrinter::emitAttributes() {
ATS.emitFPU(ARM::FK_VFPV2);
}
- if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (isPositionIndependent()) {
// PIC specific attributes.
ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RW_data,
ARMBuildAttrs::AddressRWPCRel);
@@ -794,6 +829,9 @@ void ARMAsmPrinter::emitAttributes() {
if (STI.hasDivideInARMMode() && !STI.hasV8Ops())
ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt);
+ if (STI.hasDSP() && isV8M(&STI))
+ ATS.emitAttribute(ARMBuildAttrs::DSP_extension, ARMBuildAttrs::Allowed);
+
if (MMI) {
if (const Module *SourceModule = MMI->getModule()) {
// ABI_PCS_wchar_t to indicate wchar_t width
@@ -853,11 +891,18 @@ static MCSymbol *getPICLabel(const char *Prefix, unsigned FunctionNumber,
static MCSymbolRefExpr::VariantKind
getModifierVariantKind(ARMCP::ARMCPModifier Modifier) {
switch (Modifier) {
- case ARMCP::no_modifier: return MCSymbolRefExpr::VK_None;
- case ARMCP::TLSGD: return MCSymbolRefExpr::VK_TLSGD;
- case ARMCP::TPOFF: return MCSymbolRefExpr::VK_TPOFF;
- case ARMCP::GOTTPOFF: return MCSymbolRefExpr::VK_GOTTPOFF;
- case ARMCP::GOT_PREL: return MCSymbolRefExpr::VK_ARM_GOT_PREL;
+ case ARMCP::no_modifier:
+ return MCSymbolRefExpr::VK_None;
+ case ARMCP::TLSGD:
+ return MCSymbolRefExpr::VK_TLSGD;
+ case ARMCP::TPOFF:
+ return MCSymbolRefExpr::VK_TPOFF;
+ case ARMCP::GOTTPOFF:
+ return MCSymbolRefExpr::VK_GOTTPOFF;
+ case ARMCP::GOT_PREL:
+ return MCSymbolRefExpr::VK_ARM_GOT_PREL;
+ case ARMCP::SECREL:
+ return MCSymbolRefExpr::VK_SECREL;
}
llvm_unreachable("Invalid ARMCPModifier!");
}
@@ -865,8 +910,8 @@ getModifierVariantKind(ARMCP::ARMCPModifier Modifier) {
MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
unsigned char TargetFlags) {
if (Subtarget->isTargetMachO()) {
- bool IsIndirect = (TargetFlags & ARMII::MO_NONLAZY) &&
- Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
+ bool IsIndirect =
+ (TargetFlags & ARMII::MO_NONLAZY) && Subtarget->isGVIndirectSymbol(GV);
if (!IsIndirect)
return getSymbol(GV);
@@ -876,8 +921,9 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
MachineModuleInfoMachO &MMIMachO =
MMI->getObjFileInfo<MachineModuleInfoMachO>();
MachineModuleInfoImpl::StubValueTy &StubSym =
- GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym)
- : MMIMachO.getGVStubEntry(MCSym);
+ GV->isThreadLocal() ? MMIMachO.getThreadLocalGVStubEntry(MCSym)
+ : MMIMachO.getGVStubEntry(MCSym);
+
if (!StubSym.getPointer())
StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
!GV->hasInternalLinkage());
@@ -991,7 +1037,7 @@ void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) {
// .word (LBB1 - LJTI_0_0)
const MCExpr *Expr = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
- if (TM.getRelocationModel() == Reloc::PIC_)
+ if (isPositionIndependent())
Expr = MCBinaryExpr::createSub(Expr, MCSymbolRefExpr::create(JTISymbol,
OutContext),
OutContext);
@@ -1227,6 +1273,8 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const DataLayout &DL = getDataLayout();
+ MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
+ ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
// If we just ended a constant pool, mark it as such.
if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
@@ -1643,29 +1691,26 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Non-Darwin binutils don't yet support the "trap" mnemonic.
// FIXME: Remove this special case when they do.
if (!Subtarget->isTargetMachO()) {
- //.long 0xe7ffdefe @ trap
uint32_t Val = 0xe7ffdefeUL;
OutStreamer->AddComment("trap");
- OutStreamer->EmitIntValue(Val, 4);
+ ATS.emitInst(Val);
return;
}
break;
}
case ARM::TRAPNaCl: {
- //.long 0xe7fedef0 @ trap
uint32_t Val = 0xe7fedef0UL;
OutStreamer->AddComment("trap");
- OutStreamer->EmitIntValue(Val, 4);
+ ATS.emitInst(Val);
return;
}
case ARM::tTRAP: {
// Non-Darwin binutils don't yet support the "trap" mnemonic.
// FIXME: Remove this special case when they do.
if (!Subtarget->isTargetMachO()) {
- //.short 57086 @ trap
uint16_t Val = 0xdefe;
OutStreamer->AddComment("trap");
- OutStreamer->EmitIntValue(Val, 2);
+ ATS.emitInst(Val, 'n');
return;
}
break;
@@ -1845,6 +1890,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// bx $scratch
unsigned SrcReg = MI->getOperand(0).getReg();
unsigned ScratchReg = MI->getOperand(1).getReg();
+
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
.addReg(ScratchReg)
.addReg(SrcReg)
@@ -1885,6 +1931,36 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addReg(0));
return;
}
+ case ARM::tInt_WIN_eh_sjlj_longjmp: {
+ // ldr.w r11, [$src, #0]
+ // ldr.w sp, [$src, #8]
+ // ldr.w pc, [$src, #4]
+
+ unsigned SrcReg = MI->getOperand(0).getReg();
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+ .addReg(ARM::R11)
+ .addReg(SrcReg)
+ .addImm(0)
+ // Predicate
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+ .addReg(ARM::SP)
+ .addReg(SrcReg)
+ .addImm(8)
+ // Predicate
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+ .addReg(ARM::PC)
+ .addReg(SrcReg)
+ .addImm(4)
+ // Predicate
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ return;
+ }
}
MCInst TmpInst;
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index ed7be2de51ca..97f5ca0ecbc2 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -95,6 +95,7 @@ public:
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
private:
+
// Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile()
void emitAttributes();
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 49f328852667..693f16499717 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -51,15 +51,6 @@ static cl::opt<bool>
EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
cl::desc("Enable ARM 2-addr to 3-addr conv"));
-static cl::opt<bool>
-WidenVMOVS("widen-vmovs", cl::Hidden, cl::init(true),
- cl::desc("Widen ARM vmovs to vmovd when possible"));
-
-static cl::opt<unsigned>
-SwiftPartialUpdateClearance("swift-partial-update-clearance",
- cl::Hidden, cl::init(12),
- cl::desc("Clearance before partial register updates"));
-
/// ARM_MLxEntry - Record information about MLA / MLS instructions.
struct ARM_MLxEntry {
uint16_t MLxOpc; // MLA / MLS opcode
@@ -124,18 +115,15 @@ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
}
-MachineInstr *
-ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const {
+MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
+ MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const {
// FIXME: Thumb2 support.
if (!EnableARM3Addr)
return nullptr;
- MachineInstr *MI = MBBI;
- MachineFunction &MF = *MI->getParent()->getParent();
- uint64_t TSFlags = MI->getDesc().TSFlags;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ uint64_t TSFlags = MI.getDesc().TSFlags;
bool isPre = false;
switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
default: return nullptr;
@@ -148,24 +136,24 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// Try splitting an indexed load/store to an un-indexed one plus an add/sub
// operation.
- unsigned MemOpc = getUnindexedOpcode(MI->getOpcode());
+ unsigned MemOpc = getUnindexedOpcode(MI.getOpcode());
if (MemOpc == 0)
return nullptr;
MachineInstr *UpdateMI = nullptr;
MachineInstr *MemMI = nullptr;
unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
- const MCInstrDesc &MCID = MI->getDesc();
+ const MCInstrDesc &MCID = MI.getDesc();
unsigned NumOps = MCID.getNumOperands();
- bool isLoad = !MI->mayStore();
- const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0);
- const MachineOperand &Base = MI->getOperand(2);
- const MachineOperand &Offset = MI->getOperand(NumOps-3);
+ bool isLoad = !MI.mayStore();
+ const MachineOperand &WB = isLoad ? MI.getOperand(1) : MI.getOperand(0);
+ const MachineOperand &Base = MI.getOperand(2);
+ const MachineOperand &Offset = MI.getOperand(NumOps - 3);
unsigned WBReg = WB.getReg();
unsigned BaseReg = Base.getReg();
unsigned OffReg = Offset.getReg();
- unsigned OffImm = MI->getOperand(NumOps-2).getImm();
- ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm();
+ unsigned OffImm = MI.getOperand(NumOps - 2).getImm();
+ ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI.getOperand(NumOps - 1).getImm();
switch (AddrMode) {
default: llvm_unreachable("Unknown indexed op!");
case ARMII::AddrMode2: {
@@ -176,22 +164,33 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// Can't encode it in a so_imm operand. This transformation will
// add more than 1 instruction. Abandon!
return nullptr;
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
- .addReg(BaseReg).addImm(Amt)
- .addImm(Pred).addReg(0).addReg(0);
+ .addReg(BaseReg)
+ .addImm(Amt)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
} else if (Amt != 0) {
ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
get(isSub ? ARM::SUBrsi : ARM::ADDrsi), WBReg)
- .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc)
- .addImm(Pred).addReg(0).addReg(0);
+ .addReg(BaseReg)
+ .addReg(OffReg)
+ .addReg(0)
+ .addImm(SOOpc)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
} else
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
- .addReg(BaseReg).addReg(OffReg)
- .addImm(Pred).addReg(0).addReg(0);
+ .addReg(BaseReg)
+ .addReg(OffReg)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
break;
}
case ARMII::AddrMode3 : {
@@ -199,15 +198,21 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned Amt = ARM_AM::getAM3Offset(OffImm);
if (OffReg == 0)
// Immediate is 8-bits. It's guaranteed to fit in a so_imm operand.
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
- .addReg(BaseReg).addImm(Amt)
- .addImm(Pred).addReg(0).addReg(0);
+ .addReg(BaseReg)
+ .addImm(Amt)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
else
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
- .addReg(BaseReg).addReg(OffReg)
- .addImm(Pred).addReg(0).addReg(0);
+ .addReg(BaseReg)
+ .addReg(OffReg)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
break;
}
}
@@ -215,24 +220,34 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
std::vector<MachineInstr*> NewMIs;
if (isPre) {
if (isLoad)
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc), MI->getOperand(0).getReg())
- .addReg(WBReg).addImm(0).addImm(Pred);
+ MemMI =
+ BuildMI(MF, MI.getDebugLoc(), get(MemOpc), MI.getOperand(0).getReg())
+ .addReg(WBReg)
+ .addImm(0)
+ .addImm(Pred);
else
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc)).addReg(MI->getOperand(1).getReg())
- .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+ MemMI = BuildMI(MF, MI.getDebugLoc(), get(MemOpc))
+ .addReg(MI.getOperand(1).getReg())
+ .addReg(WBReg)
+ .addReg(0)
+ .addImm(0)
+ .addImm(Pred);
NewMIs.push_back(MemMI);
NewMIs.push_back(UpdateMI);
} else {
if (isLoad)
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc), MI->getOperand(0).getReg())
- .addReg(BaseReg).addImm(0).addImm(Pred);
+ MemMI =
+ BuildMI(MF, MI.getDebugLoc(), get(MemOpc), MI.getOperand(0).getReg())
+ .addReg(BaseReg)
+ .addImm(0)
+ .addImm(Pred);
else
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc)).addReg(MI->getOperand(1).getReg())
- .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+ MemMI = BuildMI(MF, MI.getDebugLoc(), get(MemOpc))
+ .addReg(MI.getOperand(1).getReg())
+ .addReg(BaseReg)
+ .addReg(0)
+ .addImm(0)
+ .addImm(Pred);
if (WB.isDead())
UpdateMI->getOperand(0).setIsDead();
NewMIs.push_back(UpdateMI);
@@ -241,8 +256,8 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// Transfer LiveVariables states, kill / dead info.
if (LV) {
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
unsigned Reg = MO.getReg();
@@ -250,7 +265,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (MO.isDef()) {
MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
if (MO.isDead())
- LV->addVirtualRegisterDead(Reg, NewMI);
+ LV->addVirtualRegisterDead(Reg, *NewMI);
}
if (MO.isUse() && MO.isKill()) {
for (unsigned j = 0; j < 2; ++j) {
@@ -258,7 +273,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineInstr *NewMI = NewMIs[j];
if (!NewMI->readsRegister(Reg))
continue;
- LV->addVirtualRegisterKilled(Reg, NewMI);
+ LV->addVirtualRegisterKilled(Reg, *NewMI);
if (VI.removeKill(MI))
VI.Kills.push_back(NewMI);
break;
@@ -268,17 +283,18 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
}
}
+ MachineBasicBlock::iterator MBBI = MI.getIterator();
MFI->insert(MBBI, NewMIs[1]);
MFI->insert(MBBI, NewMIs[0]);
return NewMIs[0];
}
// Branch analysis.
-bool
-ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
+bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
TBB = nullptr;
FBB = nullptr;
@@ -289,7 +305,7 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
// Walk backwards from the end of the basic block until the branch is
// analyzed or we give up.
- while (isPredicated(I) || I->isTerminator() || I->isDebugValue()) {
+ while (isPredicated(*I) || I->isTerminator() || I->isDebugValue()) {
// Flag to be raised on unanalyzeable instructions. This is useful in cases
// where we want to clean up on the end of the basic block before we bail
@@ -322,7 +338,7 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
Cond.push_back(I->getOperand(2));
} else if (I->isReturn()) {
// Returns can't be analyzed, but we should run cleanup.
- CantAnalyze = !isPredicated(I);
+ CantAnalyze = !isPredicated(*I);
} else {
// We encountered other unrecognized terminator. Bail out immediately.
return true;
@@ -330,7 +346,7 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
// Cleanup code - to be run for unpredicated unconditional branches and
// returns.
- if (!isPredicated(I) &&
+ if (!isPredicated(*I) &&
(isUncondBranchOpcode(I->getOpcode()) ||
isIndirectBranchOpcode(I->getOpcode()) ||
isJumpTableBranchOpcode(I->getOpcode()) ||
@@ -344,9 +360,9 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
if (AllowModify) {
MachineBasicBlock::iterator DI = std::next(I);
while (DI != MBB.end()) {
- MachineInstr *InstToDelete = DI;
+ MachineInstr &InstToDelete = *DI;
++DI;
- InstToDelete->eraseFromParent();
+ InstToDelete.eraseFromParent();
}
}
}
@@ -390,11 +406,11 @@ unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 2;
}
-unsigned
-ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const {
+unsigned ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>();
int BOpc = !AFI->isThumbFunction()
? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB);
@@ -438,10 +454,10 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
return false;
}
-bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const {
- if (MI->isBundle()) {
- MachineBasicBlock::const_instr_iterator I = MI->getIterator();
- MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+bool ARMBaseInstrInfo::isPredicated(const MachineInstr &MI) const {
+ if (MI.isBundle()) {
+ MachineBasicBlock::const_instr_iterator I = MI.getIterator();
+ MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
while (++I != E && I->isInsideBundle()) {
int PIdx = I->findFirstPredOperandIdx();
if (PIdx != -1 && I->getOperand(PIdx).getImm() != ARMCC::AL)
@@ -450,26 +466,26 @@ bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const {
return false;
}
- int PIdx = MI->findFirstPredOperandIdx();
- return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL;
+ int PIdx = MI.findFirstPredOperandIdx();
+ return PIdx != -1 && MI.getOperand(PIdx).getImm() != ARMCC::AL;
}
-bool ARMBaseInstrInfo::
-PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Pred) const {
- unsigned Opc = MI->getOpcode();
+bool ARMBaseInstrInfo::PredicateInstruction(
+ MachineInstr &MI, ArrayRef<MachineOperand> Pred) const {
+ unsigned Opc = MI.getOpcode();
if (isUncondBranchOpcode(Opc)) {
- MI->setDesc(get(getMatchingCondBranchOpcode(Opc)));
- MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+ MI.setDesc(get(getMatchingCondBranchOpcode(Opc)));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
.addImm(Pred[0].getImm())
.addReg(Pred[1].getReg());
return true;
}
- int PIdx = MI->findFirstPredOperandIdx();
+ int PIdx = MI.findFirstPredOperandIdx();
if (PIdx != -1) {
- MachineOperand &PMO = MI->getOperand(PIdx);
+ MachineOperand &PMO = MI.getOperand(PIdx);
PMO.setImm(Pred[0].getImm());
- MI->getOperand(PIdx+1).setReg(Pred[1].getReg());
+ MI.getOperand(PIdx+1).setReg(Pred[1].getReg());
return true;
}
return false;
@@ -501,11 +517,11 @@ bool ARMBaseInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
}
}
-bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const {
+bool ARMBaseInstrInfo::DefinesPredicate(
+ MachineInstr &MI, std::vector<MachineOperand> &Pred) const {
bool Found = false;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if ((MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) ||
(MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)) {
Pred.push_back(MO);
@@ -555,21 +571,21 @@ static bool isEligibleForITBlock(const MachineInstr *MI) {
/// isPredicable - Return true if the specified instruction can be predicated.
/// By default, this returns true for every instruction with a
/// PredicateOperand.
-bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const {
- if (!MI->isPredicable())
+bool ARMBaseInstrInfo::isPredicable(MachineInstr &MI) const {
+ if (!MI.isPredicable())
return false;
- if (!isEligibleForITBlock(MI))
+ if (!isEligibleForITBlock(&MI))
return false;
ARMFunctionInfo *AFI =
- MI->getParent()->getParent()->getInfo<ARMFunctionInfo>();
+ MI.getParent()->getParent()->getInfo<ARMFunctionInfo>();
if (AFI->isThumb2Function()) {
if (getSubtarget().restrictIT())
- return isV8EligibleForIT(MI);
+ return isV8EligibleForIT(&MI);
} else { // non-Thumb
- if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
+ if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
return false;
}
@@ -594,19 +610,19 @@ template <> bool IsCPSRDead<MachineInstr>(MachineInstr *MI) {
/// GetInstSize - Return the size of the specified MachineInstr.
///
-unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
- const MachineBasicBlock &MBB = *MI->getParent();
+unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr &MI) const {
+ const MachineBasicBlock &MBB = *MI.getParent();
const MachineFunction *MF = MBB.getParent();
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
- const MCInstrDesc &MCID = MI->getDesc();
+ const MCInstrDesc &MCID = MI.getDesc();
if (MCID.getSize())
return MCID.getSize();
// If this machine instr is an inline asm, measure it.
- if (MI->getOpcode() == ARM::INLINEASM)
- return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI);
- unsigned Opc = MI->getOpcode();
+ if (MI.getOpcode() == ARM::INLINEASM)
+ return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
+ unsigned Opc = MI.getOpcode();
switch (Opc) {
default:
// pseudo-instruction sizes are zero.
@@ -628,11 +644,13 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
case ARM::JUMPTABLE_TBH:
// If this machine instr is a constant pool entry, its size is recorded as
// operand #2.
- return MI->getOperand(2).getImm();
+ return MI.getOperand(2).getImm();
case ARM::Int_eh_sjlj_longjmp:
return 16;
case ARM::tInt_eh_sjlj_longjmp:
return 10;
+ case ARM::tInt_WIN_eh_sjlj_longjmp:
+ return 12;
case ARM::Int_eh_sjlj_setjmp:
case ARM::Int_eh_sjlj_setjmp_nofp:
return 20;
@@ -641,17 +659,17 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
case ARM::t2Int_eh_sjlj_setjmp_nofp:
return 12;
case ARM::SPACE:
- return MI->getOperand(1).getImm();
+ return MI.getOperand(1).getImm();
}
}
-unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr *MI) const {
+unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr &MI) const {
unsigned Size = 0;
- MachineBasicBlock::const_instr_iterator I = MI->getIterator();
- MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+ MachineBasicBlock::const_instr_iterator I = MI.getIterator();
+ MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
while (++I != E && I->isInsideBundle()) {
assert(!I->isBundle() && "No nested bundle!");
- Size += GetInstSizeInBytes(&*I);
+ Size += GetInstSizeInBytes(*I);
}
return Size;
}
@@ -700,9 +718,9 @@ void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB,
}
void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
bool GPRDest = ARM::GPRRegClass.contains(DestReg);
bool GPRSrc = ARM::GPRRegClass.contains(SrcReg);
@@ -976,20 +994,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
}
}
-unsigned
-ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
+unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ switch (MI.getOpcode()) {
default: break;
case ARM::STRrs:
case ARM::t2STRs: // FIXME: don't use t2STRs to access frame.
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isReg() &&
- MI->getOperand(3).isImm() &&
- MI->getOperand(2).getReg() == 0 &&
- MI->getOperand(3).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isReg() &&
+ MI.getOperand(3).isImm() && MI.getOperand(2).getReg() == 0 &&
+ MI.getOperand(3).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
case ARM::STRi12:
@@ -997,27 +1012,24 @@ ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
case ARM::tSTRspi:
case ARM::VSTRD:
case ARM::VSTRS:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
case ARM::VST1q64:
case ARM::VST1d64TPseudo:
case ARM::VST1d64QPseudo:
- if (MI->getOperand(0).isFI() &&
- MI->getOperand(2).getSubReg() == 0) {
- FrameIndex = MI->getOperand(0).getIndex();
- return MI->getOperand(2).getReg();
+ if (MI.getOperand(0).isFI() && MI.getOperand(2).getSubReg() == 0) {
+ FrameIndex = MI.getOperand(0).getIndex();
+ return MI.getOperand(2).getReg();
}
break;
case ARM::VSTMQIA:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(0).getSubReg() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
}
@@ -1025,10 +1037,10 @@ ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
return 0;
}
-unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
+unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
const MachineMemOperand *Dummy;
- return MI->mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex);
+ return MI.mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex);
}
void ARMBaseInstrInfo::
@@ -1164,20 +1176,17 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
}
}
-unsigned
-ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
+unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ switch (MI.getOpcode()) {
default: break;
case ARM::LDRrs:
case ARM::t2LDRs: // FIXME: don't use t2LDRs to access frame.
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isReg() &&
- MI->getOperand(3).isImm() &&
- MI->getOperand(2).getReg() == 0 &&
- MI->getOperand(3).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isReg() &&
+ MI.getOperand(3).isImm() && MI.getOperand(2).getReg() == 0 &&
+ MI.getOperand(3).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
case ARM::LDRi12:
@@ -1185,27 +1194,24 @@ ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
case ARM::tLDRspi:
case ARM::VLDRD:
case ARM::VLDRS:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
case ARM::VLD1q64:
case ARM::VLD1d64TPseudo:
case ARM::VLD1d64QPseudo:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(0).getSubReg() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
case ARM::VLDMQIA:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(0).getSubReg() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
}
@@ -1213,20 +1219,19 @@ ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
return 0;
}
-unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const {
+unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const {
const MachineMemOperand *Dummy;
- return MI->mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+ return MI.mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
}
/// \brief Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMID_UPD
/// depending on whether the result is used.
-void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MBBI) const {
+void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
bool isThumb1 = Subtarget.isThumb1Only();
bool isThumb2 = Subtarget.isThumb2();
const ARMBaseInstrInfo *TII = Subtarget.getInstrInfo();
- MachineInstr *MI = MBBI;
DebugLoc dl = MI->getDebugLoc();
MachineBasicBlock *BB = MI->getParent();
@@ -1269,24 +1274,20 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MBBI) const {
STM.addReg(Reg, RegState::Kill);
}
- BB->erase(MBBI);
+ BB->erase(MI);
}
-bool
-ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
- MachineFunction &MF = *MI->getParent()->getParent();
- Reloc::Model RM = MF.getTarget().getRelocationModel();
-
- if (MI->getOpcode() == TargetOpcode::LOAD_STACK_GUARD) {
+bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ if (MI.getOpcode() == TargetOpcode::LOAD_STACK_GUARD) {
assert(getSubtarget().getTargetTriple().isOSBinFormatMachO() &&
"LOAD_STACK_GUARD currently supported only for MachO.");
- expandLoadStackGuard(MI, RM);
- MI->getParent()->erase(MI);
+ expandLoadStackGuard(MI);
+ MI.getParent()->erase(MI);
return true;
}
- if (MI->getOpcode() == ARM::MEMCPY) {
+ if (MI.getOpcode() == ARM::MEMCPY) {
expandMEMCPY(MI);
return true;
}
@@ -1295,14 +1296,13 @@ ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
// copyPhysReg() calls. Look for VMOVS instructions that can legally be
// widened to VMOVD. We prefer the VMOVD when possible because it may be
// changed into a VORR that can go down the NEON pipeline.
- if (!WidenVMOVS || !MI->isCopy() || Subtarget.isCortexA15() ||
- Subtarget.isFPOnlySP())
+ if (!MI.isCopy() || Subtarget.dontWidenVMOVS() || Subtarget.isFPOnlySP())
return false;
// Look for a copy between even S-registers. That is where we keep floats
// when using NEON v2f32 instructions for f32 arithmetic.
- unsigned DstRegS = MI->getOperand(0).getReg();
- unsigned SrcRegS = MI->getOperand(1).getReg();
+ unsigned DstRegS = MI.getOperand(0).getReg();
+ unsigned SrcRegS = MI.getOperand(1).getReg();
if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS))
return false;
@@ -1317,44 +1317,44 @@ ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
// We want to widen this into a DstRegD = VMOVD SrcRegD copy. This is only
// legal if the COPY already defines the full DstRegD, and it isn't a
// sub-register insertion.
- if (!MI->definesRegister(DstRegD, TRI) || MI->readsRegister(DstRegD, TRI))
+ if (!MI.definesRegister(DstRegD, TRI) || MI.readsRegister(DstRegD, TRI))
return false;
// A dead copy shouldn't show up here, but reject it just in case.
- if (MI->getOperand(0).isDead())
+ if (MI.getOperand(0).isDead())
return false;
// All clear, widen the COPY.
- DEBUG(dbgs() << "widening: " << *MI);
- MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+ DEBUG(dbgs() << "widening: " << MI);
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
// Get rid of the old <imp-def> of DstRegD. Leave it if it defines a Q-reg
// or some other super-register.
- int ImpDefIdx = MI->findRegisterDefOperandIdx(DstRegD);
+ int ImpDefIdx = MI.findRegisterDefOperandIdx(DstRegD);
if (ImpDefIdx != -1)
- MI->RemoveOperand(ImpDefIdx);
+ MI.RemoveOperand(ImpDefIdx);
// Change the opcode and operands.
- MI->setDesc(get(ARM::VMOVD));
- MI->getOperand(0).setReg(DstRegD);
- MI->getOperand(1).setReg(SrcRegD);
+ MI.setDesc(get(ARM::VMOVD));
+ MI.getOperand(0).setReg(DstRegD);
+ MI.getOperand(1).setReg(SrcRegD);
AddDefaultPred(MIB);
// We are now reading SrcRegD instead of SrcRegS. This may upset the
// register scavenger and machine verifier, so we need to indicate that we
// are reading an undefined value from SrcRegD, but a proper value from
// SrcRegS.
- MI->getOperand(1).setIsUndef();
+ MI.getOperand(1).setIsUndef();
MIB.addReg(SrcRegS, RegState::Implicit);
// SrcRegD may actually contain an unrelated value in the ssub_1
// sub-register. Don't kill it. Only kill the ssub_0 sub-register.
- if (MI->getOperand(1).isKill()) {
- MI->getOperand(1).setIsKill(false);
- MI->addRegisterKilled(SrcRegS, TRI, true);
+ if (MI.getOperand(1).isKill()) {
+ MI.getOperand(1).setIsKill(false);
+ MI.addRegisterKilled(SrcRegS, TRI, true);
}
- DEBUG(dbgs() << "replaced by: " << *MI);
+ DEBUG(dbgs() << "replaced by: " << MI);
return true;
}
@@ -1403,54 +1403,54 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
return PCLabelId;
}
-void ARMBaseInstrInfo::
-reMaterialize(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SubIdx,
- const MachineInstr *Orig,
- const TargetRegisterInfo &TRI) const {
- unsigned Opcode = Orig->getOpcode();
+void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr &Orig,
+ const TargetRegisterInfo &TRI) const {
+ unsigned Opcode = Orig.getOpcode();
switch (Opcode) {
default: {
- MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
- MI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
+ MI->substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
MBB.insert(I, MI);
break;
}
case ARM::tLDRpci_pic:
case ARM::t2LDRpci_pic: {
MachineFunction &MF = *MBB.getParent();
- unsigned CPI = Orig->getOperand(1).getIndex();
+ unsigned CPI = Orig.getOperand(1).getIndex();
unsigned PCLabelId = duplicateCPV(MF, CPI);
- MachineInstrBuilder MIB = BuildMI(MBB, I, Orig->getDebugLoc(), get(Opcode),
- DestReg)
- .addConstantPoolIndex(CPI).addImm(PCLabelId);
- MIB->setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end());
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, I, Orig.getDebugLoc(), get(Opcode), DestReg)
+ .addConstantPoolIndex(CPI)
+ .addImm(PCLabelId);
+ MIB->setMemRefs(Orig.memoperands_begin(), Orig.memoperands_end());
break;
}
}
}
-MachineInstr *
-ARMBaseInstrInfo::duplicate(MachineInstr *Orig, MachineFunction &MF) const {
+MachineInstr *ARMBaseInstrInfo::duplicate(MachineInstr &Orig,
+ MachineFunction &MF) const {
MachineInstr *MI = TargetInstrInfo::duplicate(Orig, MF);
- switch(Orig->getOpcode()) {
+ switch (Orig.getOpcode()) {
case ARM::tLDRpci_pic:
case ARM::t2LDRpci_pic: {
- unsigned CPI = Orig->getOperand(1).getIndex();
+ unsigned CPI = Orig.getOperand(1).getIndex();
unsigned PCLabelId = duplicateCPV(MF, CPI);
- Orig->getOperand(1).setIndex(CPI);
- Orig->getOperand(2).setImm(PCLabelId);
+ Orig.getOperand(1).setIndex(CPI);
+ Orig.getOperand(2).setImm(PCLabelId);
break;
}
}
return MI;
}
-bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
- const MachineInstr *MI1,
+bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0,
+ const MachineInstr &MI1,
const MachineRegisterInfo *MRI) const {
- unsigned Opcode = MI0->getOpcode();
+ unsigned Opcode = MI0.getOpcode();
if (Opcode == ARM::t2LDRpci ||
Opcode == ARM::t2LDRpci_pic ||
Opcode == ARM::tLDRpci ||
@@ -1461,13 +1461,13 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
Opcode == ARM::MOV_ga_pcrel ||
Opcode == ARM::MOV_ga_pcrel_ldr ||
Opcode == ARM::t2MOV_ga_pcrel) {
- if (MI1->getOpcode() != Opcode)
+ if (MI1.getOpcode() != Opcode)
return false;
- if (MI0->getNumOperands() != MI1->getNumOperands())
+ if (MI0.getNumOperands() != MI1.getNumOperands())
return false;
- const MachineOperand &MO0 = MI0->getOperand(1);
- const MachineOperand &MO1 = MI1->getOperand(1);
+ const MachineOperand &MO0 = MI0.getOperand(1);
+ const MachineOperand &MO1 = MI1.getOperand(1);
if (MO0.getOffset() != MO1.getOffset())
return false;
@@ -1480,7 +1480,7 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
// Ignore the PC labels.
return MO0.getGlobal() == MO1.getGlobal();
- const MachineFunction *MF = MI0->getParent()->getParent();
+ const MachineFunction *MF = MI0.getParent()->getParent();
const MachineConstantPool *MCP = MF->getConstantPool();
int CPI0 = MO0.getIndex();
int CPI1 = MO1.getIndex();
@@ -1499,13 +1499,13 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
}
return false;
} else if (Opcode == ARM::PICLDR) {
- if (MI1->getOpcode() != Opcode)
+ if (MI1.getOpcode() != Opcode)
return false;
- if (MI0->getNumOperands() != MI1->getNumOperands())
+ if (MI0.getNumOperands() != MI1.getNumOperands())
return false;
- unsigned Addr0 = MI0->getOperand(1).getReg();
- unsigned Addr1 = MI1->getOperand(1).getReg();
+ unsigned Addr0 = MI0.getOperand(1).getReg();
+ unsigned Addr1 = MI1.getOperand(1).getReg();
if (Addr0 != Addr1) {
if (!MRI ||
!TargetRegisterInfo::isVirtualRegister(Addr0) ||
@@ -1517,21 +1517,21 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
MachineInstr *Def1 = MRI->getVRegDef(Addr1);
// Check if the loaded value, e.g. a constantpool of a global address, are
// the same.
- if (!produceSameValue(Def0, Def1, MRI))
+ if (!produceSameValue(*Def0, *Def1, MRI))
return false;
}
- for (unsigned i = 3, e = MI0->getNumOperands(); i != e; ++i) {
+ for (unsigned i = 3, e = MI0.getNumOperands(); i != e; ++i) {
// %vreg12<def> = PICLDR %vreg11, 0, pred:14, pred:%noreg
- const MachineOperand &MO0 = MI0->getOperand(i);
- const MachineOperand &MO1 = MI1->getOperand(i);
+ const MachineOperand &MO0 = MI0.getOperand(i);
+ const MachineOperand &MO1 = MI1.getOperand(i);
if (!MO0.isIdenticalTo(MO1))
return false;
}
return true;
}
- return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
+ return MI0.isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
}
/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
@@ -1653,7 +1653,7 @@ bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
return true;
}
-bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const {
// Debug info is never a scheduling boundary. It's necessary to be explicit
@@ -1662,11 +1662,11 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
// considered a scheduling hazard, which is wrong. It should be the actual
// instruction preceding the dbg_value instruction(s), just like it is
// when debug info is not present.
- if (MI->isDebugValue())
+ if (MI.isDebugValue())
return false;
// Terminators and labels can't be scheduled around.
- if (MI->isTerminator() || MI->isPosition())
+ if (MI.isTerminator() || MI.isPosition())
return true;
// Treat the start of the IT block as a scheduling boundary, but schedule
@@ -1690,7 +1690,7 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
// Calls don't actually change the stack pointer, even if they have imp-defs.
// No ARM calling conventions change the stack pointer. (X86 calling
// conventions sometimes do).
- if (!MI->isCall() && MI->definesRegister(ARM::SP))
+ if (!MI.isCall() && MI.definesRegister(ARM::SP))
return true;
return false;
@@ -1718,7 +1718,7 @@ isProfitableToIfCvt(MachineBasicBlock &MBB,
CmpMI->getOpcode() == ARM::t2CMPri) {
unsigned Reg = CmpMI->getOperand(0).getReg();
unsigned PredReg = 0;
- ARMCC::CondCodes P = getInstrPredicate(CmpMI, PredReg);
+ ARMCC::CondCodes P = getInstrPredicate(*CmpMI, PredReg);
if (P == ARMCC::AL && CmpMI->getOperand(1).getImm() == 0 &&
isARMLowRegister(Reg))
return false;
@@ -1765,24 +1765,24 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB,
bool
ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
MachineBasicBlock &FMBB) const {
- // Reduce false anti-dependencies to let Swift's out-of-order execution
+ // Reduce false anti-dependencies to let the target's out-of-order execution
// engine do its thing.
- return Subtarget.isSwift();
+ return Subtarget.isProfitableToUnpredicate();
}
/// getInstrPredicate - If instruction is predicated, returns its predicate
/// condition, otherwise returns AL. It also returns the condition code
/// register by reference.
-ARMCC::CondCodes
-llvm::getInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
- int PIdx = MI->findFirstPredOperandIdx();
+ARMCC::CondCodes llvm::getInstrPredicate(const MachineInstr &MI,
+ unsigned &PredReg) {
+ int PIdx = MI.findFirstPredOperandIdx();
if (PIdx == -1) {
PredReg = 0;
return ARMCC::AL;
}
- PredReg = MI->getOperand(PIdx+1).getReg();
- return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm();
+ PredReg = MI.getOperand(PIdx+1).getReg();
+ return (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
}
@@ -1797,11 +1797,11 @@ unsigned llvm::getMatchingCondBranchOpcode(unsigned Opc) {
llvm_unreachable("Unknown unconditional branch opcode!");
}
-MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr *MI,
+MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI,
bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case ARM::MOVCCr:
case ARM::t2MOVCCr: {
// MOVCC can be commuted by inverting the condition.
@@ -1810,13 +1810,14 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr *MI,
// MOVCC AL can't be inverted. Shouldn't happen.
if (CC == ARMCC::AL || PredReg != ARM::CPSR)
return nullptr;
- MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
- if (!MI)
+ MachineInstr *CommutedMI =
+ TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ if (!CommutedMI)
return nullptr;
// After swapping the MOVCC operands, also invert the condition.
- MI->getOperand(MI->findFirstPredOperandIdx())
- .setImm(ARMCC::getOppositeCondition(CC));
- return MI;
+ CommutedMI->getOperand(CommutedMI->findFirstPredOperandIdx())
+ .setImm(ARMCC::getOppositeCondition(CC));
+ return CommutedMI;
}
}
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
@@ -1860,11 +1861,11 @@ static MachineInstr *canFoldIntoMOVCC(unsigned Reg,
return MI;
}
-bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr *MI,
+bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr &MI,
SmallVectorImpl<MachineOperand> &Cond,
unsigned &TrueOp, unsigned &FalseOp,
bool &Optimizable) const {
- assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
+ assert((MI.getOpcode() == ARM::MOVCCr || MI.getOpcode() == ARM::t2MOVCCr) &&
"Unknown select instruction");
// MOVCC operands:
// 0: Def.
@@ -1874,38 +1875,38 @@ bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr *MI,
// 4: CPSR use.
TrueOp = 1;
FalseOp = 2;
- Cond.push_back(MI->getOperand(3));
- Cond.push_back(MI->getOperand(4));
+ Cond.push_back(MI.getOperand(3));
+ Cond.push_back(MI.getOperand(4));
// We can always fold a def.
Optimizable = true;
return false;
}
MachineInstr *
-ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
+ARMBaseInstrInfo::optimizeSelect(MachineInstr &MI,
SmallPtrSetImpl<MachineInstr *> &SeenMIs,
bool PreferFalse) const {
- assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
+ assert((MI.getOpcode() == ARM::MOVCCr || MI.getOpcode() == ARM::t2MOVCCr) &&
"Unknown select instruction");
- MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
- MachineInstr *DefMI = canFoldIntoMOVCC(MI->getOperand(2).getReg(), MRI, this);
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ MachineInstr *DefMI = canFoldIntoMOVCC(MI.getOperand(2).getReg(), MRI, this);
bool Invert = !DefMI;
if (!DefMI)
- DefMI = canFoldIntoMOVCC(MI->getOperand(1).getReg(), MRI, this);
+ DefMI = canFoldIntoMOVCC(MI.getOperand(1).getReg(), MRI, this);
if (!DefMI)
return nullptr;
// Find new register class to use.
- MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1);
- unsigned DestReg = MI->getOperand(0).getReg();
+ MachineOperand FalseReg = MI.getOperand(Invert ? 2 : 1);
+ unsigned DestReg = MI.getOperand(0).getReg();
const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
if (!MRI.constrainRegClass(DestReg, PreviousClass))
return nullptr;
// Create a new predicated version of DefMI.
// Rfalse is the first use.
- MachineInstrBuilder NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- DefMI->getDesc(), DestReg);
+ MachineInstrBuilder NewMI =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), DefMI->getDesc(), DestReg);
// Copy all the DefMI operands, excluding its (null) predicate.
const MCInstrDesc &DefDesc = DefMI->getDesc();
@@ -1913,12 +1914,12 @@ ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
i != e && !DefDesc.OpInfo[i].isPredicate(); ++i)
NewMI.addOperand(DefMI->getOperand(i));
- unsigned CondCode = MI->getOperand(3).getImm();
+ unsigned CondCode = MI.getOperand(3).getImm();
if (Invert)
NewMI.addImm(ARMCC::getOppositeCondition(ARMCC::CondCodes(CondCode)));
else
NewMI.addImm(CondCode);
- NewMI.addOperand(MI->getOperand(4));
+ NewMI.addOperand(MI.getOperand(4));
// DefMI is not the -S version that sets CPSR, so add an optional %noreg.
if (NewMI->hasOptionalDef())
@@ -1940,7 +1941,7 @@ ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
// DefMI would be invalid when transferred inside the loop. Checking for a
// loop is expensive, but at least remove kill flags if they are in different
// BBs.
- if (DefMI->getParent() != MI->getParent())
+ if (DefMI->getParent() != MI.getParent())
NewMI->clearKillInfo();
// The caller will erase MI, but not DefMI.
@@ -1994,10 +1995,12 @@ unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) {
}
void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, DebugLoc dl,
- unsigned DestReg, unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
- const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII,
+ unsigned MIFlags) {
if (NumBytes == 0 && DestReg != BaseReg) {
BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), DestReg)
.addReg(BaseReg, RegState::Kill)
@@ -2281,30 +2284,30 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
-bool ARMBaseInstrInfo::
-analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
- int &CmpMask, int &CmpValue) const {
- switch (MI->getOpcode()) {
+bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const {
+ switch (MI.getOpcode()) {
default: break;
case ARM::CMPri:
case ARM::t2CMPri:
- SrcReg = MI->getOperand(0).getReg();
+ SrcReg = MI.getOperand(0).getReg();
SrcReg2 = 0;
CmpMask = ~0;
- CmpValue = MI->getOperand(1).getImm();
+ CmpValue = MI.getOperand(1).getImm();
return true;
case ARM::CMPrr:
case ARM::t2CMPrr:
- SrcReg = MI->getOperand(0).getReg();
- SrcReg2 = MI->getOperand(1).getReg();
+ SrcReg = MI.getOperand(0).getReg();
+ SrcReg2 = MI.getOperand(1).getReg();
CmpMask = ~0;
CmpValue = 0;
return true;
case ARM::TSTri:
case ARM::t2TSTri:
- SrcReg = MI->getOperand(0).getReg();
+ SrcReg = MI.getOperand(0).getReg();
SrcReg2 = 0;
- CmpMask = MI->getOperand(1).getImm();
+ CmpMask = MI.getOperand(1).getImm();
CmpValue = 0;
return true;
}
@@ -2385,25 +2388,25 @@ inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
/// E.g. SUBrr(r1,r2) and CMPrr(r1,r2). We also handle the case where two
/// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the
/// condition code of instructions which use the flags.
-bool ARMBaseInstrInfo::
-optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
- int CmpMask, int CmpValue,
- const MachineRegisterInfo *MRI) const {
+bool ARMBaseInstrInfo::optimizeCompareInstr(
+ MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+ int CmpValue, const MachineRegisterInfo *MRI) const {
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
if (!MI) return false;
// Masked compares sometimes use the same register as the corresponding 'and'.
if (CmpMask != ~0) {
- if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(MI)) {
+ if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(*MI)) {
MI = nullptr;
for (MachineRegisterInfo::use_instr_iterator
UI = MRI->use_instr_begin(SrcReg), UE = MRI->use_instr_end();
UI != UE; ++UI) {
- if (UI->getParent() != CmpInstr->getParent()) continue;
+ if (UI->getParent() != CmpInstr.getParent())
+ continue;
MachineInstr *PotentialAND = &*UI;
if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true) ||
- isPredicated(PotentialAND))
+ isPredicated(*PotentialAND))
continue;
MI = PotentialAND;
break;
@@ -2414,7 +2417,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// Get ready to iterate backward from CmpInstr.
MachineBasicBlock::iterator I = CmpInstr, E = MI,
- B = CmpInstr->getParent()->begin();
+ B = CmpInstr.getParent()->begin();
// Early exit if CmpInstr is at the beginning of the BB.
if (I == B) return false;
@@ -2427,13 +2430,13 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
if (SrcReg2 != 0)
// MI is not a candidate for CMPrr.
MI = nullptr;
- else if (MI->getParent() != CmpInstr->getParent() || CmpValue != 0) {
+ else if (MI->getParent() != CmpInstr.getParent() || CmpValue != 0) {
// Conservatively refuse to convert an instruction which isn't in the same
// BB as the comparison.
// For CMPri w/ CmpValue != 0, a Sub may still be a candidate.
// Thus we cannot return here.
- if (CmpInstr->getOpcode() == ARM::CMPri ||
- CmpInstr->getOpcode() == ARM::t2CMPri)
+ if (CmpInstr.getOpcode() == ARM::CMPri ||
+ CmpInstr.getOpcode() == ARM::t2CMPri)
MI = nullptr;
else
return false;
@@ -2453,7 +2456,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
return false;
// Check whether CmpInstr can be made redundant by the current instruction.
- if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
+ if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
Sub = &*I;
break;
}
@@ -2471,7 +2474,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
if (!MI) MI = Sub;
// We can't use a predicated instruction - it doesn't always write the flags.
- if (isPredicated(MI))
+ if (isPredicated(*MI))
return false;
switch (MI->getOpcode()) {
@@ -2519,7 +2522,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
OperandsToUpdate;
bool isSafe = false;
I = CmpInstr;
- E = CmpInstr->getParent()->end();
+ E = CmpInstr.getParent()->end();
while (!isSafe && ++I != E) {
const MachineInstr &Instr = *I;
for (unsigned IO = 0, EO = Instr.getNumOperands();
@@ -2608,7 +2611,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// If CPSR is not killed nor re-defined, we should check whether it is
// live-out. If it is live-out, do not optimize.
if (!isSafe) {
- MachineBasicBlock *MBB = CmpInstr->getParent();
+ MachineBasicBlock *MBB = CmpInstr.getParent();
for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
SE = MBB->succ_end(); SI != SE; ++SI)
if ((*SI)->isLiveIn(ARM::CPSR))
@@ -2618,8 +2621,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// Toggle the optional operand to CPSR.
MI->getOperand(5).setReg(ARM::CPSR);
MI->getOperand(5).setIsDef(true);
- assert(!isPredicated(MI) && "Can't use flags from predicated instruction");
- CmpInstr->eraseFromParent();
+ assert(!isPredicated(*MI) && "Can't use flags from predicated instruction");
+ CmpInstr.eraseFromParent();
// Modify the condition code of operands in OperandsToUpdate.
// Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
@@ -2633,42 +2636,42 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
return false;
}
-bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
- MachineInstr *DefMI, unsigned Reg,
+bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+ unsigned Reg,
MachineRegisterInfo *MRI) const {
// Fold large immediates into add, sub, or, xor.
- unsigned DefOpc = DefMI->getOpcode();
+ unsigned DefOpc = DefMI.getOpcode();
if (DefOpc != ARM::t2MOVi32imm && DefOpc != ARM::MOVi32imm)
return false;
- if (!DefMI->getOperand(1).isImm())
+ if (!DefMI.getOperand(1).isImm())
// Could be t2MOVi32imm <ga:xx>
return false;
if (!MRI->hasOneNonDBGUse(Reg))
return false;
- const MCInstrDesc &DefMCID = DefMI->getDesc();
+ const MCInstrDesc &DefMCID = DefMI.getDesc();
if (DefMCID.hasOptionalDef()) {
unsigned NumOps = DefMCID.getNumOperands();
- const MachineOperand &MO = DefMI->getOperand(NumOps-1);
+ const MachineOperand &MO = DefMI.getOperand(NumOps - 1);
if (MO.getReg() == ARM::CPSR && !MO.isDead())
// If DefMI defines CPSR and it is not dead, it's obviously not safe
// to delete DefMI.
return false;
}
- const MCInstrDesc &UseMCID = UseMI->getDesc();
+ const MCInstrDesc &UseMCID = UseMI.getDesc();
if (UseMCID.hasOptionalDef()) {
unsigned NumOps = UseMCID.getNumOperands();
- if (UseMI->getOperand(NumOps-1).getReg() == ARM::CPSR)
+ if (UseMI.getOperand(NumOps - 1).getReg() == ARM::CPSR)
// If the instruction sets the flag, do not attempt this optimization
// since it may change the semantics of the code.
return false;
}
- unsigned UseOpc = UseMI->getOpcode();
+ unsigned UseOpc = UseMI.getOpcode();
unsigned NewUseOpc = 0;
- uint32_t ImmVal = (uint32_t)DefMI->getOperand(1).getImm();
+ uint32_t ImmVal = (uint32_t)DefMI.getOperand(1).getImm();
uint32_t SOImmValV1 = 0, SOImmValV2 = 0;
bool Commute = false;
switch (UseOpc) {
@@ -2681,17 +2684,27 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
case ARM::t2ADDrr:
case ARM::t2ORRrr:
case ARM::t2EORrr: {
- Commute = UseMI->getOperand(2).getReg() != Reg;
+ Commute = UseMI.getOperand(2).getReg() != Reg;
switch (UseOpc) {
default: break;
+ case ARM::ADDrr:
case ARM::SUBrr: {
- if (Commute)
+ if (UseOpc == ARM::SUBrr && Commute)
+ return false;
+
+ // ADD/SUB are special because they're essentially the same operation, so
+ // we can handle a larger range of immediates.
+ if (ARM_AM::isSOImmTwoPartVal(ImmVal))
+ NewUseOpc = UseOpc == ARM::ADDrr ? ARM::ADDri : ARM::SUBri;
+ else if (ARM_AM::isSOImmTwoPartVal(-ImmVal)) {
+ ImmVal = -ImmVal;
+ NewUseOpc = UseOpc == ARM::ADDrr ? ARM::SUBri : ARM::ADDri;
+ } else
return false;
- ImmVal = -ImmVal;
- NewUseOpc = ARM::SUBri;
- // Fallthrough
+ SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal);
+ SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal);
+ break;
}
- case ARM::ADDrr:
case ARM::ORRrr:
case ARM::EORrr: {
if (!ARM_AM::isSOImmTwoPartVal(ImmVal))
@@ -2700,20 +2713,29 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal);
switch (UseOpc) {
default: break;
- case ARM::ADDrr: NewUseOpc = ARM::ADDri; break;
case ARM::ORRrr: NewUseOpc = ARM::ORRri; break;
case ARM::EORrr: NewUseOpc = ARM::EORri; break;
}
break;
}
+ case ARM::t2ADDrr:
case ARM::t2SUBrr: {
- if (Commute)
+ if (UseOpc == ARM::t2SUBrr && Commute)
return false;
- ImmVal = -ImmVal;
- NewUseOpc = ARM::t2SUBri;
- // Fallthrough
+
+ // ADD/SUB are special because they're essentially the same operation, so
+ // we can handle a larger range of immediates.
+ if (ARM_AM::isT2SOImmTwoPartVal(ImmVal))
+ NewUseOpc = UseOpc == ARM::t2ADDrr ? ARM::t2ADDri : ARM::t2SUBri;
+ else if (ARM_AM::isT2SOImmTwoPartVal(-ImmVal)) {
+ ImmVal = -ImmVal;
+ NewUseOpc = UseOpc == ARM::t2ADDrr ? ARM::t2SUBri : ARM::t2ADDri;
+ } else
+ return false;
+ SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal);
+ SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal);
+ break;
}
- case ARM::t2ADDrr:
case ARM::t2ORRrr:
case ARM::t2EORrr: {
if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal))
@@ -2722,7 +2744,6 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal);
switch (UseOpc) {
default: break;
- case ARM::t2ADDrr: NewUseOpc = ARM::t2ADDri; break;
case ARM::t2ORRrr: NewUseOpc = ARM::t2ORRri; break;
case ARM::t2EORrr: NewUseOpc = ARM::t2EORri; break;
}
@@ -2733,27 +2754,27 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
}
unsigned OpIdx = Commute ? 2 : 1;
- unsigned Reg1 = UseMI->getOperand(OpIdx).getReg();
- bool isKill = UseMI->getOperand(OpIdx).isKill();
+ unsigned Reg1 = UseMI.getOperand(OpIdx).getReg();
+ bool isKill = UseMI.getOperand(OpIdx).isKill();
unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg));
- AddDefaultCC(AddDefaultPred(BuildMI(*UseMI->getParent(),
- UseMI, UseMI->getDebugLoc(),
- get(NewUseOpc), NewReg)
- .addReg(Reg1, getKillRegState(isKill))
- .addImm(SOImmValV1)));
- UseMI->setDesc(get(NewUseOpc));
- UseMI->getOperand(1).setReg(NewReg);
- UseMI->getOperand(1).setIsKill();
- UseMI->getOperand(2).ChangeToImmediate(SOImmValV2);
- DefMI->eraseFromParent();
+ AddDefaultCC(
+ AddDefaultPred(BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+ get(NewUseOpc), NewReg)
+ .addReg(Reg1, getKillRegState(isKill))
+ .addImm(SOImmValV1)));
+ UseMI.setDesc(get(NewUseOpc));
+ UseMI.getOperand(1).setReg(NewReg);
+ UseMI.getOperand(1).setIsKill();
+ UseMI.getOperand(2).ChangeToImmediate(SOImmValV2);
+ DefMI.eraseFromParent();
return true;
}
static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
- const MachineInstr *MI) {
- switch (MI->getOpcode()) {
+ const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default: {
- const MCInstrDesc &Desc = MI->getDesc();
+ const MCInstrDesc &Desc = MI.getDesc();
int UOps = ItinData->getNumMicroOps(Desc.getSchedClass());
assert(UOps >= 0 && "bad # UOps");
return UOps;
@@ -2763,7 +2784,7 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRBrs:
case ARM::STRrs:
case ARM::STRBrs: {
- unsigned ShOpVal = MI->getOperand(3).getImm();
+ unsigned ShOpVal = MI.getOperand(3).getImm();
bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (!isSub &&
@@ -2776,10 +2797,10 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRH:
case ARM::STRH: {
- if (!MI->getOperand(2).getReg())
+ if (!MI.getOperand(2).getReg())
return 1;
- unsigned ShOpVal = MI->getOperand(3).getImm();
+ unsigned ShOpVal = MI.getOperand(3).getImm();
bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (!isSub &&
@@ -2792,22 +2813,22 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRSB:
case ARM::LDRSH:
- return (ARM_AM::getAM3Op(MI->getOperand(3).getImm()) == ARM_AM::sub) ? 3:2;
+ return (ARM_AM::getAM3Op(MI.getOperand(3).getImm()) == ARM_AM::sub) ? 3 : 2;
case ARM::LDRSB_POST:
case ARM::LDRSH_POST: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rm = MI->getOperand(3).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
return (Rt == Rm) ? 4 : 3;
}
case ARM::LDR_PRE_REG:
case ARM::LDRB_PRE_REG: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rm = MI->getOperand(3).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
if (Rt == Rm)
return 3;
- unsigned ShOpVal = MI->getOperand(4).getImm();
+ unsigned ShOpVal = MI.getOperand(4).getImm();
bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (!isSub &&
@@ -2820,7 +2841,7 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::STR_PRE_REG:
case ARM::STRB_PRE_REG: {
- unsigned ShOpVal = MI->getOperand(4).getImm();
+ unsigned ShOpVal = MI.getOperand(4).getImm();
bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (!isSub &&
@@ -2833,21 +2854,20 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRH_PRE:
case ARM::STRH_PRE: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rm = MI->getOperand(3).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
if (!Rm)
return 2;
if (Rt == Rm)
return 3;
- return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub)
- ? 3 : 2;
+ return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 3 : 2;
}
case ARM::LDR_POST_REG:
case ARM::LDRB_POST_REG:
case ARM::LDRH_POST: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rm = MI->getOperand(3).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
return (Rt == Rm) ? 3 : 2;
}
@@ -2866,13 +2886,13 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRSB_PRE:
case ARM::LDRSH_PRE: {
- unsigned Rm = MI->getOperand(3).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
if (Rm == 0)
return 3;
- unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
if (Rt == Rm)
return 4;
- unsigned ShOpVal = MI->getOperand(4).getImm();
+ unsigned ShOpVal = MI.getOperand(4).getImm();
bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (!isSub &&
@@ -2884,18 +2904,20 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
}
case ARM::LDRD: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rn = MI->getOperand(2).getReg();
- unsigned Rm = MI->getOperand(3).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rn = MI.getOperand(2).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
if (Rm)
- return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3;
+ return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4
+ : 3;
return (Rt == Rn) ? 3 : 2;
}
case ARM::STRD: {
- unsigned Rm = MI->getOperand(3).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
if (Rm)
- return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3;
+ return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4
+ : 3;
return 2;
}
@@ -2908,24 +2930,26 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
return 4;
case ARM::LDRD_PRE: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rn = MI->getOperand(3).getReg();
- unsigned Rm = MI->getOperand(4).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rn = MI.getOperand(3).getReg();
+ unsigned Rm = MI.getOperand(4).getReg();
if (Rm)
- return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4;
+ return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5
+ : 4;
return (Rt == Rn) ? 4 : 3;
}
case ARM::t2LDRD_PRE: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rn = MI->getOperand(3).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rn = MI.getOperand(3).getReg();
return (Rt == Rn) ? 4 : 3;
}
case ARM::STRD_PRE: {
- unsigned Rm = MI->getOperand(4).getReg();
+ unsigned Rm = MI.getOperand(4).getReg();
if (Rm)
- return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4;
+ return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5
+ : 4;
return 3;
}
@@ -2953,8 +2977,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
return 2;
case ARM::t2LDRDi8: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rn = MI->getOperand(2).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rn = MI.getOperand(2).getReg();
return (Rt == Rn) ? 3 : 2;
}
@@ -2994,22 +3018,61 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
// sizes during MC lowering. That target hook should be local to MC lowering
// because we can't ensure that it is aware of other MI forms. Doing this will
// ensure that MachineMemOperands are correctly propagated through all passes.
-unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr *MI) const {
+unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr &MI) const {
unsigned Size = 0;
- for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
- E = MI->memoperands_end(); I != E; ++I) {
+ for (MachineInstr::mmo_iterator I = MI.memoperands_begin(),
+ E = MI.memoperands_end();
+ I != E; ++I) {
Size += (*I)->getSize();
}
return Size / 4;
}
-unsigned
-ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
- const MachineInstr *MI) const {
+static unsigned getNumMicroOpsSingleIssuePlusExtras(unsigned Opc,
+ unsigned NumRegs) {
+ unsigned UOps = 1 + NumRegs; // 1 for address computation.
+ switch (Opc) {
+ default:
+ break;
+ case ARM::VLDMDIA_UPD:
+ case ARM::VLDMDDB_UPD:
+ case ARM::VLDMSIA_UPD:
+ case ARM::VLDMSDB_UPD:
+ case ARM::VSTMDIA_UPD:
+ case ARM::VSTMDDB_UPD:
+ case ARM::VSTMSIA_UPD:
+ case ARM::VSTMSDB_UPD:
+ case ARM::LDMIA_UPD:
+ case ARM::LDMDA_UPD:
+ case ARM::LDMDB_UPD:
+ case ARM::LDMIB_UPD:
+ case ARM::STMIA_UPD:
+ case ARM::STMDA_UPD:
+ case ARM::STMDB_UPD:
+ case ARM::STMIB_UPD:
+ case ARM::tLDMIA_UPD:
+ case ARM::tSTMIA_UPD:
+ case ARM::t2LDMIA_UPD:
+ case ARM::t2LDMDB_UPD:
+ case ARM::t2STMIA_UPD:
+ case ARM::t2STMDB_UPD:
+ ++UOps; // One for base register writeback.
+ break;
+ case ARM::LDMIA_RET:
+ case ARM::tPOP_RET:
+ case ARM::t2LDMIA_RET:
+ UOps += 2; // One for base reg wb, one for write to pc.
+ break;
+ }
+ return UOps;
+}
+
+unsigned ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+ const MachineInstr &MI) const {
if (!ItinData || ItinData->isEmpty())
return 1;
- const MCInstrDesc &Desc = MI->getDesc();
+ const MCInstrDesc &Desc = MI.getDesc();
unsigned Class = Desc.getSchedClass();
int ItinUOps = ItinData->getNumMicroOps(Class);
if (ItinUOps >= 0) {
@@ -3019,7 +3082,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
return ItinUOps;
}
- unsigned Opc = MI->getOpcode();
+ unsigned Opc = MI.getOpcode();
switch (Opc) {
default:
llvm_unreachable("Unexpected multi-uops instruction!");
@@ -3049,7 +3112,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
case ARM::VSTMSIA:
case ARM::VSTMSIA_UPD:
case ARM::VSTMSDB_UPD: {
- unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands();
+ unsigned NumRegs = MI.getNumOperands() - Desc.getNumOperands();
return (NumRegs / 2) + (NumRegs % 2) + 1;
}
@@ -3085,66 +3148,36 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
case ARM::t2STMDB:
case ARM::t2STMIA_UPD:
case ARM::t2STMDB_UPD: {
- unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1;
- if (Subtarget.isSwift()) {
- int UOps = 1 + NumRegs; // One for address computation, one for each ld / st.
- switch (Opc) {
- default: break;
- case ARM::VLDMDIA_UPD:
- case ARM::VLDMDDB_UPD:
- case ARM::VLDMSIA_UPD:
- case ARM::VLDMSDB_UPD:
- case ARM::VSTMDIA_UPD:
- case ARM::VSTMDDB_UPD:
- case ARM::VSTMSIA_UPD:
- case ARM::VSTMSDB_UPD:
- case ARM::LDMIA_UPD:
- case ARM::LDMDA_UPD:
- case ARM::LDMDB_UPD:
- case ARM::LDMIB_UPD:
- case ARM::STMIA_UPD:
- case ARM::STMDA_UPD:
- case ARM::STMDB_UPD:
- case ARM::STMIB_UPD:
- case ARM::tLDMIA_UPD:
- case ARM::tSTMIA_UPD:
- case ARM::t2LDMIA_UPD:
- case ARM::t2LDMDB_UPD:
- case ARM::t2STMIA_UPD:
- case ARM::t2STMDB_UPD:
- ++UOps; // One for base register writeback.
- break;
- case ARM::LDMIA_RET:
- case ARM::tPOP_RET:
- case ARM::t2LDMIA_RET:
- UOps += 2; // One for base reg wb, one for write to pc.
- break;
- }
- return UOps;
- } else if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+ unsigned NumRegs = MI.getNumOperands() - Desc.getNumOperands() + 1;
+ switch (Subtarget.getLdStMultipleTiming()) {
+ case ARMSubtarget::SingleIssuePlusExtras:
+ return getNumMicroOpsSingleIssuePlusExtras(Opc, NumRegs);
+ case ARMSubtarget::SingleIssue:
+ // Assume the worst.
+ return NumRegs;
+ case ARMSubtarget::DoubleIssue: {
if (NumRegs < 4)
return 2;
// 4 registers would be issued: 2, 2.
// 5 registers would be issued: 2, 2, 1.
- int A8UOps = (NumRegs / 2);
+ unsigned UOps = (NumRegs / 2);
if (NumRegs % 2)
- ++A8UOps;
- return A8UOps;
- } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
- int A9UOps = (NumRegs / 2);
+ ++UOps;
+ return UOps;
+ }
+ case ARMSubtarget::DoubleIssueCheckUnalignedAccess: {
+ unsigned UOps = (NumRegs / 2);
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
- if ((NumRegs % 2) ||
- !MI->hasOneMemOperand() ||
- (*MI->memoperands_begin())->getAlignment() < 8)
- ++A9UOps;
- return A9UOps;
- } else {
- // Assume the worst.
- return NumRegs;
+ if ((NumRegs % 2) || !MI.hasOneMemOperand() ||
+ (*MI.memoperands_begin())->getAlignment() < 8)
+ ++UOps;
+ return UOps;
+ }
}
}
}
+ llvm_unreachable("Didn't find the number of microops");
}
int
@@ -3428,13 +3461,13 @@ static const MachineInstr *getBundledDefMI(const TargetRegisterInfo *TRI,
}
static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI,
- const MachineInstr *MI, unsigned Reg,
+ const MachineInstr &MI, unsigned Reg,
unsigned &UseIdx, unsigned &Dist) {
Dist = 0;
- MachineBasicBlock::const_instr_iterator II = ++MI->getIterator();
+ MachineBasicBlock::const_instr_iterator II = ++MI.getIterator();
assert(II->isInsideBundle() && "Empty bundle?");
- MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+ MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
// FIXME: This doesn't properly handle multiple uses.
int Idx = -1;
@@ -3460,17 +3493,17 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI,
/// itinerary based on the def opcode and alignment. The caller will ensure that
/// adjusted latency is at least one cycle.
static int adjustDefLatency(const ARMSubtarget &Subtarget,
- const MachineInstr *DefMI,
- const MCInstrDesc *DefMCID, unsigned DefAlign) {
+ const MachineInstr &DefMI,
+ const MCInstrDesc &DefMCID, unsigned DefAlign) {
int Adjust = 0;
if (Subtarget.isCortexA8() || Subtarget.isLikeA9() || Subtarget.isCortexA7()) {
// FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
// variants are one cycle cheaper.
- switch (DefMCID->getOpcode()) {
+ switch (DefMCID.getOpcode()) {
default: break;
case ARM::LDRrs:
case ARM::LDRBrs: {
- unsigned ShOpVal = DefMI->getOperand(3).getImm();
+ unsigned ShOpVal = DefMI.getOperand(3).getImm();
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (ShImm == 0 ||
(ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
@@ -3482,7 +3515,7 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
case ARM::t2LDRHs:
case ARM::t2LDRSHs: {
// Thumb2 mode: lsl only.
- unsigned ShAmt = DefMI->getOperand(3).getImm();
+ unsigned ShAmt = DefMI.getOperand(3).getImm();
if (ShAmt == 0 || ShAmt == 2)
--Adjust;
break;
@@ -3491,11 +3524,11 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
} else if (Subtarget.isSwift()) {
// FIXME: Properly handle all of the latency adjustments for address
// writeback.
- switch (DefMCID->getOpcode()) {
+ switch (DefMCID.getOpcode()) {
default: break;
case ARM::LDRrs:
case ARM::LDRBrs: {
- unsigned ShOpVal = DefMI->getOperand(3).getImm();
+ unsigned ShOpVal = DefMI.getOperand(3).getImm();
bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (!isSub &&
@@ -3513,7 +3546,7 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
case ARM::t2LDRHs:
case ARM::t2LDRSHs: {
// Thumb2 mode: lsl only.
- unsigned ShAmt = DefMI->getOperand(3).getImm();
+ unsigned ShAmt = DefMI.getOperand(3).getImm();
if (ShAmt == 0 || ShAmt == 1 || ShAmt == 2 || ShAmt == 3)
Adjust -= 2;
break;
@@ -3521,8 +3554,8 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
}
}
- if (DefAlign < 8 && Subtarget.isLikeA9()) {
- switch (DefMCID->getOpcode()) {
+ if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment()) {
+ switch (DefMCID.getOpcode()) {
default: break;
case ARM::VLD1q8:
case ARM::VLD1q16:
@@ -3637,53 +3670,55 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
return Adjust;
}
-
-
-int
-ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI,
- unsigned UseIdx) const {
+int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &DefMI,
+ unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const {
// No operand latency. The caller may fall back to getInstrLatency.
if (!ItinData || ItinData->isEmpty())
return -1;
- const MachineOperand &DefMO = DefMI->getOperand(DefIdx);
+ const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
unsigned Reg = DefMO.getReg();
- const MCInstrDesc *DefMCID = &DefMI->getDesc();
- const MCInstrDesc *UseMCID = &UseMI->getDesc();
+ const MachineInstr *ResolvedDefMI = &DefMI;
unsigned DefAdj = 0;
- if (DefMI->isBundle()) {
- DefMI = getBundledDefMI(&getRegisterInfo(), DefMI, Reg, DefIdx, DefAdj);
- DefMCID = &DefMI->getDesc();
- }
- if (DefMI->isCopyLike() || DefMI->isInsertSubreg() ||
- DefMI->isRegSequence() || DefMI->isImplicitDef()) {
+ if (DefMI.isBundle())
+ ResolvedDefMI =
+ getBundledDefMI(&getRegisterInfo(), &DefMI, Reg, DefIdx, DefAdj);
+ if (ResolvedDefMI->isCopyLike() || ResolvedDefMI->isInsertSubreg() ||
+ ResolvedDefMI->isRegSequence() || ResolvedDefMI->isImplicitDef()) {
return 1;
}
+ const MachineInstr *ResolvedUseMI = &UseMI;
unsigned UseAdj = 0;
- if (UseMI->isBundle()) {
- unsigned NewUseIdx;
- const MachineInstr *NewUseMI = getBundledUseMI(&getRegisterInfo(), UseMI,
- Reg, NewUseIdx, UseAdj);
- if (!NewUseMI)
+ if (UseMI.isBundle()) {
+ ResolvedUseMI =
+ getBundledUseMI(&getRegisterInfo(), UseMI, Reg, UseIdx, UseAdj);
+ if (!ResolvedUseMI)
return -1;
-
- UseMI = NewUseMI;
- UseIdx = NewUseIdx;
- UseMCID = &UseMI->getDesc();
}
+ return getOperandLatencyImpl(
+ ItinData, *ResolvedDefMI, DefIdx, ResolvedDefMI->getDesc(), DefAdj, DefMO,
+ Reg, *ResolvedUseMI, UseIdx, ResolvedUseMI->getDesc(), UseAdj);
+}
+
+int ARMBaseInstrInfo::getOperandLatencyImpl(
+ const InstrItineraryData *ItinData, const MachineInstr &DefMI,
+ unsigned DefIdx, const MCInstrDesc &DefMCID, unsigned DefAdj,
+ const MachineOperand &DefMO, unsigned Reg, const MachineInstr &UseMI,
+ unsigned UseIdx, const MCInstrDesc &UseMCID, unsigned UseAdj) const {
if (Reg == ARM::CPSR) {
- if (DefMI->getOpcode() == ARM::FMSTAT) {
+ if (DefMI.getOpcode() == ARM::FMSTAT) {
// fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?)
return Subtarget.isLikeA9() ? 1 : 20;
}
// CPSR set and branch can be paired in the same cycle.
- if (UseMI->isBranch())
+ if (UseMI.isBranch())
return 0;
// Otherwise it takes the instruction latency (generally one).
@@ -3694,7 +3729,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
// incur a code size penalty (not able to use the CPSR setting 16-bit
// instructions).
if (Latency > 0 && Subtarget.isThumb2()) {
- const MachineFunction *MF = DefMI->getParent()->getParent();
+ const MachineFunction *MF = DefMI.getParent()->getParent();
// FIXME: Use Function::optForSize().
if (MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
--Latency;
@@ -3702,17 +3737,19 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
return Latency;
}
- if (DefMO.isImplicit() || UseMI->getOperand(UseIdx).isImplicit())
+ if (DefMO.isImplicit() || UseMI.getOperand(UseIdx).isImplicit())
return -1;
- unsigned DefAlign = DefMI->hasOneMemOperand()
- ? (*DefMI->memoperands_begin())->getAlignment() : 0;
- unsigned UseAlign = UseMI->hasOneMemOperand()
- ? (*UseMI->memoperands_begin())->getAlignment() : 0;
+ unsigned DefAlign = DefMI.hasOneMemOperand()
+ ? (*DefMI.memoperands_begin())->getAlignment()
+ : 0;
+ unsigned UseAlign = UseMI.hasOneMemOperand()
+ ? (*UseMI.memoperands_begin())->getAlignment()
+ : 0;
// Get the itinerary's latency if possible, and handle variable_ops.
- int Latency = getOperandLatency(ItinData, *DefMCID, DefIdx, DefAlign,
- *UseMCID, UseIdx, UseAlign);
+ int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign, UseMCID,
+ UseIdx, UseAlign);
// Unable to find operand latency. The caller may resort to getInstrLatency.
if (Latency < 0)
return Latency;
@@ -3746,10 +3783,9 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
if (!UseNode->isMachineOpcode()) {
int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx);
- if (Subtarget.isLikeA9() || Subtarget.isSwift())
- return Latency <= 2 ? 1 : Latency - 1;
- else
- return Latency <= 3 ? 1 : Latency - 2;
+ int Adj = Subtarget.getPreISelOperandLatencyAdjustment();
+ int Threshold = 1 + Adj;
+ return Latency <= Threshold ? 1 : Latency - Adj;
}
const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode());
@@ -3820,7 +3856,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
}
}
- if (DefAlign < 8 && Subtarget.isLikeA9())
+ if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment())
switch (DefMCID.getOpcode()) {
default: break;
case ARM::VLD1q8:
@@ -3946,15 +3982,15 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
return Latency;
}
-unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr *MI) const {
- if (MI->isCopyLike() || MI->isInsertSubreg() ||
- MI->isRegSequence() || MI->isImplicitDef())
+unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr &MI) const {
+ if (MI.isCopyLike() || MI.isInsertSubreg() || MI.isRegSequence() ||
+ MI.isImplicitDef())
return 0;
- if (MI->isBundle())
+ if (MI.isBundle())
return 0;
- const MCInstrDesc &MCID = MI->getDesc();
+ const MCInstrDesc &MCID = MI.getDesc();
if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) {
// When predicated, CPSR is an additional source operand for CPSR updating
@@ -3965,26 +4001,26 @@ unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr *MI) const {
}
unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
+ const MachineInstr &MI,
unsigned *PredCost) const {
- if (MI->isCopyLike() || MI->isInsertSubreg() ||
- MI->isRegSequence() || MI->isImplicitDef())
+ if (MI.isCopyLike() || MI.isInsertSubreg() || MI.isRegSequence() ||
+ MI.isImplicitDef())
return 1;
// An instruction scheduler typically runs on unbundled instructions, however
// other passes may query the latency of a bundled instruction.
- if (MI->isBundle()) {
+ if (MI.isBundle()) {
unsigned Latency = 0;
- MachineBasicBlock::const_instr_iterator I = MI->getIterator();
- MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+ MachineBasicBlock::const_instr_iterator I = MI.getIterator();
+ MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
while (++I != E && I->isInsideBundle()) {
if (I->getOpcode() != ARM::t2IT)
- Latency += getInstrLatency(ItinData, &*I, PredCost);
+ Latency += getInstrLatency(ItinData, *I, PredCost);
}
return Latency;
}
- const MCInstrDesc &MCID = MI->getDesc();
+ const MCInstrDesc &MCID = MI.getDesc();
if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) {
// When predicated, CPSR is an additional source operand for CPSR updating
// instructions, this apparently increases their latencies.
@@ -3993,7 +4029,7 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
// Be sure to call getStageLatency for an empty itinerary in case it has a
// valid MinLatency property.
if (!ItinData)
- return MI->mayLoad() ? 3 : 1;
+ return MI.mayLoad() ? 3 : 1;
unsigned Class = MCID.getSchedClass();
@@ -4005,9 +4041,9 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
unsigned Latency = ItinData->getStageLatency(Class);
// Adjust for dynamic def-side opcode variants not captured by the itinerary.
- unsigned DefAlign = MI->hasOneMemOperand()
- ? (*MI->memoperands_begin())->getAlignment() : 0;
- int Adj = adjustDefLatency(Subtarget, MI, &MCID, DefAlign);
+ unsigned DefAlign =
+ MI.hasOneMemOperand() ? (*MI.memoperands_begin())->getAlignment() : 0;
+ int Adj = adjustDefLatency(Subtarget, MI, MCID, DefAlign);
if (Adj >= 0 || (int)Latency > -Adj) {
return Latency + Adj;
}
@@ -4032,46 +4068,46 @@ int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
}
}
-bool ARMBaseInstrInfo::
-hasHighOperandLatency(const TargetSchedModel &SchedModel,
- const MachineRegisterInfo *MRI,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI, unsigned UseIdx) const {
- unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask;
- unsigned UDomain = UseMI->getDesc().TSFlags & ARMII::DomainMask;
- if (Subtarget.isCortexA8() &&
+bool ARMBaseInstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
+ const MachineRegisterInfo *MRI,
+ const MachineInstr &DefMI,
+ unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const {
+ unsigned DDomain = DefMI.getDesc().TSFlags & ARMII::DomainMask;
+ unsigned UDomain = UseMI.getDesc().TSFlags & ARMII::DomainMask;
+ if (Subtarget.nonpipelinedVFP() &&
(DDomain == ARMII::DomainVFP || UDomain == ARMII::DomainVFP))
- // CortexA8 VFP instructions are not pipelined.
return true;
// Hoist VFP / NEON instructions with 4 or higher latency.
- unsigned Latency
- = SchedModel.computeOperandLatency(DefMI, DefIdx, UseMI, UseIdx);
+ unsigned Latency =
+ SchedModel.computeOperandLatency(&DefMI, DefIdx, &UseMI, UseIdx);
if (Latency <= 3)
return false;
return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON ||
UDomain == ARMII::DomainVFP || UDomain == ARMII::DomainNEON;
}
-bool ARMBaseInstrInfo::
-hasLowDefLatency(const TargetSchedModel &SchedModel,
- const MachineInstr *DefMI, unsigned DefIdx) const {
+bool ARMBaseInstrInfo::hasLowDefLatency(const TargetSchedModel &SchedModel,
+ const MachineInstr &DefMI,
+ unsigned DefIdx) const {
const InstrItineraryData *ItinData = SchedModel.getInstrItineraries();
if (!ItinData || ItinData->isEmpty())
return false;
- unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask;
+ unsigned DDomain = DefMI.getDesc().TSFlags & ARMII::DomainMask;
if (DDomain == ARMII::DomainGeneral) {
- unsigned DefClass = DefMI->getDesc().getSchedClass();
+ unsigned DefClass = DefMI.getDesc().getSchedClass();
int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
return (DefCycle != -1 && DefCycle <= 2);
}
return false;
}
-bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr *MI,
+bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const {
- if (convertAddSubFlagsOpcode(MI->getOpcode())) {
+ if (convertAddSubFlagsOpcode(MI.getOpcode())) {
ErrInfo = "Pseudo flag setting opcodes only exist in Selection DAG";
return false;
}
@@ -4082,8 +4118,7 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr *MI,
// sequence is needed for other targets.
void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
unsigned LoadImmOpc,
- unsigned LoadOpc,
- Reloc::Model RM) const {
+ unsigned LoadOpc) const {
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
unsigned Reg = MI->getOperand(0).getReg();
@@ -4094,12 +4129,12 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
BuildMI(MBB, MI, DL, get(LoadImmOpc), Reg)
.addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
- if (Subtarget.GVIsIndirectSymbol(GV, RM)) {
+ if (Subtarget.isGVIndirectSymbol(GV)) {
MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
MIB.addReg(Reg, RegState::Kill).addImm(0);
- unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
+ auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
- MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4);
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
MIB.addMemOperand(MMO);
AddDefaultPred(MIB);
}
@@ -4146,24 +4181,24 @@ enum ARMExeDomain {
// Also see ARMInstrFormats.td and Domain* enums in ARMBaseInfo.h
//
std::pair<uint16_t, uint16_t>
-ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const {
+ARMBaseInstrInfo::getExecutionDomain(const MachineInstr &MI) const {
// If we don't have access to NEON instructions then we won't be able
// to swizzle anything to the NEON domain. Check to make sure.
if (Subtarget.hasNEON()) {
// VMOVD, VMOVRS and VMOVSR are VFP instructions, but can be changed to NEON
// if they are not predicated.
- if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI))
+ if (MI.getOpcode() == ARM::VMOVD && !isPredicated(MI))
return std::make_pair(ExeVFP, (1 << ExeVFP) | (1 << ExeNEON));
// CortexA9 is particularly picky about mixing the two and wants these
// converted.
- if (Subtarget.isCortexA9() && !isPredicated(MI) &&
- (MI->getOpcode() == ARM::VMOVRS || MI->getOpcode() == ARM::VMOVSR ||
- MI->getOpcode() == ARM::VMOVS))
+ if (Subtarget.useNEONForFPMovs() && !isPredicated(MI) &&
+ (MI.getOpcode() == ARM::VMOVRS || MI.getOpcode() == ARM::VMOVSR ||
+ MI.getOpcode() == ARM::VMOVS))
return std::make_pair(ExeVFP, (1 << ExeVFP) | (1 << ExeNEON));
}
// No other instructions can be swizzled, so just determine their domain.
- unsigned Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
+ unsigned Domain = MI.getDesc().TSFlags & ARMII::DomainMask;
if (Domain & ARMII::DomainNEON)
return std::make_pair(ExeNEON, 0);
@@ -4210,12 +4245,11 @@ static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI,
/// (including the case where the DPR itself is defined), it should not.
///
static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI,
- MachineInstr *MI,
- unsigned DReg, unsigned Lane,
- unsigned &ImplicitSReg) {
+ MachineInstr &MI, unsigned DReg,
+ unsigned Lane, unsigned &ImplicitSReg) {
// If the DPR is defined or used already, the other SPR lane will be chained
// correctly, so there is nothing to be done.
- if (MI->definesRegister(DReg, TRI) || MI->readsRegister(DReg, TRI)) {
+ if (MI.definesRegister(DReg, TRI) || MI.readsRegister(DReg, TRI)) {
ImplicitSReg = 0;
return true;
}
@@ -4224,7 +4258,7 @@ static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI,
ImplicitSReg = TRI->getSubReg(DReg,
(Lane & 1) ? ARM::ssub_0 : ARM::ssub_1);
MachineBasicBlock::LivenessQueryResult LQR =
- MI->getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI);
+ MI.getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI);
if (LQR == MachineBasicBlock::LQR_Live)
return true;
@@ -4237,106 +4271,105 @@ static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI,
return true;
}
-void
-ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
+void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
+ unsigned Domain) const {
unsigned DstReg, SrcReg, DReg;
unsigned Lane;
- MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
const TargetRegisterInfo *TRI = &getRegisterInfo();
- switch (MI->getOpcode()) {
- default:
- llvm_unreachable("cannot handle opcode!");
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("cannot handle opcode!");
+ break;
+ case ARM::VMOVD:
+ if (Domain != ExeNEON)
break;
- case ARM::VMOVD:
- if (Domain != ExeNEON)
- break;
- // Zap the predicate operands.
- assert(!isPredicated(MI) && "Cannot predicate a VORRd");
+ // Zap the predicate operands.
+ assert(!isPredicated(MI) && "Cannot predicate a VORRd");
- // Make sure we've got NEON instructions.
- assert(Subtarget.hasNEON() && "VORRd requires NEON");
+ // Make sure we've got NEON instructions.
+ assert(Subtarget.hasNEON() && "VORRd requires NEON");
- // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits)
- DstReg = MI->getOperand(0).getReg();
- SrcReg = MI->getOperand(1).getReg();
+ // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
- for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
- MI->RemoveOperand(i-1);
+ for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+ MI.RemoveOperand(i - 1);
- // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits)
- MI->setDesc(get(ARM::VORRd));
- AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
- .addReg(SrcReg)
- .addReg(SrcReg));
+ // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits)
+ MI.setDesc(get(ARM::VORRd));
+ AddDefaultPred(
+ MIB.addReg(DstReg, RegState::Define).addReg(SrcReg).addReg(SrcReg));
+ break;
+ case ARM::VMOVRS:
+ if (Domain != ExeNEON)
break;
- case ARM::VMOVRS:
- if (Domain != ExeNEON)
- break;
- assert(!isPredicated(MI) && "Cannot predicate a VGETLN");
+ assert(!isPredicated(MI) && "Cannot predicate a VGETLN");
- // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits)
- DstReg = MI->getOperand(0).getReg();
- SrcReg = MI->getOperand(1).getReg();
+ // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
- for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
- MI->RemoveOperand(i-1);
+ for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+ MI.RemoveOperand(i - 1);
- DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane);
+ DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane);
- // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps)
- // Note that DSrc has been widened and the other lane may be undef, which
- // contaminates the entire register.
- MI->setDesc(get(ARM::VGETLNi32));
- AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
- .addReg(DReg, RegState::Undef)
- .addImm(Lane));
+ // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps)
+ // Note that DSrc has been widened and the other lane may be undef, which
+ // contaminates the entire register.
+ MI.setDesc(get(ARM::VGETLNi32));
+ AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
+ .addReg(DReg, RegState::Undef)
+ .addImm(Lane));
- // The old source should be an implicit use, otherwise we might think it
- // was dead before here.
- MIB.addReg(SrcReg, RegState::Implicit);
+ // The old source should be an implicit use, otherwise we might think it
+ // was dead before here.
+ MIB.addReg(SrcReg, RegState::Implicit);
+ break;
+ case ARM::VMOVSR: {
+ if (Domain != ExeNEON)
break;
- case ARM::VMOVSR: {
- if (Domain != ExeNEON)
- break;
- assert(!isPredicated(MI) && "Cannot predicate a VSETLN");
+ assert(!isPredicated(MI) && "Cannot predicate a VSETLN");
- // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits)
- DstReg = MI->getOperand(0).getReg();
- SrcReg = MI->getOperand(1).getReg();
+ // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
- DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane);
+ DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane);
- unsigned ImplicitSReg;
- if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg))
- break;
+ unsigned ImplicitSReg;
+ if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg))
+ break;
- for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
- MI->RemoveOperand(i-1);
+ for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+ MI.RemoveOperand(i - 1);
- // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps)
- // Again DDst may be undefined at the beginning of this instruction.
- MI->setDesc(get(ARM::VSETLNi32));
- MIB.addReg(DReg, RegState::Define)
- .addReg(DReg, getUndefRegState(!MI->readsRegister(DReg, TRI)))
- .addReg(SrcReg)
- .addImm(Lane);
- AddDefaultPred(MIB);
+ // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps)
+ // Again DDst may be undefined at the beginning of this instruction.
+ MI.setDesc(get(ARM::VSETLNi32));
+ MIB.addReg(DReg, RegState::Define)
+ .addReg(DReg, getUndefRegState(!MI.readsRegister(DReg, TRI)))
+ .addReg(SrcReg)
+ .addImm(Lane);
+ AddDefaultPred(MIB);
- // The narrower destination must be marked as set to keep previous chains
- // in place.
- MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
- if (ImplicitSReg != 0)
- MIB.addReg(ImplicitSReg, RegState::Implicit);
- break;
+ // The narrower destination must be marked as set to keep previous chains
+ // in place.
+ MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+ if (ImplicitSReg != 0)
+ MIB.addReg(ImplicitSReg, RegState::Implicit);
+ break;
}
case ARM::VMOVS: {
if (Domain != ExeNEON)
break;
// Source instruction is %SDst = VMOVS %SSrc, 14, %noreg (; implicits)
- DstReg = MI->getOperand(0).getReg();
- SrcReg = MI->getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
unsigned DstLane = 0, SrcLane = 0, DDst, DSrc;
DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane);
@@ -4346,16 +4379,16 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
if (!getImplicitSPRUseForDPRUse(TRI, MI, DSrc, SrcLane, ImplicitSReg))
break;
- for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
- MI->RemoveOperand(i-1);
+ for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+ MI.RemoveOperand(i - 1);
if (DSrc == DDst) {
// Destination can be:
// %DDst = VDUPLN32d %DDst, Lane, 14, %noreg (; implicits)
- MI->setDesc(get(ARM::VDUPLN32d));
+ MI.setDesc(get(ARM::VDUPLN32d));
MIB.addReg(DDst, RegState::Define)
- .addReg(DDst, getUndefRegState(!MI->readsRegister(DDst, TRI)))
- .addImm(SrcLane);
+ .addReg(DDst, getUndefRegState(!MI.readsRegister(DDst, TRI)))
+ .addImm(SrcLane);
AddDefaultPred(MIB);
// Neither the source or the destination are naturally represented any
@@ -4380,18 +4413,18 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
// Pattern of the MachineInstrs is:
// %DDst = VEXTd32 %DSrc1, %DSrc2, Lane, 14, %noreg (;implicits)
MachineInstrBuilder NewMIB;
- NewMIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- get(ARM::VEXTd32), DDst);
+ NewMIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(ARM::VEXTd32),
+ DDst);
// On the first instruction, both DSrc and DDst may be <undef> if present.
// Specifically when the original instruction didn't have them as an
// <imp-use>.
unsigned CurReg = SrcLane == 1 && DstLane == 1 ? DSrc : DDst;
- bool CurUndef = !MI->readsRegister(CurReg, TRI);
+ bool CurUndef = !MI.readsRegister(CurReg, TRI);
NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
CurReg = SrcLane == 0 && DstLane == 0 ? DSrc : DDst;
- CurUndef = !MI->readsRegister(CurReg, TRI);
+ CurUndef = !MI.readsRegister(CurReg, TRI);
NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
NewMIB.addImm(1);
@@ -4400,17 +4433,17 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
if (SrcLane == DstLane)
NewMIB.addReg(SrcReg, RegState::Implicit);
- MI->setDesc(get(ARM::VEXTd32));
+ MI.setDesc(get(ARM::VEXTd32));
MIB.addReg(DDst, RegState::Define);
// On the second instruction, DDst has definitely been defined above, so
// it is not <undef>. DSrc, if present, can be <undef> as above.
CurReg = SrcLane == 1 && DstLane == 0 ? DSrc : DDst;
- CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI);
+ CurUndef = CurReg == DSrc && !MI.readsRegister(CurReg, TRI);
MIB.addReg(CurReg, getUndefRegState(CurUndef));
CurReg = SrcLane == 0 && DstLane == 1 ? DSrc : DDst;
- CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI);
+ CurUndef = CurReg == DSrc && !MI.readsRegister(CurReg, TRI);
MIB.addReg(CurReg, getUndefRegState(CurUndef));
MIB.addImm(1);
@@ -4446,24 +4479,23 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
// VLD1DUPd32 - Writes all D-regs, no partial reg update, 2 uops.
//
// FCONSTD can be used as a dependency-breaking instruction.
-unsigned ARMBaseInstrInfo::
-getPartialRegUpdateClearance(const MachineInstr *MI,
- unsigned OpNum,
- const TargetRegisterInfo *TRI) const {
- if (!SwiftPartialUpdateClearance ||
- !(Subtarget.isSwift() || Subtarget.isCortexA15()))
+unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance(
+ const MachineInstr &MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ auto PartialUpdateClearance = Subtarget.getPartialUpdateClearance();
+ if (!PartialUpdateClearance)
return 0;
assert(TRI && "Need TRI instance");
- const MachineOperand &MO = MI->getOperand(OpNum);
+ const MachineOperand &MO = MI.getOperand(OpNum);
if (MO.readsReg())
return 0;
unsigned Reg = MO.getReg();
int UseOp = -1;
- switch(MI->getOpcode()) {
- // Normal instructions writing only an S-register.
+ switch (MI.getOpcode()) {
+ // Normal instructions writing only an S-register.
case ARM::VLDRS:
case ARM::FCONSTS:
case ARM::VMOVSR:
@@ -4472,7 +4504,7 @@ getPartialRegUpdateClearance(const MachineInstr *MI,
case ARM::VMOVv2i32:
case ARM::VMOVv2f32:
case ARM::VMOVv1i64:
- UseOp = MI->findRegisterUseOperandIdx(Reg, false, TRI);
+ UseOp = MI.findRegisterUseOperandIdx(Reg, false, TRI);
break;
// Explicitly reads the dependency.
@@ -4485,37 +4517,35 @@ getPartialRegUpdateClearance(const MachineInstr *MI,
// If this instruction actually reads a value from Reg, there is no unwanted
// dependency.
- if (UseOp != -1 && MI->getOperand(UseOp).readsReg())
+ if (UseOp != -1 && MI.getOperand(UseOp).readsReg())
return 0;
// We must be able to clobber the whole D-reg.
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
// Virtual register must be a foo:ssub_0<def,undef> operand.
- if (!MO.getSubReg() || MI->readsVirtualRegister(Reg))
+ if (!MO.getSubReg() || MI.readsVirtualRegister(Reg))
return 0;
} else if (ARM::SPRRegClass.contains(Reg)) {
// Physical register: MI must define the full D-reg.
unsigned DReg = TRI->getMatchingSuperReg(Reg, ARM::ssub_0,
&ARM::DPRRegClass);
- if (!DReg || !MI->definesRegister(DReg, TRI))
+ if (!DReg || !MI.definesRegister(DReg, TRI))
return 0;
}
// MI has an unwanted D-register dependency.
// Avoid defs in the previous N instructrions.
- return SwiftPartialUpdateClearance;
+ return PartialUpdateClearance;
}
// Break a partial register dependency after getPartialRegUpdateClearance
// returned non-zero.
-void ARMBaseInstrInfo::
-breakPartialRegDependency(MachineBasicBlock::iterator MI,
- unsigned OpNum,
- const TargetRegisterInfo *TRI) const {
- assert(MI && OpNum < MI->getDesc().getNumDefs() && "OpNum is not a def");
+void ARMBaseInstrInfo::breakPartialRegDependency(
+ MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
+ assert(OpNum < MI.getDesc().getNumDefs() && "OpNum is not a def");
assert(TRI && "Need TRI instance");
- const MachineOperand &MO = MI->getOperand(OpNum);
+ const MachineOperand &MO = MI.getOperand(OpNum);
unsigned Reg = MO.getReg();
assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
"Can't break virtual register dependencies.");
@@ -4528,7 +4558,7 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI,
}
assert(ARM::DPRRegClass.contains(DReg) && "Can only break D-reg deps");
- assert(MI->definesRegister(DReg, TRI) && "MI doesn't clobber full D-reg");
+ assert(MI.definesRegister(DReg, TRI) && "MI doesn't clobber full D-reg");
// FIXME: In some cases, VLDRS can be changed to a VLD1DUPd32 which defines
// the full D-register by loading the same value to both lanes. The
@@ -4538,9 +4568,10 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI,
// Insert the dependency-breaking FCONSTD before MI.
// 96 is the encoding of 0.5, but the actual value doesn't matter here.
- AddDefaultPred(BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- get(ARM::FCONSTD), DReg).addImm(96));
- MI->addRegisterKilled(DReg, TRI, true);
+ AddDefaultPred(
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(ARM::FCONSTD), DReg)
+ .addImm(96));
+ MI.addRegisterKilled(DReg, TRI, true);
}
bool ARMBaseInstrInfo::hasNOP() const {
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index d80c49494c77..52b0ff17dea2 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -36,8 +36,7 @@ protected:
explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
void expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
- unsigned LoadImmOpc, unsigned LoadOpc,
- Reloc::Model RM) const;
+ unsigned LoadImmOpc, unsigned LoadOpc) const;
/// Build the equivalent inputs of a REG_SEQUENCE for the given \p MI
/// and \p DefIdx.
@@ -93,8 +92,7 @@ protected:
/// non-commutable pair of operand indices OpIdx1 and OpIdx2.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands, null pointer is returned in such cases.
- MachineInstr *commuteInstructionImpl(MachineInstr *MI,
- bool NewMI,
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const override;
@@ -107,7 +105,7 @@ public:
virtual unsigned getUnindexedOpcode(unsigned Opc) const =0;
MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
+ MachineInstr &MI,
LiveVariables *LV) const override;
virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0;
@@ -122,49 +120,49 @@ public:
const ScheduleDAG *DAG) const override;
// Branch analysis.
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify = false) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
bool
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
// Predication support.
- bool isPredicated(const MachineInstr *MI) const override;
+ bool isPredicated(const MachineInstr &MI) const override;
- ARMCC::CondCodes getPredicate(const MachineInstr *MI) const {
- int PIdx = MI->findFirstPredOperandIdx();
- return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm()
+ ARMCC::CondCodes getPredicate(const MachineInstr &MI) const {
+ int PIdx = MI.findFirstPredOperandIdx();
+ return PIdx != -1 ? (ARMCC::CondCodes)MI.getOperand(PIdx).getImm()
: ARMCC::AL;
}
- bool PredicateInstruction(MachineInstr *MI,
- ArrayRef<MachineOperand> Pred) const override;
+ bool PredicateInstruction(MachineInstr &MI,
+ ArrayRef<MachineOperand> Pred) const override;
bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
ArrayRef<MachineOperand> Pred2) const override;
- bool DefinesPredicate(MachineInstr *MI,
+ bool DefinesPredicate(MachineInstr &MI,
std::vector<MachineOperand> &Pred) const override;
- bool isPredicable(MachineInstr *MI) const override;
+ bool isPredicable(MachineInstr &MI) const override;
/// GetInstSize - Returns the size of the specified MachineInstr.
///
- virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const;
+ virtual unsigned GetInstSizeInBytes(const MachineInstr &MI) const;
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- unsigned isStoreToStackSlot(const MachineInstr *MI,
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const override;
- unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
+ unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const override;
void copyToCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
@@ -175,7 +173,7 @@ public:
const ARMSubtarget &Subtarget) const;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -190,21 +188,21 @@ public:
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
unsigned DestReg, unsigned SubIdx,
- const MachineInstr *Orig,
+ const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const override;
- MachineInstr *duplicate(MachineInstr *Orig,
+ MachineInstr *duplicate(MachineInstr &Orig,
MachineFunction &MF) const override;
const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
unsigned SubIdx, unsigned State,
const TargetRegisterInfo *TRI) const;
- bool produceSameValue(const MachineInstr *MI0, const MachineInstr *MI1,
+ bool produceSameValue(const MachineInstr &MI0, const MachineInstr &MI1,
const MachineRegisterInfo *MRI) const override;
/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
@@ -227,7 +225,7 @@ public:
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;
- bool isSchedulingBoundary(const MachineInstr *MI,
+ bool isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const override;
@@ -252,7 +250,7 @@ public:
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
- bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &CmpMask,
int &CmpValue) const override;
@@ -260,30 +258,29 @@ public:
/// that we can remove a "comparison with zero"; Remove a redundant CMP
/// instruction if the flags can be updated in the same way by an earlier
/// instruction such as SUB.
- bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
unsigned SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
- bool analyzeSelect(const MachineInstr *MI,
- SmallVectorImpl<MachineOperand> &Cond,
- unsigned &TrueOp, unsigned &FalseOp,
- bool &Optimizable) const override;
+ bool analyzeSelect(const MachineInstr &MI,
+ SmallVectorImpl<MachineOperand> &Cond, unsigned &TrueOp,
+ unsigned &FalseOp, bool &Optimizable) const override;
- MachineInstr *optimizeSelect(MachineInstr *MI,
+ MachineInstr *optimizeSelect(MachineInstr &MI,
SmallPtrSetImpl<MachineInstr *> &SeenMIs,
bool) const override;
/// FoldImmediate - 'Reg' is known to be defined by a move immediate
/// instruction, try to fold the immediate into the use instruction.
- bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
- unsigned Reg, MachineRegisterInfo *MRI) const override;
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ MachineRegisterInfo *MRI) const override;
unsigned getNumMicroOps(const InstrItineraryData *ItinData,
- const MachineInstr *MI) const override;
+ const MachineInstr &MI) const override;
int getOperandLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MachineInstr &UseMI,
unsigned UseIdx) const override;
int getOperandLatency(const InstrItineraryData *ItinData,
SDNode *DefNode, unsigned DefIdx,
@@ -291,19 +288,20 @@ public:
/// VFP/NEON execution domains.
std::pair<uint16_t, uint16_t>
- getExecutionDomain(const MachineInstr *MI) const override;
- void setExecutionDomain(MachineInstr *MI, unsigned Domain) const override;
+ getExecutionDomain(const MachineInstr &MI) const override;
+ void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
- unsigned getPartialRegUpdateClearance(const MachineInstr*, unsigned,
- const TargetRegisterInfo*) const override;
- void breakPartialRegDependency(MachineBasicBlock::iterator, unsigned,
+ unsigned
+ getPartialRegUpdateClearance(const MachineInstr &, unsigned,
+ const TargetRegisterInfo *) const override;
+ void breakPartialRegDependency(MachineInstr &, unsigned,
const TargetRegisterInfo *TRI) const override;
/// Get the number of addresses by LDM or VLDM or zero for unknown.
- unsigned getNumLDMAddresses(const MachineInstr *MI) const;
+ unsigned getNumLDMAddresses(const MachineInstr &MI) const;
private:
- unsigned getInstBundleLength(const MachineInstr *MI) const;
+ unsigned getInstBundleLength(const MachineInstr &MI) const;
int getVLDMDefCycle(const InstrItineraryData *ItinData,
const MCInstrDesc &DefMCID,
@@ -327,10 +325,17 @@ private:
const MCInstrDesc &UseMCID,
unsigned UseIdx, unsigned UseAlign) const;
- unsigned getPredicationCost(const MachineInstr *MI) const override;
+ int getOperandLatencyImpl(const InstrItineraryData *ItinData,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MCInstrDesc &DefMCID, unsigned DefAdj,
+ const MachineOperand &DefMO, unsigned Reg,
+ const MachineInstr &UseMI, unsigned UseIdx,
+ const MCInstrDesc &UseMCID, unsigned UseAdj) const;
+
+ unsigned getPredicationCost(const MachineInstr &MI) const override;
unsigned getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
+ const MachineInstr &MI,
unsigned *PredCost = nullptr) const override;
int getInstrLatency(const InstrItineraryData *ItinData,
@@ -338,19 +343,18 @@ private:
bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
const MachineRegisterInfo *MRI,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MachineInstr &UseMI,
unsigned UseIdx) const override;
bool hasLowDefLatency(const TargetSchedModel &SchedModel,
- const MachineInstr *DefMI,
+ const MachineInstr &DefMI,
unsigned DefIdx) const override;
/// verifyInstruction - Perform target specific instruction verification.
- bool verifyInstruction(const MachineInstr *MI,
+ bool verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const override;
- virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI,
- Reloc::Model RM) const = 0;
+ virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI) const = 0;
void expandMEMCPY(MachineBasicBlock::iterator) const;
@@ -447,7 +451,7 @@ static inline bool isPushOpcode(int Opc) {
/// getInstrPredicate - If instruction is predicated, returns its predicate
/// condition, otherwise returns AL. It also returns the condition code
/// register by reference.
-ARMCC::CondCodes getInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
+ARMCC::CondCodes getInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
unsigned getMatchingCondBranchOpcode(unsigned Opc);
@@ -466,21 +470,24 @@ unsigned convertAddSubFlagsOpcode(unsigned OldOpc);
/// instructions to materializea destreg = basereg + immediate in ARM / Thumb2
/// code.
void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, DebugLoc dl,
- unsigned DestReg, unsigned BaseReg, int NumBytes,
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
ARMCC::CondCodes Pred, unsigned PredReg,
const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, DebugLoc dl,
- unsigned DestReg, unsigned BaseReg, int NumBytes,
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
ARMCC::CondCodes Pred, unsigned PredReg,
const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, DebugLoc dl,
- unsigned DestReg, unsigned BaseReg,
- int NumBytes, const TargetInstrInfo &TII,
- const ARMBaseRegisterInfo& MRI,
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
+ const TargetInstrInfo &TII,
+ const ARMBaseRegisterInfo &MRI,
unsigned MIFlags = 0);
/// Tries to add registers to the reglist of a given base-updating
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index a5207705fc69..aa968efc37d4 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -49,12 +49,9 @@ ARMBaseRegisterInfo::ARMBaseRegisterInfo()
: ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), BasePtr(ARM::R6) {}
static unsigned getFramePointerReg(const ARMSubtarget &STI) {
- if (STI.isTargetMachO()) {
- if (STI.isTargetDarwin() || STI.isThumb1Only())
- return ARM::R7;
- else
- return ARM::R11;
- } else if (STI.isTargetWindows())
+ if (STI.isTargetMachO())
+ return ARM::R7;
+ else if (STI.isTargetWindows())
return ARM::R11;
else // ARM EABI
return STI.isThumb() ? ARM::R7 : ARM::R11;
@@ -63,8 +60,11 @@ static unsigned getFramePointerReg(const ARMSubtarget &STI) {
const MCPhysReg*
ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const ARMSubtarget &STI = MF->getSubtarget<ARMSubtarget>();
+ bool UseSplitPush = STI.splitFramePushPop();
const MCPhysReg *RegList =
- STI.isTargetDarwin() ? CSR_iOS_SaveList : CSR_AAPCS_SaveList;
+ STI.isTargetDarwin()
+ ? CSR_iOS_SaveList
+ : (UseSplitPush ? CSR_AAPCS_SplitPush_SaveList : CSR_AAPCS_SaveList);
const Function *F = MF->getFunction();
if (F->getCallingConv() == CallingConv::GHC) {
@@ -75,7 +75,7 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
if (STI.isMClass()) {
// M-class CPUs have hardware which saves the registers needed to allow a
// function conforming to the AAPCS to function as a handler.
- return CSR_AAPCS_SaveList;
+ return UseSplitPush ? CSR_AAPCS_SplitPush_SaveList : CSR_AAPCS_SaveList;
} else if (F->getFnAttribute("interrupt").getValueAsString() == "FIQ") {
// Fast interrupt mode gives the handler a private copy of R8-R14, so less
// need to be saved to restore user-mode state.
@@ -87,6 +87,10 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
}
}
+ if (STI.isTargetDarwin() && STI.getTargetLowering()->supportSwiftError() &&
+ F->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return CSR_iOS_SwiftError_SaveList;
+
if (STI.isTargetDarwin() && F->getCallingConv() == CallingConv::CXX_FAST_TLS)
return MF->getInfo<ARMFunctionInfo>()->isSplitCSR()
? CSR_iOS_CXX_TLS_PE_SaveList
@@ -110,6 +114,11 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
if (CC == CallingConv::GHC)
// This is academic becase all GHC calls are (supposed to be) tail calls
return CSR_NoRegs_RegMask;
+
+ if (STI.isTargetDarwin() && STI.getTargetLowering()->supportSwiftError() &&
+ MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return CSR_iOS_SwiftError_RegMask;
+
if (STI.isTargetDarwin() && CC == CallingConv::CXX_FAST_TLS)
return CSR_iOS_CXX_TLS_RegMask;
return STI.isTargetDarwin() ? CSR_iOS_RegMask : CSR_AAPCS_RegMask;
@@ -167,9 +176,8 @@ getReservedRegs(const MachineFunction &MF) const {
Reserved.set(ARM::R9);
// Reserve D16-D31 if the subtarget doesn't support them.
if (!STI.hasVFP3() || STI.hasD16()) {
- assert(ARM::D31 == ARM::D16 + 15);
- for (unsigned i = 0; i != 16; ++i)
- Reserved.set(ARM::D16 + i);
+ static_assert(ARM::D31 == ARM::D16 + 15, "Register list not consecutive!");
+ Reserved.set(ARM::D16, ARM::D31 + 1);
}
const TargetRegisterClass *RC = &ARM::GPRPairRegClass;
for(TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I!=E; ++I)
@@ -400,13 +408,10 @@ ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
-void ARMBaseRegisterInfo::
-emitLoadConstPool(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- DebugLoc dl,
- unsigned DestReg, unsigned SubIdx, int Val,
- ARMCC::CondCodes Pred,
- unsigned PredReg, unsigned MIFlags) const {
+void ARMBaseRegisterInfo::emitLoadConstPool(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
MachineConstantPool *ConstantPool = MF.getConstantPool();
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 6a9a45a65687..1eee94857e05 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -166,12 +166,12 @@ public:
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
- virtual void emitLoadConstPool(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- DebugLoc dl, unsigned DestReg, unsigned SubIdx,
- int Val, ARMCC::CondCodes Pred = ARMCC::AL,
- unsigned PredReg = 0,
- unsigned MIFlags = MachineInstr::NoFlags)const;
+ virtual void
+ emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg, unsigned SubIdx,
+ int Val, ARMCC::CondCodes Pred = ARMCC::AL,
+ unsigned PredReg = 0,
+ unsigned MIFlags = MachineInstr::NoFlags) const;
/// Code Generation virtual methods...
bool requiresRegisterScavenging(const MachineFunction &MF) const override;
diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h
index a731d00883a1..71b819362404 100644
--- a/lib/Target/ARM/ARMCallingConv.h
+++ b/lib/Target/ARM/ARMCallingConv.h
@@ -211,7 +211,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
// First consume all registers that would give an unaligned object. Whether
// we go on stack or in regs, no-one will be using them in future.
- unsigned RegAlign = RoundUpToAlignment(Align, 4) / 4;
+ unsigned RegAlign = alignTo(Align, 4) / 4;
while (RegIdx % RegAlign != 0 && RegIdx < RegList.size())
State.AllocateReg(RegList[RegIdx++]);
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 847ef87c1b26..edb69581b9d3 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -23,6 +23,12 @@ def CC_ARM_APCS : CallingConv<[
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is passed in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
@@ -42,6 +48,12 @@ def RetCC_ARM_APCS : CallingConv<[
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
CCIfType<[f32], CCBitConvertToType<i32>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is returned in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
@@ -151,6 +163,12 @@ def CC_ARM_AAPCS : CallingConv<[
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is passed in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCDelegateTo<CC_ARM_AAPCS_Common>
@@ -161,6 +179,12 @@ def RetCC_ARM_AAPCS : CallingConv<[
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is returned in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCDelegateTo<RetCC_ARM_AAPCS_Common>
@@ -179,6 +203,12 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is passed in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
// HFAs are passed in a contiguous block of registers, or on the stack
CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_Aggregate">>,
@@ -194,6 +224,12 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is returned in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
@@ -210,6 +246,14 @@ def CSR_NoRegs : CalleeSavedRegs<(add)>;
def CSR_AAPCS : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, R5, R4,
(sequence "D%u", 15, 8))>;
+// The order of callee-saved registers needs to match the order we actually push
+// them in FrameLowering, because this order is what's used by
+// PrologEpilogInserter to allocate frame index slots. So when R7 is the frame
+// pointer, we use this AAPCS alternative.
+def CSR_AAPCS_SplitPush : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
+ R11, R10, R9, R8,
+ (sequence "D%u", 15, 8))>;
+
// Constructors and destructors return 'this' in the ARM C++ ABI; since 'this'
// and the pointer return value are both passed in R0 in these cases, this can
// be partially modelled by treating R0 as a callee-saved register
@@ -222,6 +266,9 @@ def CSR_AAPCS_ThisReturn : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6,
// Also save R7-R4 first to match the stack frame fixed spill areas.
def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
+// R6 is used to pass swifterror, remove it from CSR.
+def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R6)>;
+
def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
(sub CSR_AAPCS_ThisReturn, R9))>;
@@ -235,10 +282,11 @@ def CSR_iOS_CXX_TLS : CalleeSavedRegs<(add CSR_iOS, (sequence "R%u", 12, 1),
(sequence "D%u", 31, 0))>;
// CSRs that are handled by prologue, epilogue.
-def CSR_iOS_CXX_TLS_PE : CalleeSavedRegs<(add LR)>;
+def CSR_iOS_CXX_TLS_PE : CalleeSavedRegs<(add LR, R12, R11, R7, R5, R4)>;
// CSRs that are handled explicitly via copies.
-def CSR_iOS_CXX_TLS_ViaCopy : CalleeSavedRegs<(sub CSR_iOS_CXX_TLS, LR)>;
+def CSR_iOS_CXX_TLS_ViaCopy : CalleeSavedRegs<(sub CSR_iOS_CXX_TLS,
+ CSR_iOS_CXX_TLS_PE)>;
// The "interrupt" attribute is used to generate code that is acceptable in
// exception-handlers of various kinds. It makes us use a different return
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 55c1684028c2..8511f67dccd5 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -53,6 +53,11 @@ static cl::opt<bool>
AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true),
cl::desc("Adjust basic block layout to better use TB[BH]"));
+static cl::opt<unsigned>
+CPMaxIteration("arm-constant-island-max-iteration", cl::Hidden, cl::init(30),
+ cl::desc("The max number of iteration for converge"));
+
+
/// UnknownPadding - Return the worst case padding that could result from
/// unknown offset bits. This does not include alignment padding caused by
/// known offset bits.
@@ -274,6 +279,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "ARM constant island placement and branch shortening pass";
}
@@ -293,10 +303,10 @@ namespace {
unsigned getCombinedIndex(const MachineInstr *CPEMI);
int findInRangeCPEntry(CPUser& U, unsigned UserOffset);
bool findAvailableWater(CPUser&U, unsigned UserOffset,
- water_iterator &WaterIter);
+ water_iterator &WaterIter, bool CloserWater);
void createNewWater(unsigned CPUserIndex, unsigned UserOffset,
MachineBasicBlock *&NewMBB);
- bool handleConstantPoolUser(unsigned CPUserIndex);
+ bool handleConstantPoolUser(unsigned CPUserIndex, bool CloserWater);
void removeDeadCPEMI(MachineInstr *CPEMI);
bool removeUnusedCPEntries();
bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
@@ -456,8 +466,11 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
bool CPChange = false;
for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
- CPChange |= handleConstantPoolUser(i);
- if (CPChange && ++NoCPIters > 30)
+ // For most inputs, it converges in no more than 5 iterations.
+ // If it doesn't end in 10, the input may have huge BB or many CPEs.
+ // In this case, we will try different heuristics.
+ CPChange |= handleConstantPoolUser(i, NoCPIters >= CPMaxIteration / 2);
+ if (CPChange && ++NoCPIters > CPMaxIteration)
report_fatal_error("Constant Island pass failed to converge!");
DEBUG(dumpBBs());
@@ -478,10 +491,18 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
MadeChange = true;
}
- // Shrink 32-bit Thumb2 branch, load, and store instructions.
+ // Shrink 32-bit Thumb2 load and store instructions.
if (isThumb2 && !STI->prefers32BitThumb())
MadeChange |= optimizeThumb2Instructions();
+ // Shrink 32-bit branch instructions.
+ if (isThumb && STI->hasV8MBaselineOps())
+ MadeChange |= optimizeThumb2Branches();
+
+ // Optimize jump tables using TBB / TBH.
+ if (isThumb2)
+ MadeChange |= optimizeThumb2JumpTables();
+
// After a while, this might be made debug-only, but it is not expensive.
verify();
@@ -654,7 +675,7 @@ bool ARMConstantIslands::BBHasFallthrough(MachineBasicBlock *MBB) {
// have an unconditional branch for whatever reason.
MachineBasicBlock *TBB, *FBB;
SmallVector<MachineOperand, 4> Cond;
- bool TooDifficult = TII->AnalyzeBranch(*MBB, TBB, FBB, Cond);
+ bool TooDifficult = TII->analyzeBranch(*MBB, TBB, FBB, Cond);
return TooDifficult || FBB == nullptr;
}
@@ -701,14 +722,10 @@ unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
/// information about the sizes of each block and the locations of all
/// the jump tables.
void ARMConstantIslands::scanFunctionJumpTables() {
- for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
- MBBI != E; ++MBBI) {
- MachineBasicBlock &MBB = *MBBI;
-
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I)
- if (I->isBranch() && I->getOpcode() == ARM::t2BR_JT)
- T2JumpTables.push_back(I);
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &I : MBB)
+ if (I.isBranch() && I.getOpcode() == ARM::t2BR_JT)
+ T2JumpTables.push_back(&I);
}
}
@@ -735,22 +752,18 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
adjustBBOffsetsAfter(&MF->front());
// Now go back through the instructions and build up our data structures.
- for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
- MBBI != E; ++MBBI) {
- MachineBasicBlock &MBB = *MBBI;
-
+ for (MachineBasicBlock &MBB : *MF) {
// If this block doesn't fall through into the next MBB, then this is
// 'water' that a constant pool island could be placed.
if (!BBHasFallthrough(&MBB))
WaterList.push_back(&MBB);
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- if (I->isDebugValue())
+ for (MachineInstr &I : MBB) {
+ if (I.isDebugValue())
continue;
- unsigned Opc = I->getOpcode();
- if (I->isBranch()) {
+ unsigned Opc = I.getOpcode();
+ if (I.isBranch()) {
bool isCond = false;
unsigned Bits = 0;
unsigned Scale = 1;
@@ -759,7 +772,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
default:
continue; // Ignore other JT branches
case ARM::t2BR_JT:
- T2JumpTables.push_back(I);
+ T2JumpTables.push_back(&I);
continue; // Does not get an entry in ImmBranches
case ARM::Bcc:
isCond = true;
@@ -793,11 +806,11 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
// Record this immediate branch.
unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
- ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc));
+ ImmBranches.push_back(ImmBranch(&I, MaxOffs, isCond, UOpc));
}
if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET)
- PushPopMIs.push_back(I);
+ PushPopMIs.push_back(&I);
if (Opc == ARM::CONSTPOOL_ENTRY || Opc == ARM::JUMPTABLE_ADDRS ||
Opc == ARM::JUMPTABLE_INSTS || Opc == ARM::JUMPTABLE_TBB ||
@@ -805,8 +818,8 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
continue;
// Scan the instructions for constant pool operands.
- for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
- if (I->getOperand(op).isCPI() || I->getOperand(op).isJTI()) {
+ for (unsigned op = 0, e = I.getNumOperands(); op != e; ++op)
+ if (I.getOperand(op).isCPI() || I.getOperand(op).isJTI()) {
// We found one. The addressing mode tells us the max displacement
// from the PC that this instruction permits.
@@ -865,15 +878,15 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
}
// Remember that this is a user of a CP entry.
- unsigned CPI = I->getOperand(op).getIndex();
- if (I->getOperand(op).isJTI()) {
+ unsigned CPI = I.getOperand(op).getIndex();
+ if (I.getOperand(op).isJTI()) {
JumpTableUserIndices.insert(std::make_pair(CPI, CPUsers.size()));
CPI = JumpTableEntryIndices[CPI];
}
MachineInstr *CPEMI = CPEMIs[CPI];
unsigned MaxOffs = ((1 << Bits)-1) * Scale;
- CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk, IsSoImm));
+ CPUsers.push_back(CPUser(&I, CPEMI, MaxOffs, NegOk, IsSoImm));
// Increment corresponding CPEntry reference count.
CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
@@ -896,15 +909,14 @@ void ARMConstantIslands::computeBlockSize(MachineBasicBlock *MBB) {
BBI.Unalign = 0;
BBI.PostAlign = 0;
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
- ++I) {
+ for (MachineInstr &I : *MBB) {
BBI.Size += TII->GetInstSizeInBytes(I);
// For inline asm, GetInstSizeInBytes returns a conservative estimate.
// The actual size may be smaller, but still a multiple of the instr size.
- if (I->isInlineAsm())
+ if (I.isInlineAsm())
BBI.Unalign = isThumb ? 1 : 2;
// Also consider instructions that may be shrunk later.
- else if (isThumb && mayOptimizeThumb2Instruction(I))
+ else if (isThumb && mayOptimizeThumb2Instruction(&I))
BBI.Unalign = 1;
}
@@ -929,7 +941,7 @@ unsigned ARMConstantIslands::getOffsetOf(MachineInstr *MI) const {
// Sum instructions before MI in MBB.
for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
assert(I != MBB->end() && "Didn't find MI in its own basic block?");
- Offset += TII->GetInstSizeInBytes(I);
+ Offset += TII->GetInstSizeInBytes(*I);
}
return Offset;
}
@@ -1108,7 +1120,7 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset,
Growth = CPEEnd - NextBlockOffset;
// Compute the padding that would go at the end of the CPE to align the next
// block.
- Growth += OffsetToAlignment(CPEEnd, 1u << NextBlockAlignment);
+ Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment);
// If the CPE is to be inserted before the instruction, that will raise
// the offset of the instruction. Also account for unknown alignment padding
@@ -1285,11 +1297,27 @@ static inline unsigned getUnconditionalBrDisp(int Opc) {
/// move to a lower address, so search backward from the end of the list and
/// prefer the first water that is in range.
bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
- water_iterator &WaterIter) {
+ water_iterator &WaterIter,
+ bool CloserWater) {
if (WaterList.empty())
return false;
unsigned BestGrowth = ~0u;
+ // The nearest water without splitting the UserBB is right after it.
+ // If the distance is still large (we have a big BB), then we need to split it
+ // if we don't converge after a certain number of iterations. This helps the following
+ // situation to converge:
+ // BB0:
+ // Big BB
+ // BB1:
+ // Constant Pool
+ // When a CP access is out of range, BB0 may be used as water. However,
+ // inserting islands between BB0 and BB1 makes other accesses out of range.
+ MachineBasicBlock *UserBB = U.MI->getParent();
+ unsigned MinNoSplitDisp =
+ BBInfo[UserBB->getNumber()].postOffset(getCPELogAlign(U.CPEMI));
+ if (CloserWater && MinNoSplitDisp > U.getMaxDisp() / 2)
+ return false;
for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();;
--IP) {
MachineBasicBlock* WaterBB = *IP;
@@ -1301,6 +1329,8 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
// should be relatively uncommon and when it does happen, we want to be
// sure to take advantage of it for all the CPEs near that block, so that
// we don't insert more branches than necessary.
+ // When CloserWater is true, we try to find the lowest address after (or
+ // equal to) user MI's BB, regardless of padding growth.
unsigned Growth;
if (isWaterInRange(UserOffset, WaterBB, U, Growth) &&
(WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
@@ -1312,8 +1342,11 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber()
<< " Growth=" << Growth << '\n');
- // Keep looking unless it is perfect.
- if (BestGrowth == 0)
+ if (CloserWater && WaterBB == U.MI->getParent())
+ return true;
+ // Keep looking unless it is perfect and we're not looking for the lowest
+ // possible address.
+ if (!CloserWater && BestGrowth == 0)
return true;
}
if (IP == B)
@@ -1416,7 +1449,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// iterates at least once.
BaseInsertOffset =
std::max(UserBBI.postOffset() - UPad - 8,
- UserOffset + TII->GetInstSizeInBytes(UserMI) + 1);
+ UserOffset + TII->GetInstSizeInBytes(*UserMI) + 1);
DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
}
unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad +
@@ -1426,11 +1459,11 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
unsigned CPUIndex = CPUserIndex+1;
unsigned NumCPUsers = CPUsers.size();
MachineInstr *LastIT = nullptr;
- for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
+ for (unsigned Offset = UserOffset + TII->GetInstSizeInBytes(*UserMI);
Offset < BaseInsertOffset;
- Offset += TII->GetInstSizeInBytes(MI), MI = std::next(MI)) {
+ Offset += TII->GetInstSizeInBytes(*MI), MI = std::next(MI)) {
assert(MI != UserMBB->end() && "Fell off end of block");
- if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
+ if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == &*MI) {
CPUser &U = CPUsers[CPUIndex];
if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
// Shift intertion point by one unit of alignment so it is within reach.
@@ -1447,7 +1480,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// Remember the last IT instruction.
if (MI->getOpcode() == ARM::t2IT)
- LastIT = MI;
+ LastIT = &*MI;
}
--MI;
@@ -1455,23 +1488,24 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// Avoid splitting an IT block.
if (LastIT) {
unsigned PredReg = 0;
- ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg);
if (CC != ARMCC::AL)
MI = LastIT;
}
// We really must not split an IT block.
DEBUG(unsigned PredReg;
- assert(!isThumb || getITInstrPredicate(MI, PredReg) == ARMCC::AL));
+ assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL));
- NewMBB = splitBlockBeforeInstr(MI);
+ NewMBB = splitBlockBeforeInstr(&*MI);
}
/// handleConstantPoolUser - Analyze the specified user, checking to see if it
/// is out-of-range. If so, pick up the constant pool value and move it some
/// place in-range. Return true if we changed any addresses (thus must run
/// another pass of branch lengthening), false otherwise.
-bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
+bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
+ bool CloserWater) {
CPUser &U = CPUsers[CPUserIndex];
MachineInstr *UserMI = U.MI;
MachineInstr *CPEMI = U.CPEMI;
@@ -1494,7 +1528,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
MachineBasicBlock *NewMBB;
water_iterator IP;
- if (findAvailableWater(U, UserOffset, IP)) {
+ if (findAvailableWater(U, UserOffset, IP, CloserWater)) {
DEBUG(dbgs() << "Found water in range\n");
MachineBasicBlock *WaterBB = *IP;
@@ -1584,7 +1618,7 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
CPEBB->setAlignment(0);
} else
// Entries are sorted by descending alignment, so realign from the front.
- CPEBB->setAlignment(getCPELogAlign(CPEBB->begin()));
+ CPEBB->setAlignment(getCPELogAlign(&*CPEBB->begin()));
adjustBBOffsetsAfter(CPEBB);
// An island has only one predecessor BB and one successor BB. Check if
@@ -1728,7 +1762,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
splitBlockBeforeInstr(MI);
// No need for the branch to the next block. We're adding an unconditional
// branch to the destination.
- int delta = TII->GetInstSizeInBytes(&MBB->back());
+ int delta = TII->GetInstSizeInBytes(MBB->back());
BBInfo[MBB->getNumber()].Size -= delta;
MBB->back().eraseFromParent();
// BBInfo[SplitBB].Offset is wrong temporarily, fixed below
@@ -1744,18 +1778,18 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode()))
.addMBB(NextBB).addImm(CC).addReg(CCReg);
Br.MI = &MBB->back();
- BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+ BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back());
if (isThumb)
BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB)
.addImm(ARMCC::AL).addReg(0);
else
BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
- BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+ BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back());
unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
// Remove the old conditional branch. It may or may not still be in MBB.
- BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+ BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(*MI);
MI->eraseFromParent();
adjustBBOffsetsAfter(MBB);
return true;
@@ -1852,8 +1886,6 @@ bool ARMConstantIslands::optimizeThumb2Instructions() {
}
}
- MadeChange |= optimizeThumb2Branches();
- MadeChange |= optimizeThumb2JumpTables();
return MadeChange;
}
@@ -1910,7 +1942,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
NewOpc = 0;
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(Br.MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(*Br.MI, PredReg);
if (Pred == ARMCC::EQ)
NewOpc = ARM::tCBZ;
else if (Pred == ARMCC::NE)
@@ -1928,7 +1960,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
--CmpMI;
if (CmpMI->getOpcode() == ARM::tCMPi8) {
unsigned Reg = CmpMI->getOperand(0).getReg();
- Pred = getInstrPredicate(CmpMI, PredReg);
+ Pred = getInstrPredicate(*CmpMI, PredReg);
if (Pred == ARMCC::AL &&
CmpMI->getOperand(1).getImm() == 0 &&
isARMLowRegister(Reg)) {
@@ -2170,8 +2202,8 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
}
}
- unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI);
- unsigned OrigSize = TII->GetInstSizeInBytes(MI);
+ unsigned NewSize = TII->GetInstSizeInBytes(*NewJTMI);
+ unsigned OrigSize = TII->GetInstSizeInBytes(*MI);
MI->eraseFromParent();
int Delta = OrigSize - NewSize + DeadSize;
@@ -2240,13 +2272,13 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
MachineFunction::iterator OldPrior = std::prev(BBi);
// If the block terminator isn't analyzable, don't try to move the block
- bool B = TII->AnalyzeBranch(*BB, TBB, FBB, Cond);
+ bool B = TII->analyzeBranch(*BB, TBB, FBB, Cond);
// If the block ends in an unconditional branch, move it. The prior block
// has to have an analyzable terminator for us to move this one. Be paranoid
// and make sure we're not trying to move the entry block of the function.
- if (!B && Cond.empty() && BB != MF->begin() &&
- !TII->AnalyzeBranch(*OldPrior, TBB, FBB, CondPrior)) {
+ if (!B && Cond.empty() && BB != &MF->front() &&
+ !TII->analyzeBranch(*OldPrior, TBB, FBB, CondPrior)) {
BB->moveAfter(JTBB);
OldPrior->updateTerminator();
BB->updateTerminator();
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index c9849b2605ea..c0db001cb6f1 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -50,11 +50,18 @@ const char *ARMConstantPoolValue::getModifierText() const {
switch (Modifier) {
// FIXME: Are these case sensitive? It'd be nice to lower-case all the
// strings if that's legal.
- case ARMCP::no_modifier: return "none";
- case ARMCP::TLSGD: return "tlsgd";
- case ARMCP::GOT_PREL: return "GOT_PREL";
- case ARMCP::GOTTPOFF: return "gottpoff";
- case ARMCP::TPOFF: return "tpoff";
+ case ARMCP::no_modifier:
+ return "none";
+ case ARMCP::TLSGD:
+ return "tlsgd";
+ case ARMCP::GOT_PREL:
+ return "GOT_PREL";
+ case ARMCP::GOTTPOFF:
+ return "gottpoff";
+ case ARMCP::TPOFF:
+ return "tpoff";
+ case ARMCP::SECREL:
+ return "secrel32";
}
llvm_unreachable("Unknown modifier!");
}
@@ -74,9 +81,9 @@ bool
ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) {
if (ACPV->Kind == Kind &&
ACPV->PCAdjust == PCAdjust &&
- ACPV->Modifier == Modifier) {
- if (ACPV->LabelId == LabelId)
- return true;
+ ACPV->Modifier == Modifier &&
+ ACPV->LabelId == LabelId &&
+ ACPV->AddCurrentAddress == AddCurrentAddress) {
// Two PC relative constpool entries containing the same GV address or
// external symbols. FIXME: What about blockaddress?
if (Kind == ARMCP::CPValue || Kind == ARMCP::CPExtSymbol)
@@ -85,7 +92,7 @@ ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) {
return false;
}
-void ARMConstantPoolValue::dump() const {
+LLVM_DUMP_METHOD void ARMConstantPoolValue::dump() const {
errs() << " " << *this;
}
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index 6b18a4e52878..c07331d71dad 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -37,11 +37,12 @@ namespace ARMCP {
};
enum ARMCPModifier {
- no_modifier,
- TLSGD,
- GOT_PREL,
- GOTTPOFF,
- TPOFF
+ no_modifier, ///< None
+ TLSGD, ///< Thread Local Storage (General Dynamic Mode)
+ GOT_PREL, ///< Global Offset Table, PC Relative
+ GOTTPOFF, ///< Global Offset Table, Thread Pointer Offset
+ TPOFF, ///< Thread Pointer Offset
+ SECREL, ///< Section Relative (Windows TLS)
};
}
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 56f3498e1204..56f5728ecfb8 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -20,6 +20,7 @@
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -50,6 +51,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "ARM pseudo instruction expansion pass";
}
@@ -58,7 +64,8 @@ namespace {
void TransferImpOps(MachineInstr &OldMI,
MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI);
bool ExpandMI(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI);
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
bool ExpandMBB(MachineBasicBlock &MBB);
void ExpandVLD(MachineBasicBlock::iterator &MBBI);
void ExpandVST(MachineBasicBlock::iterator &MBBI);
@@ -67,6 +74,14 @@ namespace {
unsigned Opc, bool IsExt);
void ExpandMOV32BitImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI);
+ bool ExpandCMP_SWAP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, unsigned LdrexOp,
+ unsigned StrexOp, unsigned UxtOp,
+ MachineBasicBlock::iterator &NextMBBI);
+
+ bool ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
};
char ARMExpandPseudo::ID = 0;
}
@@ -651,7 +666,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
unsigned DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm;
@@ -737,8 +752,242 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
MI.eraseFromParent();
}
+static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) {
+ for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
+ MBB->addLiveIn(*I);
+}
+
+/// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as
+/// possible. This only gets used at -O0 so we don't care about efficiency of the
+/// generated code.
+bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned LdrexOp, unsigned StrexOp,
+ unsigned UxtOp,
+ MachineBasicBlock::iterator &NextMBBI) {
+ bool IsThumb = STI->isThumb();
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineOperand &Dest = MI.getOperand(0);
+ unsigned StatusReg = MI.getOperand(1).getReg();
+ MachineOperand &Addr = MI.getOperand(2);
+ MachineOperand &Desired = MI.getOperand(3);
+ MachineOperand &New = MI.getOperand(4);
+
+ LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+ LiveRegs.stepBackward(*I);
+
+ MachineFunction *MF = MBB.getParent();
+ auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ MF->insert(++MBB.getIterator(), LoadCmpBB);
+ MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+ MF->insert(++StoreBB->getIterator(), DoneBB);
+
+ if (UxtOp) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, DL, TII->get(UxtOp), Desired.getReg())
+ .addReg(Desired.getReg(), RegState::Kill);
+ if (!IsThumb)
+ MIB.addImm(0);
+ AddDefaultPred(MIB);
+ }
+
+ // .Lloadcmp:
+ // ldrex rDest, [rAddr]
+ // cmp rDest, rDesired
+ // bne .Ldone
+ LoadCmpBB->addLiveIn(Addr.getReg());
+ LoadCmpBB->addLiveIn(Dest.getReg());
+ LoadCmpBB->addLiveIn(Desired.getReg());
+ addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(LoadCmpBB, DL, TII->get(LdrexOp), Dest.getReg());
+ MIB.addReg(Addr.getReg());
+ if (LdrexOp == ARM::t2LDREX)
+ MIB.addImm(0); // a 32-bit Thumb ldrex (only) allows an offset.
+ AddDefaultPred(MIB);
+
+ unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
+ AddDefaultPred(BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
+ .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
+ .addOperand(Desired));
+ unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
+ BuildMI(LoadCmpBB, DL, TII->get(Bcc))
+ .addMBB(DoneBB)
+ .addImm(ARMCC::NE)
+ .addReg(ARM::CPSR, RegState::Kill);
+ LoadCmpBB->addSuccessor(DoneBB);
+ LoadCmpBB->addSuccessor(StoreBB);
+
+ // .Lstore:
+ // strex rStatus, rNew, [rAddr]
+ // cmp rStatus, #0
+ // bne .Lloadcmp
+ StoreBB->addLiveIn(Addr.getReg());
+ StoreBB->addLiveIn(New.getReg());
+ addPostLoopLiveIns(StoreBB, LiveRegs);
+
+
+ MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg);
+ MIB.addOperand(New);
+ MIB.addOperand(Addr);
+ if (StrexOp == ARM::t2STREX)
+ MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset.
+ AddDefaultPred(MIB);
+
+ unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
+ AddDefaultPred(BuildMI(StoreBB, DL, TII->get(CMPri))
+ .addReg(StatusReg, RegState::Kill)
+ .addImm(0));
+ BuildMI(StoreBB, DL, TII->get(Bcc))
+ .addMBB(LoadCmpBB)
+ .addImm(ARMCC::NE)
+ .addReg(ARM::CPSR, RegState::Kill);
+ StoreBB->addSuccessor(LoadCmpBB);
+ StoreBB->addSuccessor(DoneBB);
+
+ DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+ DoneBB->transferSuccessors(&MBB);
+ addPostLoopLiveIns(DoneBB, LiveRegs);
+
+ MBB.addSuccessor(LoadCmpBB);
+
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+ return true;
+}
+
+/// ARM's ldrexd/strexd take a consecutive register pair (represented as a
+/// single GPRPair register), Thumb's take two separate registers so we need to
+/// extract the subregs from the pair.
+static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg,
+ unsigned Flags, bool IsThumb,
+ const TargetRegisterInfo *TRI) {
+ if (IsThumb) {
+ unsigned RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0);
+ unsigned RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1);
+ MIB.addReg(RegLo, Flags | getKillRegState(Reg.isDead()));
+ MIB.addReg(RegHi, Flags | getKillRegState(Reg.isDead()));
+ } else
+ MIB.addReg(Reg.getReg(), Flags | getKillRegState(Reg.isDead()));
+}
+
+/// Expand a 64-bit CMP_SWAP to an ldrexd/strexd loop.
+bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ bool IsThumb = STI->isThumb();
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineOperand &Dest = MI.getOperand(0);
+ unsigned StatusReg = MI.getOperand(1).getReg();
+ MachineOperand &Addr = MI.getOperand(2);
+ MachineOperand &Desired = MI.getOperand(3);
+ MachineOperand &New = MI.getOperand(4);
+
+ unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0);
+ unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1);
+ unsigned DesiredLo = TRI->getSubReg(Desired.getReg(), ARM::gsub_0);
+ unsigned DesiredHi = TRI->getSubReg(Desired.getReg(), ARM::gsub_1);
+
+ LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+ LiveRegs.stepBackward(*I);
+
+ MachineFunction *MF = MBB.getParent();
+ auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ MF->insert(++MBB.getIterator(), LoadCmpBB);
+ MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+ MF->insert(++StoreBB->getIterator(), DoneBB);
+
+ // .Lloadcmp:
+ // ldrexd rDestLo, rDestHi, [rAddr]
+ // cmp rDestLo, rDesiredLo
+ // sbcs rStatus<dead>, rDestHi, rDesiredHi
+ // bne .Ldone
+ LoadCmpBB->addLiveIn(Addr.getReg());
+ LoadCmpBB->addLiveIn(Dest.getReg());
+ LoadCmpBB->addLiveIn(Desired.getReg());
+ addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+ unsigned LDREXD = IsThumb ? ARM::t2LDREXD : ARM::LDREXD;
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD));
+ addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI);
+ MIB.addReg(Addr.getReg());
+ AddDefaultPred(MIB);
+
+ unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
+ AddDefaultPred(BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
+ .addReg(DestLo, getKillRegState(Dest.isDead()))
+ .addReg(DesiredLo, getKillRegState(Desired.isDead())));
+
+ unsigned SBCrr = IsThumb ? ARM::t2SBCrr : ARM::SBCrr;
+ MIB = BuildMI(LoadCmpBB, DL, TII->get(SBCrr))
+ .addReg(StatusReg, RegState::Define | RegState::Dead)
+ .addReg(DestHi, getKillRegState(Dest.isDead()))
+ .addReg(DesiredHi, getKillRegState(Desired.isDead()));
+ AddDefaultPred(MIB);
+ MIB.addReg(ARM::CPSR, RegState::Kill);
+
+ unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
+ BuildMI(LoadCmpBB, DL, TII->get(Bcc))
+ .addMBB(DoneBB)
+ .addImm(ARMCC::NE)
+ .addReg(ARM::CPSR, RegState::Kill);
+ LoadCmpBB->addSuccessor(DoneBB);
+ LoadCmpBB->addSuccessor(StoreBB);
+
+ // .Lstore:
+ // strexd rStatus, rNewLo, rNewHi, [rAddr]
+ // cmp rStatus, #0
+ // bne .Lloadcmp
+ StoreBB->addLiveIn(Addr.getReg());
+ StoreBB->addLiveIn(New.getReg());
+ addPostLoopLiveIns(StoreBB, LiveRegs);
+
+ unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD;
+ MIB = BuildMI(StoreBB, DL, TII->get(STREXD), StatusReg);
+ addExclusiveRegPair(MIB, New, 0, IsThumb, TRI);
+ MIB.addOperand(Addr);
+ AddDefaultPred(MIB);
+
+ unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
+ AddDefaultPred(BuildMI(StoreBB, DL, TII->get(CMPri))
+ .addReg(StatusReg, RegState::Kill)
+ .addImm(0));
+ BuildMI(StoreBB, DL, TII->get(Bcc))
+ .addMBB(LoadCmpBB)
+ .addImm(ARMCC::NE)
+ .addReg(ARM::CPSR, RegState::Kill);
+ StoreBB->addSuccessor(LoadCmpBB);
+ StoreBB->addSuccessor(DoneBB);
+
+ DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+ DoneBB->transferSuccessors(&MBB);
+ addPostLoopLiveIns(DoneBB, LiveRegs);
+
+ MBB.addSuccessor(LoadCmpBB);
+
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+ return true;
+}
+
+
bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) {
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
@@ -784,7 +1033,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
.addReg(JumpTarget.getReg(), RegState::Kill);
}
- MachineInstr *NewMI = std::prev(MBBI);
+ auto NewMI = std::prev(MBBI);
for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
NewMI->addOperand(MBBI->getOperand(i));
@@ -1375,6 +1624,30 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true;
case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true;
case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true;
+
+ case ARM::CMP_SWAP_8:
+ if (STI->isThumb())
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB,
+ ARM::tUXTB, NextMBBI);
+ else
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXB, ARM::STREXB,
+ ARM::UXTB, NextMBBI);
+ case ARM::CMP_SWAP_16:
+ if (STI->isThumb())
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH,
+ ARM::tUXTH, NextMBBI);
+ else
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH,
+ ARM::UXTH, NextMBBI);
+ case ARM::CMP_SWAP_32:
+ if (STI->isThumb())
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0,
+ NextMBBI);
+ else
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREX, ARM::STREX, 0, NextMBBI);
+
+ case ARM::CMP_SWAP_64:
+ return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI);
}
}
@@ -1384,7 +1657,7 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
while (MBBI != E) {
MachineBasicBlock::iterator NMBBI = std::next(MBBI);
- Modified |= ExpandMI(MBB, MBBI);
+ Modified |= ExpandMI(MBB, MBBI, NMBBI);
MBBI = NMBBI;
}
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index ff2fcfa349dc..13724da5d4f7 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -22,7 +22,6 @@
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -41,7 +40,6 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
@@ -110,11 +108,6 @@ class ARMFastISel final : public FastISel {
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill,
unsigned Op1, bool Op1IsKill);
- unsigned fastEmitInst_rrr(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill,
- unsigned Op2, bool Op2IsKill);
unsigned fastEmitInst_ri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill,
@@ -164,6 +157,7 @@ class ARMFastISel final : public FastISel {
// Utility routines.
private:
+ bool isPositionIndependent() const;
bool isTypeLegal(Type *Ty, MVT &VT);
bool isLoadTypeLegal(Type *Ty, MVT &VT);
bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
@@ -215,7 +209,7 @@ class ARMFastISel final : public FastISel {
const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB);
void AddLoadStoreOperands(MVT VT, Address &Addr,
const MachineInstrBuilder &MIB,
- unsigned Flags, bool useAM3);
+ MachineMemOperand::Flags Flags, bool useAM3);
};
} // end anonymous namespace
@@ -331,38 +325,6 @@ unsigned ARMFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
return ResultReg;
}
-unsigned ARMFastISel::fastEmitInst_rrr(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill,
- unsigned Op2, bool Op2IsKill) {
- unsigned ResultReg = createResultReg(RC);
- const MCInstrDesc &II = TII.get(MachineInstOpcode);
-
- // Make sure the input operands are sufficiently constrained to be legal
- // for this instruction.
- Op0 = constrainOperandRegClass(II, Op0, 1);
- Op1 = constrainOperandRegClass(II, Op1, 2);
- Op2 = constrainOperandRegClass(II, Op1, 3);
-
- if (II.getNumDefs() >= 1) {
- AddOptionalDefs(
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
- .addReg(Op0, Op0IsKill * RegState::Kill)
- .addReg(Op1, Op1IsKill * RegState::Kill)
- .addReg(Op2, Op2IsKill * RegState::Kill));
- } else {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
- .addReg(Op0, Op0IsKill * RegState::Kill)
- .addReg(Op1, Op1IsKill * RegState::Kill)
- .addReg(Op2, Op2IsKill * RegState::Kill));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), ResultReg)
- .addReg(II.ImplicitDefs[0]));
- }
- return ResultReg;
-}
-
unsigned ARMFastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill,
@@ -576,12 +538,15 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
return ResultReg;
}
+bool ARMFastISel::isPositionIndependent() const {
+ return TLI.isPositionIndependent();
+}
+
unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
// For now 32-bit only.
if (VT != MVT::i32 || GV->isThreadLocal()) return 0;
- Reloc::Model RelocM = TM.getRelocationModel();
- bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM);
+ bool IsIndirect = Subtarget->isGVIndirectSymbol(GV);
const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass
: &ARM::GPRRegClass;
unsigned DestReg = createResultReg(RC);
@@ -591,23 +556,20 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
bool IsThreadLocal = GVar && GVar->isThreadLocal();
if (!Subtarget->isTargetMachO() && IsThreadLocal) return 0;
+ bool IsPositionIndependent = isPositionIndependent();
// Use movw+movt when possible, it avoids constant pool entries.
// Non-darwin targets only support static movt relocations in FastISel.
if (Subtarget->useMovt(*FuncInfo.MF) &&
- (Subtarget->isTargetMachO() || RelocM == Reloc::Static)) {
+ (Subtarget->isTargetMachO() || !IsPositionIndependent)) {
unsigned Opc;
unsigned char TF = 0;
if (Subtarget->isTargetMachO())
TF = ARMII::MO_NONLAZY;
- switch (RelocM) {
- case Reloc::PIC_:
+ if (IsPositionIndependent)
Opc = isThumb2 ? ARM::t2MOV_ga_pcrel : ARM::MOV_ga_pcrel;
- break;
- default:
+ else
Opc = isThumb2 ? ARM::t2MOVi32imm : ARM::MOVi32imm;
- break;
- }
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), DestReg).addGlobalAddress(GV, 0, TF));
} else {
@@ -618,12 +580,11 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
Align = DL.getTypeAllocSize(GV->getType());
}
- if (Subtarget->isTargetELF() && RelocM == Reloc::PIC_)
+ if (Subtarget->isTargetELF() && IsPositionIndependent)
return ARMLowerPICELF(GV, Align, VT);
// Grab index.
- unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 :
- (Subtarget->isThumb() ? 4 : 8);
+ unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
unsigned Id = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id,
ARMCP::CPValue,
@@ -633,10 +594,10 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
// Load value.
MachineInstrBuilder MIB;
if (isThumb2) {
- unsigned Opc = (RelocM!=Reloc::PIC_) ? ARM::t2LDRpci : ARM::t2LDRpci_pic;
+ unsigned Opc = IsPositionIndependent ? ARM::t2LDRpci_pic : ARM::t2LDRpci;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
DestReg).addConstantPoolIndex(Idx);
- if (RelocM == Reloc::PIC_)
+ if (IsPositionIndependent)
MIB.addImm(Id);
AddOptionalDefs(MIB);
} else {
@@ -648,7 +609,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
.addImm(0);
AddOptionalDefs(MIB);
- if (RelocM == Reloc::PIC_) {
+ if (IsPositionIndependent) {
unsigned Opc = IsIndirect ? ARM::PICLDR : ARM::PICADD;
unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
@@ -912,7 +873,8 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3) {
void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
const MachineInstrBuilder &MIB,
- unsigned Flags, bool useAM3) {
+ MachineMemOperand::Flags Flags,
+ bool useAM3) {
// addrmode5 output depends on the selection dag addressing dividing the
// offset by 4 that it then later multiplies. Do this here as well.
if (VT.SimpleTy == MVT::f32 || VT.SimpleTy == MVT::f64)
@@ -931,7 +893,7 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
// ARM halfword load/stores and signed byte loads need an additional
// operand.
if (useAM3) {
- signed Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
+ int Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
MIB.addReg(0);
MIB.addImm(Imm);
} else {
@@ -945,7 +907,7 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
// ARM halfword load/stores and signed byte loads need an additional
// operand.
if (useAM3) {
- signed Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
+ int Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
MIB.addReg(0);
MIB.addImm(Imm);
} else {
@@ -1062,6 +1024,21 @@ bool ARMFastISel::SelectLoad(const Instruction *I) {
if (cast<LoadInst>(I)->isAtomic())
return false;
+ const Value *SV = I->getOperand(0);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(SV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
// Verify we have a legal type before going any further.
MVT VT;
if (!isLoadTypeLegal(I->getType(), VT))
@@ -1177,6 +1154,21 @@ bool ARMFastISel::SelectStore(const Instruction *I) {
if (cast<StoreInst>(I)->isAtomic())
return false;
+ const Value *PtrV = I->getOperand(1);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
// Verify we have a legal type before going any further.
MVT VT;
if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT))
@@ -1726,6 +1718,13 @@ bool ARMFastISel::SelectRem(const Instruction *I, bool isSigned) {
if (!isTypeLegal(Ty, VT))
return false;
+ // Many ABIs do not provide a libcall for standalone remainder, so we need to
+ // use divrem (see the RTABI 4.3.1). Since FastISel can't handle non-double
+ // multi-reg returns, we'll have to bail out.
+ if (!TLI.hasStandaloneRem(VT)) {
+ return false;
+ }
+
RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
if (VT == MVT::i8)
LC = isSigned ? RTLIB::SREM_I8 : RTLIB::UREM_I8;
@@ -1847,6 +1846,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
}
// Fallthrough
case CallingConv::C:
+ case CallingConv::CXX_FAST_TLS:
// Use target triple & subtarget features to do actual dispatch.
if (Subtarget->isAAPCS_ABI()) {
if (Subtarget->hasVFP2() &&
@@ -1858,6 +1858,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
}
case CallingConv::ARM_AAPCS_VFP:
+ case CallingConv::Swift:
if (!isVarArg)
return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
// Fall through to soft float variant, variadic functions don't
@@ -2083,6 +2084,10 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
if (!FuncInfo.CanLowerReturn)
return false;
+ if (TLI.supportSwiftError() &&
+ F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return false;
+
if (TLI.supportSplitCSR(FuncInfo.MF))
return false;
@@ -2295,8 +2300,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
// TODO: Avoid some calling conventions?
- PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
- FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+ FunctionType *FTy = CS.getFunctionType();
bool isVarArg = FTy->isVarArg();
// Handle *simple* calls for now.
@@ -2345,6 +2349,8 @@ bool ARMFastISel::SelectCall(const Instruction *I,
// FIXME: Only handle *easy* calls for now.
if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
+ CS.paramHasAttr(AttrInd, Attribute::SwiftSelf) ||
+ CS.paramHasAttr(AttrInd, Attribute::SwiftError) ||
CS.paramHasAttr(AttrInd, Attribute::Nest) ||
CS.paramHasAttr(AttrInd, Attribute::ByVal))
return false;
@@ -2394,22 +2400,15 @@ bool ARMFastISel::SelectCall(const Instruction *I,
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
DbgLoc, TII.get(CallOpc));
- unsigned char OpFlags = 0;
-
- // Add MO_PLT for global address or external symbol in the PIC relocation
- // model.
- if (Subtarget->isTargetELF() && TM.getRelocationModel() == Reloc::PIC_)
- OpFlags = ARMII::MO_PLT;
-
// ARM calls don't take a predicate, but tBL / tBLX do.
if(isThumb2)
AddDefaultPred(MIB);
if (UseReg)
MIB.addReg(CalleeReg);
else if (!IntrMemName)
- MIB.addGlobalAddress(GV, 0, OpFlags);
+ MIB.addGlobalAddress(GV, 0, 0);
else
- MIB.addExternalSymbol(IntrMemName, OpFlags);
+ MIB.addExternalSymbol(IntrMemName, 0);
// Add implicit physical register uses to the call.
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
@@ -2942,8 +2941,7 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
unsigned Align, MVT VT) {
- bool UseGOT_PREL =
- !(GV->hasHiddenVisibility() || GV->hasLocalLinkage());
+ bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
LLVMContext *Context = &MF->getFunction()->getContext();
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
@@ -3006,6 +3004,7 @@ bool ARMFastISel::fastLowerArguments() {
case CallingConv::ARM_AAPCS_VFP:
case CallingConv::ARM_AAPCS:
case CallingConv::ARM_APCS:
+ case CallingConv::Swift:
break;
}
@@ -3019,6 +3018,8 @@ bool ARMFastISel::fastLowerArguments() {
if (F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) ||
F->getAttributes().hasAttribute(Idx, Attribute::ByVal))
return false;
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index c5990bb7d1fb..e8c9f610ea64 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -98,35 +98,32 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects();
}
-static bool isCSRestore(MachineInstr *MI,
- const ARMBaseInstrInfo &TII,
+static bool isCSRestore(MachineInstr &MI, const ARMBaseInstrInfo &TII,
const MCPhysReg *CSRegs) {
// Integer spill area is handled with "pop".
- if (isPopOpcode(MI->getOpcode())) {
+ if (isPopOpcode(MI.getOpcode())) {
// The first two operands are predicates. The last two are
// imp-def and imp-use of SP. Check everything in between.
- for (int i = 5, e = MI->getNumOperands(); i != e; ++i)
- if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs))
+ for (int i = 5, e = MI.getNumOperands(); i != e; ++i)
+ if (!isCalleeSavedRegister(MI.getOperand(i).getReg(), CSRegs))
return false;
return true;
}
- if ((MI->getOpcode() == ARM::LDR_POST_IMM ||
- MI->getOpcode() == ARM::LDR_POST_REG ||
- MI->getOpcode() == ARM::t2LDR_POST) &&
- isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) &&
- MI->getOperand(1).getReg() == ARM::SP)
+ if ((MI.getOpcode() == ARM::LDR_POST_IMM ||
+ MI.getOpcode() == ARM::LDR_POST_REG ||
+ MI.getOpcode() == ARM::t2LDR_POST) &&
+ isCalleeSavedRegister(MI.getOperand(0).getReg(), CSRegs) &&
+ MI.getOperand(1).getReg() == ARM::SP)
return true;
return false;
}
-static void emitRegPlusImmediate(bool isARM, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, DebugLoc dl,
- const ARMBaseInstrInfo &TII, unsigned DestReg,
- unsigned SrcReg, int NumBytes,
- unsigned MIFlags = MachineInstr::NoFlags,
- ARMCC::CondCodes Pred = ARMCC::AL,
- unsigned PredReg = 0) {
+static void emitRegPlusImmediate(
+ bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, const ARMBaseInstrInfo &TII, unsigned DestReg,
+ unsigned SrcReg, int NumBytes, unsigned MIFlags = MachineInstr::NoFlags,
+ ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
if (isARM)
emitARMRegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
Pred, PredReg, TII, MIFlags);
@@ -136,7 +133,7 @@ static void emitRegPlusImmediate(bool isARM, MachineBasicBlock &MBB,
}
static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+ MachineBasicBlock::iterator &MBBI, const DebugLoc &dl,
const ARMBaseInstrInfo &TII, int NumBytes,
unsigned MIFlags = MachineInstr::NoFlags,
ARMCC::CondCodes Pred = ARMCC::AL,
@@ -145,9 +142,9 @@ static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB,
MIFlags, Pred, PredReg);
}
-static int sizeOfSPAdjustment(const MachineInstr *MI) {
+static int sizeOfSPAdjustment(const MachineInstr &MI) {
int RegSize;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case ARM::VSTMDDB_UPD:
RegSize = 8;
break;
@@ -165,7 +162,7 @@ static int sizeOfSPAdjustment(const MachineInstr *MI) {
int count = 0;
// ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
// pred) so the list starts at 4.
- for (int i = MI->getNumOperands() - 1; i >= 4; --i)
+ for (int i = MI.getNumOperands() - 1; i >= 4; --i)
count += RegSize;
return count;
}
@@ -206,7 +203,8 @@ struct StackAdjustingInsts {
}
void emitDefCFAOffsets(MachineModuleInfo &MMI, MachineBasicBlock &MBB,
- DebugLoc dl, const ARMBaseInstrInfo &TII, bool HasFP) {
+ const DebugLoc &dl, const ARMBaseInstrInfo &TII,
+ bool HasFP) {
unsigned CFAOffset = 0;
for (auto &Info : Insts) {
if (HasFP && !Info.BeforeFPSet)
@@ -235,7 +233,7 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
const TargetInstrInfo &TII,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- DebugLoc DL, const unsigned Reg,
+ const DebugLoc &DL, const unsigned Reg,
const unsigned Alignment,
const bool MustBeSingleInstruction) {
const ARMSubtarget &AST =
@@ -355,7 +353,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
case ARM::R10:
case ARM::R11:
case ARM::R12:
- if (STI.isTargetDarwin()) {
+ if (STI.splitFramePushPop()) {
GPRCS2Size += 4;
break;
}
@@ -416,7 +414,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// .cfi_offset operations will reflect that.
if (DPRGapSize) {
assert(DPRGapSize == 4 && "unexpected alignment requirements for DPRs");
- if (tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, DPRGapSize))
+ if (tryFoldSPUpdateIntoPushPop(STI, MF, &*LastPush, DPRGapSize))
DefCFAOffsetCandidates.addExtraBytes(LastPush, DPRGapSize);
else {
emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRGapSize,
@@ -430,7 +428,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// Since vpush register list cannot have gaps, there may be multiple vpush
// instructions in the prologue.
while (MBBI->getOpcode() == ARM::VSTMDDB_UPD) {
- DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(MBBI));
+ DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI));
LastPush = MBBI++;
}
}
@@ -485,7 +483,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr),
ARM::SP)
- .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP, RegState::Kill)
.addReg(ARM::R4, RegState::Kill)
.setMIFlags(MachineInstr::FrameSetup)));
NumBytes = 0;
@@ -494,7 +492,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
if (NumBytes) {
// Adjust SP after all the callee-save spills.
if (AFI->getNumAlignedDPRCS2Regs() == 0 &&
- tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes))
+ tryFoldSPUpdateIntoPushPop(STI, MF, &*LastPush, NumBytes))
DefCFAOffsetCandidates.addExtraBytes(LastPush, NumBytes);
else {
emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
@@ -522,7 +520,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// that push.
if (HasFP) {
MachineBasicBlock::iterator AfterPush = std::next(GPRCS1Push);
- unsigned PushSize = sizeOfSPAdjustment(GPRCS1Push);
+ unsigned PushSize = sizeOfSPAdjustment(*GPRCS1Push);
emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush,
dl, TII, FramePtr, ARM::SP,
PushSize + FramePtrOffsetInPush,
@@ -559,7 +557,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
case ARM::R10:
case ARM::R11:
case ARM::R12:
- if (STI.isTargetDarwin())
+ if (STI.splitFramePushPop())
break;
// fallthrough
case ARM::R0:
@@ -592,7 +590,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
case ARM::R10:
case ARM::R11:
case ARM::R12:
- if (STI.isTargetDarwin()) {
+ if (STI.splitFramePushPop()) {
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
unsigned Offset = MFI->getObjectOffset(FI);
unsigned CFIIndex = MMI.addFrameInst(
@@ -727,8 +725,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
if (MBBI != MBB.begin()) {
do {
--MBBI;
- } while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
- if (!isCSRestore(MBBI, TII, CSRegs))
+ } while (MBBI != MBB.begin() && isCSRestore(*MBBI, TII, CSRegs));
+ if (!isCSRestore(*MBBI, TII, CSRegs))
++MBBI;
}
@@ -774,8 +772,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
.addReg(FramePtr));
}
} else if (NumBytes &&
- !tryFoldSPUpdateIntoPushPop(STI, MF, MBBI, NumBytes))
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+ !tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
// Increment past our save areas.
if (AFI->getDPRCalleeSavedAreaSize()) {
@@ -904,33 +902,27 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
unsigned LastReg = 0;
for (; i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
- if (!(Func)(Reg, STI.isTargetDarwin())) continue;
+ if (!(Func)(Reg, STI.splitFramePushPop())) continue;
// D-registers in the aligned area DPRCS2 are NOT spilled here.
if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
continue;
- // Add the callee-saved register as live-in unless it's LR and
- // @llvm.returnaddress is called. If LR is returned for
- // @llvm.returnaddress then it's already added to the function and
- // entry block live-in sets.
- bool isKill = true;
- if (Reg == ARM::LR) {
- if (MF.getFrameInfo()->isReturnAddressTaken() &&
- MF.getRegInfo().isLiveIn(Reg))
- isKill = false;
- }
-
- if (isKill)
+ bool isLiveIn = MF.getRegInfo().isLiveIn(Reg);
+ if (!isLiveIn)
MBB.addLiveIn(Reg);
-
// If NoGap is true, push consecutive registers and then leave the rest
// for other instructions. e.g.
// vpush {d8, d10, d11} -> vpush {d8}, vpush {d10, d11}
if (NoGap && LastReg && LastReg != Reg-1)
break;
LastReg = Reg;
- Regs.push_back(std::make_pair(Reg, isKill));
+ // Do not set a kill flag on values that are also marked as live-in. This
+ // happens with the @llvm.returnaddress intrinsic and with arguments
+ // passed in callee saved registers.
+ // Omitting the kill flags is conservatively correct even if the live-in
+ // is not used after all.
+ Regs.push_back(std::make_pair(Reg, /*isKill=*/!isLiveIn));
}
if (Regs.empty())
@@ -991,7 +983,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
bool DeleteRet = false;
for (; i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
- if (!(Func)(Reg, STI.isTargetDarwin())) continue;
+ if (!(Func)(Reg, STI.splitFramePushPop())) continue;
// The aligned reloads from area DPRCS2 are not inserted here.
if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
@@ -1027,7 +1019,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
for (unsigned i = 0, e = Regs.size(); i < e; ++i)
MIB.addReg(Regs[i], getDefRegState(true));
if (DeleteRet && MI != MBB.end()) {
- MIB.copyImplicitOps(&*MI);
+ MIB.copyImplicitOps(*MI);
MI->eraseFromParent();
}
MI = MIB;
@@ -1367,7 +1359,7 @@ static unsigned GetFunctionSizeInBytes(const MachineFunction &MF,
unsigned FnSize = 0;
for (auto &MBB : MF) {
for (auto &MI : MBB)
- FnSize += TII.GetInstSizeInBytes(&MI);
+ FnSize += TII.GetInstSizeInBytes(MI);
}
return FnSize;
}
@@ -1485,6 +1477,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
bool CS1Spilled = false;
bool LRSpilled = false;
unsigned NumGPRSpills = 0;
+ unsigned NumFPRSpills = 0;
SmallVector<unsigned, 4> UnspilledCS1GPRs;
SmallVector<unsigned, 4> UnspilledCS2GPRs;
const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
@@ -1539,13 +1532,22 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
CanEliminateFrame = false;
}
- if (!ARM::GPRRegClass.contains(Reg))
+ if (!ARM::GPRRegClass.contains(Reg)) {
+ if (Spilled) {
+ if (ARM::SPRRegClass.contains(Reg))
+ NumFPRSpills++;
+ else if (ARM::DPRRegClass.contains(Reg))
+ NumFPRSpills += 2;
+ else if (ARM::QPRRegClass.contains(Reg))
+ NumFPRSpills += 4;
+ }
continue;
+ }
if (Spilled) {
NumGPRSpills++;
- if (!STI.isTargetDarwin()) {
+ if (!STI.splitFramePushPop()) {
if (Reg == ARM::LR)
LRSpilled = true;
CS1Spilled = true;
@@ -1567,7 +1569,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
break;
}
} else {
- if (!STI.isTargetDarwin()) {
+ if (!STI.splitFramePushPop()) {
UnspilledCS1GPRs.push_back(Reg);
continue;
}
@@ -1613,12 +1615,21 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// FIXME: We could add logic to be more precise about negative offsets
// and which instructions will need a scratch register for them. Is it
// worth the effort and added fragility?
- bool BigStack = (RS && (MFI->estimateStackSize(MF) +
- ((hasFP(MF) && AFI->hasStackFrame()) ? 4 : 0) >=
- estimateRSStackSizeLimit(MF, this))) ||
+ unsigned EstimatedStackSize =
+ MFI->estimateStackSize(MF) + 4 * (NumGPRSpills + NumFPRSpills);
+ if (hasFP(MF)) {
+ if (AFI->hasStackFrame())
+ EstimatedStackSize += 4;
+ } else {
+ // If FP is not used, SP will be used to access arguments, so include
+ // the argument size in the stack size estimate.
+ EstimatedStackSize += MF.getInfo<ARMFunctionInfo>()->getArgumentStackSize();
+ }
+ EstimatedStackSize += 16; // For possible paddings.
+
+ bool BigStack = EstimatedStackSize >= estimateRSStackSizeLimit(MF, this) ||
MFI->hasVarSizedObjects() ||
(MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF));
-
bool ExtraCSSpill = false;
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
AFI->setHasStackFrame(true);
@@ -1712,6 +1723,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
} else if (!AFI->isThumb1OnlyFunction()) {
// note: Thumb1 functions spill to R12, not the stack. Reserve a slot
// closest to SP or frame pointer.
+ assert(RS && "Register scavenging not provided");
const TargetRegisterClass *RC = &ARM::GPRRegClass;
RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
RC->getAlignment(),
@@ -1726,19 +1738,18 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
}
-
-void ARMFrameLowering::
-eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
+MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
const ARMBaseInstrInfo &TII =
*static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
if (!hasReservedCallFrame(MF)) {
// If we have alloca, convert as follows:
// ADJCALLSTACKDOWN -> sub, sp, sp, amount
// ADJCALLSTACKUP -> add, sp, sp, amount
- MachineInstr *Old = I;
- DebugLoc dl = Old->getDebugLoc();
- unsigned Amount = Old->getOperand(0).getImm();
+ MachineInstr &Old = *I;
+ DebugLoc dl = Old.getDebugLoc();
+ unsigned Amount = Old.getOperand(0).getImm();
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
@@ -1751,25 +1762,26 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
bool isARM = !AFI->isThumbFunction();
// Replace the pseudo instruction with a new instruction...
- unsigned Opc = Old->getOpcode();
- int PIdx = Old->findFirstPredOperandIdx();
- ARMCC::CondCodes Pred = (PIdx == -1)
- ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(PIdx).getImm();
+ unsigned Opc = Old.getOpcode();
+ int PIdx = Old.findFirstPredOperandIdx();
+ ARMCC::CondCodes Pred =
+ (PIdx == -1) ? ARMCC::AL
+ : (ARMCC::CondCodes)Old.getOperand(PIdx).getImm();
if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
// Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
- unsigned PredReg = Old->getOperand(2).getReg();
+ unsigned PredReg = Old.getOperand(2).getReg();
emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags,
Pred, PredReg);
} else {
// Note: PredReg is operand 3 for ADJCALLSTACKUP.
- unsigned PredReg = Old->getOperand(3).getReg();
+ unsigned PredReg = Old.getOperand(3).getReg();
assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
emitSPUpdate(isARM, MBB, I, dl, TII, Amount, MachineInstr::NoFlags,
Pred, PredReg);
}
}
}
- MBB.erase(I);
+ return MBB.erase(I);
}
/// Get the minimum constant for ARM that is greater than or equal to the
@@ -2162,7 +2174,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
PrevStackMBB->addSuccessor(McrMBB);
-#ifdef XDEBUG
+#ifdef EXPENSIVE_CHECKS
MF.verify();
#endif
}
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 66f4dfb6ef52..21cd78da395c 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -74,7 +74,7 @@ public:
bool(*Func)(unsigned, bool),
unsigned NumAlignedDPRCS2Regs) const;
- void
+ MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index 0157c0a35286..0d904ecb6296 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -50,8 +50,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
// Skip over one non-VFP / NEON instruction.
if (!LastMI->isBarrier() &&
- // On A9, AGU and NEON/FPU are muxed.
- !(TII.getSubtarget().isLikeA9() && LastMI->mayLoadOrStore()) &&
+ !(TII.getSubtarget().hasMuxedUnits() && LastMI->mayLoadOrStore()) &&
(LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
MachineBasicBlock::iterator I = LastMI;
if (I != LastMI->getParent()->begin()) {
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 6e7edbf9fb15..20db3d39bcae 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -29,7 +29,6 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetLowering.h"
@@ -44,11 +43,6 @@ DisableShifterOp("disable-shifter-op", cl::Hidden,
cl::desc("Disable isel of shifter-op"),
cl::init(false));
-static cl::opt<bool>
-CheckVMLxHazard("check-vmlx-hazard", cl::Hidden,
- cl::desc("Check fp vmla / vmls hazard at isel time"),
- cl::init(true));
-
//===--------------------------------------------------------------------===//
/// ARMDAGToDAGISel - ARM specific code to select ARM machine
/// instructions for SelectionDAG operations.
@@ -84,12 +78,11 @@ public:
/// getI32Imm - Return a target constant of type i32 with the specified
/// value.
- inline SDValue getI32Imm(unsigned Imm, SDLoc dl) {
+ inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
}
- SDNode *Select(SDNode *N) override;
-
+ void Select(SDNode *N) override;
bool hasNoVMLxHazardUse(SDNode *N) const;
bool isShifterOpProfitable(const SDValue &Shift,
@@ -200,57 +193,61 @@ public:
#include "ARMGenDAGISel.inc"
private:
- /// SelectARMIndexedLoad - Indexed (pre/post inc/dec) load matching code for
- /// ARM.
- SDNode *SelectARMIndexedLoad(SDNode *N);
- SDNode *SelectT2IndexedLoad(SDNode *N);
+ /// Indexed (pre/post inc/dec) load matching code for ARM.
+ bool tryARMIndexedLoad(SDNode *N);
+ bool tryT1IndexedLoad(SDNode *N);
+ bool tryT2IndexedLoad(SDNode *N);
/// SelectVLD - Select NEON load intrinsics. NumVecs should be
/// 1, 2, 3 or 4. The opcode arrays specify the instructions used for
/// loads of D registers and even subregs and odd subregs of Q registers.
/// For NumVecs <= 2, QOpcodes1 is not used.
- SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *DOpcodes,
- const uint16_t *QOpcodes0, const uint16_t *QOpcodes1);
+ void SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *DOpcodes, const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1);
/// SelectVST - Select NEON store intrinsics. NumVecs should
/// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for
/// stores of D registers and even subregs and odd subregs of Q registers.
/// For NumVecs <= 2, QOpcodes1 is not used.
- SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *DOpcodes,
- const uint16_t *QOpcodes0, const uint16_t *QOpcodes1);
+ void SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *DOpcodes, const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1);
/// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should
/// be 2, 3 or 4. The opcode arrays specify the instructions used for
/// load/store of D registers and Q registers.
- SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad,
- bool isUpdating, unsigned NumVecs,
- const uint16_t *DOpcodes, const uint16_t *QOpcodes);
+ void SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
+ unsigned NumVecs, const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes);
/// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
/// should be 2, 3 or 4. The opcode array specifies the instructions used
/// for loading D registers. (Q registers are not supported.)
- SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *Opcodes);
+ void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *Opcodes);
/// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2,
/// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be
/// generated to force the table registers to be consecutive.
- SDNode *SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc);
+ void SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc);
- /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM.
- SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
+ /// Try to select SBFX/UBFX instructions for ARM.
+ bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
// Select special operations if node forms integer ABS pattern
- SDNode *SelectABSOp(SDNode *N);
+ bool tryABSOp(SDNode *N);
+
+ bool tryReadRegister(SDNode *N);
+ bool tryWriteRegister(SDNode *N);
- SDNode *SelectReadRegister(SDNode *N);
- SDNode *SelectWriteRegister(SDNode *N);
+ bool tryInlineAsm(SDNode *N);
- SDNode *SelectInlineAsm(SDNode *N);
+ void SelectConcatVector(SDNode *N);
- SDNode *SelectConcatVector(SDNode *N);
+ bool trySMLAWSMULW(SDNode *N);
+
+ void SelectCMP_SWAP(SDNode *N);
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
@@ -269,7 +266,7 @@ private:
SDNode *createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
// Get the alignment operand for a NEON VLD or VST instruction.
- SDValue GetVLDSTAlign(SDValue Align, SDLoc dl, unsigned NumVecs,
+ SDValue GetVLDSTAlign(SDValue Align, const SDLoc &dl, unsigned NumVecs,
bool is64BitVector);
/// Returns the number of instructions required to materialize the given
@@ -426,11 +423,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
if (OptLevel == CodeGenOpt::None)
return true;
- if (!CheckVMLxHazard)
- return true;
-
- if (!Subtarget->isCortexA7() && !Subtarget->isCortexA8() &&
- !Subtarget->isCortexA9() && !Subtarget->isSwift())
+ if (!Subtarget->hasVMLxHazards())
return true;
if (!N->hasOneUse())
@@ -484,6 +477,7 @@ unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
if (Subtarget->isThumb()) {
if (Val <= 255) return 1; // MOV
if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
+ if (Val <= 510) return 2; // MOV + ADDi8
if (~Val <= 255) return 2; // MOV + MVN
if (ARM_AM::isThumbImmShiftedVal(Val)) return 2; // MOV + LSL
} else {
@@ -548,11 +542,9 @@ bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
unsigned PowerOfTwo = 0;
SDValue NewMulConst;
if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) {
- BaseReg = SDValue(Select(CurDAG->getNode(ISD::MUL, SDLoc(N), MVT::i32,
- N.getOperand(0), NewMulConst)
- .getNode()),
- 0);
+ HandleSDNode Handle(N);
replaceDAGValue(N.getOperand(1), NewMulConst);
+ BaseReg = Handle.getValue();
Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl,
PowerOfTwo),
SDLoc(N), MVT::i32);
@@ -623,6 +615,7 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
if (N.getOpcode() == ARMISD::Wrapper &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+ N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
Base = N.getOperand(0);
} else
@@ -803,6 +796,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
} else if (N.getOpcode() == ARMISD::Wrapper &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+ N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
Base = N.getOperand(0);
}
@@ -1070,6 +1064,7 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
} else if (N.getOpcode() == ARMISD::Wrapper &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+ N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
Base = N.getOperand(0);
}
@@ -1190,6 +1185,7 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
return false; // We want to select register offset instead
} else if (N.getOpcode() == ARMISD::Wrapper &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+ N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
Base = N.getOperand(0);
} else {
@@ -1297,6 +1293,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
if (N.getOpcode() == ARMISD::Wrapper &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+ N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::TargetConstantPool)
@@ -1468,15 +1465,15 @@ bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base,
//===--------------------------------------------------------------------===//
/// getAL - Returns a ARMCC::AL immediate node.
-static inline SDValue getAL(SelectionDAG *CurDAG, SDLoc dl) {
+static inline SDValue getAL(SelectionDAG *CurDAG, const SDLoc &dl) {
return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, dl, MVT::i32);
}
-SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
+bool ARMDAGToDAGISel::tryARMIndexedLoad(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::MemIndexedMode AM = LD->getAddressingMode();
if (AM == ISD::UNINDEXED)
- return nullptr;
+ return false;
EVT LoadedVT = LD->getMemoryVT();
SDValue Offset, AMOpc;
@@ -1530,26 +1527,53 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
SDValue Base = LD->getBasePtr();
SDValue Ops[]= { Base, AMOpc, getAL(CurDAG, SDLoc(N)),
CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
- MVT::i32, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
+ MVT::i32, MVT::Other, Ops));
+ return true;
} else {
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG, SDLoc(N)),
CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
- MVT::i32, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
+ MVT::i32, MVT::Other, Ops));
+ return true;
}
}
- return nullptr;
+ return false;
+}
+
+bool ARMDAGToDAGISel::tryT1IndexedLoad(SDNode *N) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT LoadedVT = LD->getMemoryVT();
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ if (AM == ISD::UNINDEXED || LD->getExtensionType() != ISD::NON_EXTLOAD ||
+ AM != ISD::POST_INC || LoadedVT.getSimpleVT().SimpleTy != MVT::i32)
+ return false;
+
+ auto *COffs = dyn_cast<ConstantSDNode>(LD->getOffset());
+ if (!COffs || COffs->getZExtValue() != 4)
+ return false;
+
+ // A T1 post-indexed load is just a single register LDM: LDM r0!, {r1}.
+ // The encoding of LDM is not how the rest of ISel expects a post-inc load to
+ // look however, so we use a pseudo here and switch it for a tLDMIA_UPD after
+ // ISel.
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[]= { Base, getAL(CurDAG, SDLoc(N)),
+ CurDAG->getRegister(0, MVT::i32), Chain };
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::tLDR_postidx, SDLoc(N), MVT::i32, MVT::i32,
+ MVT::Other, Ops));
+ return true;
}
-SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
+bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::MemIndexedMode AM = LD->getAddressingMode();
if (AM == ISD::UNINDEXED)
- return nullptr;
+ return false;
EVT LoadedVT = LD->getMemoryVT();
bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
@@ -1576,7 +1600,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
Opcode = isPre ? ARM::t2LDRB_PRE : ARM::t2LDRB_POST;
break;
default:
- return nullptr;
+ return false;
}
Match = true;
}
@@ -1586,11 +1610,12 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
SDValue Base = LD->getBasePtr();
SDValue Ops[]= { Base, Offset, getAL(CurDAG, SDLoc(N)),
CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
- MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
+ MVT::Other, Ops));
+ return true;
}
- return nullptr;
+ return false;
}
/// \brief Form a GPRPair pseudo register from a pair of GPR regs.
@@ -1685,7 +1710,7 @@ SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1,
/// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand
/// of a NEON VLD or VST instruction. The supported values depend on the
/// number of registers being loaded.
-SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, SDLoc dl,
+SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, const SDLoc &dl,
unsigned NumVecs, bool is64BitVector) {
unsigned NumRegs = NumVecs;
if (!is64BitVector && NumVecs < 3)
@@ -1806,17 +1831,17 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
return Opc; // If not one we handle, return it unchanged.
}
-SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *DOpcodes,
- const uint16_t *QOpcodes0,
- const uint16_t *QOpcodes1) {
+void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1) {
assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
SDLoc dl(N);
SDValue MemAddr, Align;
unsigned AddrOpIdx = isUpdating ? 1 : 2;
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
- return nullptr;
+ return;
SDValue Chain = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -1922,13 +1947,16 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
- if (NumVecs == 1)
- return VLd;
+ if (NumVecs == 1) {
+ ReplaceNode(N, VLd);
+ return;
+ }
// Extract out the subregisters.
SDValue SuperReg = SDValue(VLd, 0);
- assert(ARM::dsub_7 == ARM::dsub_0+7 &&
- ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
+ static_assert(ARM::dsub_7 == ARM::dsub_0 + 7 &&
+ ARM::qsub_3 == ARM::qsub_0 + 3,
+ "Unexpected subreg numbering");
unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0);
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
ReplaceUses(SDValue(N, Vec),
@@ -1936,13 +1964,13 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
if (isUpdating)
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *DOpcodes,
- const uint16_t *QOpcodes0,
- const uint16_t *QOpcodes1) {
+void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1) {
assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
SDLoc dl(N);
@@ -1950,7 +1978,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
unsigned AddrOpIdx = isUpdating ? 1 : 2;
unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
- return nullptr;
+ return;
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
@@ -2042,7 +2070,8 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
// Transfer memoperands.
cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
- return VSt;
+ ReplaceNode(N, VSt);
+ return;
}
// Otherwise, quad registers are stored with two separate instructions,
@@ -2083,13 +2112,13 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
Ops);
cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1);
- return VStB;
+ ReplaceNode(N, VStB);
}
-SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
- bool isUpdating, unsigned NumVecs,
- const uint16_t *DOpcodes,
- const uint16_t *QOpcodes) {
+void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
+ unsigned NumVecs,
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes) {
assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
SDLoc dl(N);
@@ -2097,7 +2126,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
unsigned AddrOpIdx = isUpdating ? 1 : 2;
unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
- return nullptr;
+ return;
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
@@ -2188,13 +2217,16 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
QOpcodes[OpcodeIndex]);
SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
- if (!IsLoad)
- return VLdLn;
+ if (!IsLoad) {
+ ReplaceNode(N, VLdLn);
+ return;
+ }
// Extract the subregisters.
SuperReg = SDValue(VLdLn, 0);
- assert(ARM::dsub_7 == ARM::dsub_0+7 &&
- ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
+ static_assert(ARM::dsub_7 == ARM::dsub_0 + 7 &&
+ ARM::qsub_3 == ARM::qsub_0 + 3,
+ "Unexpected subreg numbering");
unsigned Sub0 = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
ReplaceUses(SDValue(N, Vec),
@@ -2202,18 +2234,17 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
if (isUpdating)
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
- unsigned NumVecs,
- const uint16_t *Opcodes) {
+void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *Opcodes) {
assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
SDLoc dl(N);
SDValue MemAddr, Align;
if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
- return nullptr;
+ return;
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
@@ -2277,7 +2308,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
SuperReg = SDValue(VLdDup, 0);
// Extract the subregisters.
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+ static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
unsigned SubIdx = ARM::dsub_0;
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
ReplaceUses(SDValue(N, Vec),
@@ -2285,11 +2316,11 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
if (isUpdating)
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
- unsigned Opc) {
+void ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
+ unsigned Opc) {
assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range");
SDLoc dl(N);
EVT VT = N->getValueType(0);
@@ -2318,13 +2349,12 @@ SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
Ops.push_back(N->getOperand(FirstTblReg + NumVecs));
Ops.push_back(getAL(CurDAG, dl)); // predicate
Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register
- return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
}
-SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
- bool isSigned) {
+bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
if (!Subtarget->hasV6T2Ops())
- return nullptr;
+ return false;
unsigned Opc = isSigned
? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX)
@@ -2338,7 +2368,7 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
// The immediate is a mask of the low bits iff imm & (imm+1) == 0
if (And_imm & (And_imm + 1))
- return nullptr;
+ return false;
unsigned Srl_imm = 0;
if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL,
@@ -2358,7 +2388,8 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, dl, MVT::i32),
getAL(CurDAG, dl), Reg0, Reg0 };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
}
// ARM models shift instructions as MOVsi with shifter operand.
@@ -2368,17 +2399,19 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
MVT::i32);
SDValue Ops[] = { N->getOperand(0).getOperand(0), ShOpc,
getAL(CurDAG, dl), Reg0, Reg0 };
- return CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops);
+ return true;
}
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, dl, MVT::i32),
CurDAG->getTargetConstant(Width, dl, MVT::i32),
getAL(CurDAG, dl), Reg0 };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
}
}
- return nullptr;
+ return false;
}
// Otherwise, we're looking for a shift of a shift
@@ -2392,13 +2425,35 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
unsigned Width = 32 - Srl_imm - 1;
int LSB = Srl_imm - Shl_imm;
if (LSB < 0)
- return nullptr;
+ return false;
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, dl, MVT::i32),
CurDAG->getTargetConstant(Width, dl, MVT::i32),
getAL(CurDAG, dl), Reg0 };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
+ }
+ }
+
+ // Or we are looking for a shift of an and, with a mask operand
+ if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_imm) &&
+ isShiftedMask_32(And_imm)) {
+ unsigned Srl_imm = 0;
+ unsigned LSB = countTrailingZeros(And_imm);
+ // Shift must be the same as the ands lsb
+ if (isInt32Immediate(N->getOperand(1), Srl_imm) && Srl_imm == LSB) {
+ assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+ unsigned MSB = 31 - countLeadingZeros(And_imm);
+ // Note: The width operand is encoded as width-1.
+ unsigned Width = MSB - LSB;
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { N->getOperand(0).getOperand(0),
+ CurDAG->getTargetConstant(Srl_imm, dl, MVT::i32),
+ CurDAG->getTargetConstant(Width, dl, MVT::i32),
+ getAL(CurDAG, dl), Reg0 };
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
}
}
@@ -2407,20 +2462,21 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
unsigned LSB = 0;
if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, LSB) &&
!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRA, LSB))
- return nullptr;
+ return false;
if (LSB + Width > 32)
- return nullptr;
+ return false;
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, dl, MVT::i32),
CurDAG->getTargetConstant(Width - 1, dl, MVT::i32),
getAL(CurDAG, dl), Reg0 };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
}
- return nullptr;
+ return false;
}
/// Target-specific DAG combining for ISD::XOR.
@@ -2433,16 +2489,16 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
/// Y = sra (X, size(X)-1); xor (add (X, Y), Y)
/// ARM instruction selection detects the latter and matches it to
/// ARM::ABS or ARM::t2ABS machine node.
-SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
+bool ARMDAGToDAGISel::tryABSOp(SDNode *N){
SDValue XORSrc0 = N->getOperand(0);
SDValue XORSrc1 = N->getOperand(1);
EVT VT = N->getValueType(0);
if (Subtarget->isThumb1Only())
- return nullptr;
+ return false;
if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA)
- return nullptr;
+ return false;
SDValue ADDSrc0 = XORSrc0.getOperand(0);
SDValue ADDSrc1 = XORSrc0.getOperand(1);
@@ -2456,57 +2512,214 @@ SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
XType.isInteger() && SRAConstant != nullptr &&
Size == SRAConstant->getZExtValue()) {
unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS;
- return CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
+ CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
+ return true;
+ }
+
+ return false;
+}
+
+static bool SearchSignedMulShort(SDValue SignExt, unsigned *Opc, SDValue &Src1,
+ bool Accumulate) {
+ // For SM*WB, we need to some form of sext.
+ // For SM*WT, we need to search for (sra X, 16)
+ // Src1 then gets set to X.
+ if ((SignExt.getOpcode() == ISD::SIGN_EXTEND ||
+ SignExt.getOpcode() == ISD::SIGN_EXTEND_INREG ||
+ SignExt.getOpcode() == ISD::AssertSext) &&
+ SignExt.getValueType() == MVT::i32) {
+
+ *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
+ Src1 = SignExt.getOperand(0);
+ return true;
}
- return nullptr;
+ if (SignExt.getOpcode() != ISD::SRA)
+ return false;
+
+ ConstantSDNode *SRASrc1 = dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
+ if (!SRASrc1 || SRASrc1->getZExtValue() != 16)
+ return false;
+
+ SDValue Op0 = SignExt.getOperand(0);
+
+ // The sign extend operand for SM*WB could be generated by a shl and ashr.
+ if (Op0.getOpcode() == ISD::SHL) {
+ SDValue SHL = Op0;
+ ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+ if (!SHLSrc1 || SHLSrc1->getZExtValue() != 16)
+ return false;
+
+ *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
+ Src1 = Op0.getOperand(0);
+ return true;
+ }
+ *Opc = Accumulate ? ARM::SMLAWT : ARM::SMULWT;
+ Src1 = SignExt.getOperand(0);
+ return true;
}
-SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
+static bool SearchSignedMulLong(SDValue OR, unsigned *Opc, SDValue &Src0,
+ SDValue &Src1, bool Accumulate) {
+ // First we look for:
+ // (add (or (srl ?, 16), (shl ?, 16)))
+ if (OR.getOpcode() != ISD::OR)
+ return false;
+
+ SDValue SRL = OR.getOperand(0);
+ SDValue SHL = OR.getOperand(1);
+
+ if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
+ SRL = OR.getOperand(1);
+ SHL = OR.getOperand(0);
+ if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL)
+ return false;
+ }
+
+ ConstantSDNode *SRLSrc1 = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
+ ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+ if (!SRLSrc1 || !SHLSrc1 || SRLSrc1->getZExtValue() != 16 ||
+ SHLSrc1->getZExtValue() != 16)
+ return false;
+
+ // The first operands to the shifts need to be the two results from the
+ // same smul_lohi node.
+ if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
+ SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
+ return false;
+
+ SDNode *SMULLOHI = SRL.getOperand(0).getNode();
+ if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
+ SHL.getOperand(0) != SDValue(SMULLOHI, 1))
+ return false;
+
+ // Now we have:
+ // (add (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
+ // For SMLAW[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
+ // For SMLAWB the 16-bit value will signed extended somehow.
+ // For SMLAWT only the SRA is required.
+
+ // Check both sides of SMUL_LOHI
+ if (SearchSignedMulShort(SMULLOHI->getOperand(0), Opc, Src1, Accumulate)) {
+ Src0 = SMULLOHI->getOperand(1);
+ } else if (SearchSignedMulShort(SMULLOHI->getOperand(1), Opc, Src1,
+ Accumulate)) {
+ Src0 = SMULLOHI->getOperand(0);
+ } else {
+ return false;
+ }
+ return true;
+}
+
+bool ARMDAGToDAGISel::trySMLAWSMULW(SDNode *N) {
+ SDLoc dl(N);
+ SDValue Src0 = N->getOperand(0);
+ SDValue Src1 = N->getOperand(1);
+ SDValue A, B;
+ unsigned Opc = 0;
+
+ if (N->getOpcode() == ISD::ADD) {
+ if (Src0.getOpcode() != ISD::OR && Src1.getOpcode() != ISD::OR)
+ return false;
+
+ SDValue Acc;
+ if (SearchSignedMulLong(Src0, &Opc, A, B, true)) {
+ Acc = Src1;
+ } else if (SearchSignedMulLong(Src1, &Opc, A, B, true)) {
+ Acc = Src0;
+ } else {
+ return false;
+ }
+ if (Opc == 0)
+ return false;
+
+ SDValue Ops[] = { A, B, Acc, getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32) };
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, MVT::Other, Ops);
+ return true;
+ } else if (N->getOpcode() == ISD::OR &&
+ SearchSignedMulLong(SDValue(N, 0), &Opc, A, B, false)) {
+ if (Opc == 0)
+ return false;
+
+ SDValue Ops[] = { A, B, getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32)};
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
+ }
+ return false;
+}
+
+/// We've got special pseudo-instructions for these
+void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
+ unsigned Opcode;
+ EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
+ if (MemTy == MVT::i8)
+ Opcode = ARM::CMP_SWAP_8;
+ else if (MemTy == MVT::i16)
+ Opcode = ARM::CMP_SWAP_16;
+ else if (MemTy == MVT::i32)
+ Opcode = ARM::CMP_SWAP_32;
+ else
+ llvm_unreachable("Unknown AtomicCmpSwap type");
+
+ SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
+ N->getOperand(0)};
+ SDNode *CmpSwap = CurDAG->getMachineNode(
+ Opcode, SDLoc(N),
+ CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other), Ops);
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+ ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
+ ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
+ CurDAG->RemoveDeadNode(N);
+}
+
+void ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
// The only time a CONCAT_VECTORS operation can have legal types is when
// two 64-bit vectors are concatenated to a 128-bit vector.
EVT VT = N->getValueType(0);
if (!VT.is128BitVector() || N->getNumOperands() != 2)
llvm_unreachable("unexpected CONCAT_VECTORS");
- return createDRegPairNode(VT, N->getOperand(0), N->getOperand(1));
+ ReplaceNode(N, createDRegPairNode(VT, N->getOperand(0), N->getOperand(1)));
}
-SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
+void ARMDAGToDAGISel::Select(SDNode *N) {
SDLoc dl(N);
if (N->isMachineOpcode()) {
N->setNodeId(-1);
- return nullptr; // Already selected.
+ return; // Already selected.
}
switch (N->getOpcode()) {
default: break;
- case ISD::WRITE_REGISTER: {
- SDNode *ResNode = SelectWriteRegister(N);
- if (ResNode)
- return ResNode;
+ case ISD::ADD:
+ case ISD::OR:
+ if (trySMLAWSMULW(N))
+ return;
break;
- }
- case ISD::READ_REGISTER: {
- SDNode *ResNode = SelectReadRegister(N);
- if (ResNode)
- return ResNode;
+ case ISD::WRITE_REGISTER:
+ if (tryWriteRegister(N))
+ return;
break;
- }
- case ISD::INLINEASM: {
- SDNode *ResNode = SelectInlineAsm(N);
- if (ResNode)
- return ResNode;
+ case ISD::READ_REGISTER:
+ if (tryReadRegister(N))
+ return;
break;
- }
- case ISD::XOR: {
+ case ISD::INLINEASM:
+ if (tryInlineAsm(N))
+ return;
+ break;
+ case ISD::XOR:
// Select special operations if XOR node forms integer ABS pattern
- SDNode *ResNode = SelectABSOp(N);
- if (ResNode)
- return ResNode;
+ if (tryABSOp(N))
+ return;
// Other cases are autogenerated.
break;
- }
case ISD::Constant: {
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
// If we can't materialize the constant we need to use a literal pool
@@ -2530,11 +2743,11 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
CurDAG->getRegister(0, MVT::i32),
CurDAG->getEntryNode()
};
- ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
- Ops);
+ ResNode = CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
+ Ops);
}
- ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0));
- return nullptr;
+ ReplaceNode(N, ResNode);
+ return;
}
// Other cases are autogenerated.
@@ -2551,25 +2764,27 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
MachineFrameInfo *MFI = MF->getFrameInfo();
if (MFI->getObjectAlignment(FI) < 4)
MFI->setObjectAlignment(FI, 4);
- return CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI,
- CurDAG->getTargetConstant(0, dl, MVT::i32));
+ CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI,
+ CurDAG->getTargetConstant(0, dl, MVT::i32));
+ return;
} else {
unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ?
ARM::t2ADDri : ARM::ADDri);
SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, dl, MVT::i32),
getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return;
}
}
case ISD::SRL:
- if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false))
- return I;
+ if (tryV6T2BitfieldExtractOp(N, false))
+ return;
break;
case ISD::SIGN_EXTEND_INREG:
case ISD::SRA:
- if (SDNode *I = SelectV6T2BitfieldExtractOp(N, true))
- return I;
+ if (tryV6T2BitfieldExtractOp(N, true))
+ return;
break;
case ISD::MUL:
if (Subtarget->isThumb1Only())
@@ -2587,11 +2802,13 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
if (Subtarget->isThumb()) {
SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
- return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops);
+ return;
} else {
SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0,
Reg0 };
- return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops);
+ return;
}
}
if (isPowerOf2_32(RHSV+1)) { // 2^n-1?
@@ -2604,19 +2821,63 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
if (Subtarget->isThumb()) {
SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
- return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops);
+ return;
} else {
SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0,
Reg0 };
- return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops);
+ return;
}
}
}
break;
case ISD::AND: {
// Check for unsigned bitfield extract
- if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false))
- return I;
+ if (tryV6T2BitfieldExtractOp(N, false))
+ return;
+
+ // If an immediate is used in an AND node, it is possible that the immediate
+ // can be more optimally materialized when negated. If this is the case we
+ // can negate the immediate and use a BIC instead.
+ auto *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (N1C && N1C->hasOneUse() && Subtarget->isThumb()) {
+ uint32_t Imm = (uint32_t) N1C->getZExtValue();
+
+ // In Thumb2 mode, an AND can take a 12-bit immediate. If this
+ // immediate can be negated and fit in the immediate operand of
+ // a t2BIC, don't do any manual transform here as this can be
+ // handled by the generic ISel machinery.
+ bool PreferImmediateEncoding =
+ Subtarget->hasThumb2() && (is_t2_so_imm(Imm) || is_t2_so_imm_not(Imm));
+ if (!PreferImmediateEncoding &&
+ ConstantMaterializationCost(Imm) >
+ ConstantMaterializationCost(~Imm)) {
+ // The current immediate costs more to materialize than a negated
+ // immediate, so negate the immediate and use a BIC.
+ SDValue NewImm =
+ CurDAG->getConstant(~N1C->getZExtValue(), dl, MVT::i32);
+ // If the new constant didn't exist before, reposition it in the topological
+ // ordering so it is just before N. Otherwise, don't touch its location.
+ if (NewImm->getNodeId() == -1)
+ CurDAG->RepositionNode(N->getIterator(), NewImm.getNode());
+
+ if (!Subtarget->hasThumb2()) {
+ SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32),
+ N->getOperand(0), NewImm, getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32)};
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::tBIC, dl, MVT::i32, Ops));
+ return;
+ } else {
+ SDValue Ops[] = {N->getOperand(0), NewImm, getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32)};
+ ReplaceNode(N,
+ CurDAG->getMachineNode(ARM::t2BICrr, dl, MVT::i32, Ops));
+ return;
+ }
+ }
+ }
// (and (or x, c2), c1) and top 16-bits of c1 and c2 match, lower 16-bits
// of c1 are 0xffff, and lower 16-bit of c2 are 0. That is, the top 16-bits
@@ -2632,7 +2893,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
if (!Opc)
break;
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
- ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ N1C = dyn_cast<ConstantSDNode>(N1);
if (!N1C)
break;
if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) {
@@ -2649,29 +2910,34 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
dl, MVT::i32);
SDValue Ops[] = { N0.getOperand(0), Imm16,
getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
+ return;
}
}
break;
}
case ARMISD::VMOVRRD:
- return CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
- N->getOperand(0), getAL(CurDAG, dl),
- CurDAG->getRegister(0, MVT::i32));
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
+ N->getOperand(0), getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32)));
+ return;
case ISD::UMUL_LOHI: {
if (Subtarget->isThumb1Only())
break;
if (Subtarget->isThumb()) {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops));
+ return;
} else {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
- ARM::UMULL : ARM::UMULLv5,
- dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(
+ Subtarget->hasV6Ops() ? ARM::UMULL : ARM::UMULLv5, dl,
+ MVT::i32, MVT::i32, Ops));
+ return;
}
}
case ISD::SMUL_LOHI: {
@@ -2680,30 +2946,76 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
if (Subtarget->isThumb()) {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops));
+ return;
} else {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
- ARM::SMULL : ARM::SMULLv5,
- dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(
+ Subtarget->hasV6Ops() ? ARM::SMULL : ARM::SMULLv5, dl,
+ MVT::i32, MVT::i32, Ops));
+ return;
}
}
+ case ARMISD::UMAAL: {
+ unsigned Opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ N->getOperand(2), N->getOperand(3),
+ getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32) };
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, MVT::i32, MVT::i32, Ops));
+ return;
+ }
case ARMISD::UMLAL:{
+ // UMAAL is similar to UMLAL but it adds two 32-bit values to the
+ // 64-bit multiplication result.
+ if (Subtarget->hasV6Ops() && N->getOperand(2).getOpcode() == ARMISD::ADDC &&
+ N->getOperand(3).getOpcode() == ARMISD::ADDE) {
+
+ SDValue Addc = N->getOperand(2);
+ SDValue Adde = N->getOperand(3);
+
+ if (Adde.getOperand(2).getNode() == Addc.getNode()) {
+
+ ConstantSDNode *Op0 = dyn_cast<ConstantSDNode>(Adde.getOperand(0));
+ ConstantSDNode *Op1 = dyn_cast<ConstantSDNode>(Adde.getOperand(1));
+
+ if (Op0 && Op1 && Op0->getZExtValue() == 0 && Op1->getZExtValue() == 0)
+ {
+ // Select UMAAL instead: UMAAL RdLo, RdHi, Rn, Rm
+ // RdLo = one operand to be added, lower 32-bits of res
+ // RdHi = other operand to be added, upper 32-bits of res
+ // Rn = first multiply operand
+ // Rm = second multiply operand
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ Addc.getOperand(0), Addc.getOperand(1),
+ getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32) };
+ unsigned opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
+ CurDAG->SelectNodeTo(N, opc, MVT::i32, MVT::i32, Ops);
+ return;
+ }
+ }
+ }
+
if (Subtarget->isThumb()) {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), getAL(CurDAG, dl),
CurDAG->getRegister(0, MVT::i32)};
- return CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops));
+ return;
}else{
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), getAL(CurDAG, dl),
CurDAG->getRegister(0, MVT::i32),
CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
- ARM::UMLAL : ARM::UMLALv5,
- dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(
+ Subtarget->hasV6Ops() ? ARM::UMLAL : ARM::UMLALv5, dl,
+ MVT::i32, MVT::i32, Ops));
+ return;
}
}
case ARMISD::SMLAL:{
@@ -2711,25 +3023,29 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), getAL(CurDAG, dl),
CurDAG->getRegister(0, MVT::i32)};
- return CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops));
+ return;
}else{
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), getAL(CurDAG, dl),
CurDAG->getRegister(0, MVT::i32),
CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
- ARM::SMLAL : ARM::SMLALv5,
- dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(
+ Subtarget->hasV6Ops() ? ARM::SMLAL : ARM::SMLALv5, dl,
+ MVT::i32, MVT::i32, Ops));
+ return;
}
}
case ISD::LOAD: {
- SDNode *ResNode = nullptr;
- if (Subtarget->isThumb() && Subtarget->hasThumb2())
- ResNode = SelectT2IndexedLoad(N);
- else
- ResNode = SelectARMIndexedLoad(N);
- if (ResNode)
- return ResNode;
+ if (Subtarget->isThumb() && Subtarget->hasThumb2()) {
+ if (tryT2IndexedLoad(N))
+ return;
+ } else if (Subtarget->isThumb()) {
+ if (tryT1IndexedLoad(N))
+ return;
+ } else if (tryARMIndexedLoad(N))
+ return;
// Other cases are autogenerated.
break;
}
@@ -2770,13 +3086,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
}
ReplaceUses(SDValue(N, 0),
SDValue(Chain.getNode(), Chain.getResNo()));
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
+ return;
}
case ARMISD::VZIP: {
unsigned Opc = 0;
EVT VT = N->getValueType(0);
switch (VT.getSimpleVT().SimpleTy) {
- default: return nullptr;
+ default: return;
case MVT::v8i8: Opc = ARM::VZIPd8; break;
case MVT::v4i16: Opc = ARM::VZIPd16; break;
case MVT::v2f32:
@@ -2790,13 +3107,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue Pred = getAL(CurDAG, dl);
SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
- return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+ return;
}
case ARMISD::VUZP: {
unsigned Opc = 0;
EVT VT = N->getValueType(0);
switch (VT.getSimpleVT().SimpleTy) {
- default: return nullptr;
+ default: return;
case MVT::v8i8: Opc = ARM::VUZPd8; break;
case MVT::v4i16: Opc = ARM::VUZPd16; break;
case MVT::v2f32:
@@ -2810,13 +3128,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue Pred = getAL(CurDAG, dl);
SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
- return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+ return;
}
case ARMISD::VTRN: {
unsigned Opc = 0;
EVT VT = N->getValueType(0);
switch (VT.getSimpleVT().SimpleTy) {
- default: return nullptr;
+ default: return;
case MVT::v8i8: Opc = ARM::VTRNd8; break;
case MVT::v4i16: Opc = ARM::VTRNd16; break;
case MVT::v2f32:
@@ -2829,7 +3148,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue Pred = getAL(CurDAG, dl);
SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
- return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+ return;
}
case ARMISD::BUILD_VECTOR: {
EVT VecVT = N->getValueType(0);
@@ -2837,55 +3157,68 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
unsigned NumElts = VecVT.getVectorNumElements();
if (EltVT == MVT::f64) {
assert(NumElts == 2 && "unexpected type for BUILD_VECTOR");
- return createDRegPairNode(VecVT, N->getOperand(0), N->getOperand(1));
+ ReplaceNode(
+ N, createDRegPairNode(VecVT, N->getOperand(0), N->getOperand(1)));
+ return;
}
assert(EltVT == MVT::f32 && "unexpected type for BUILD_VECTOR");
- if (NumElts == 2)
- return createSRegPairNode(VecVT, N->getOperand(0), N->getOperand(1));
+ if (NumElts == 2) {
+ ReplaceNode(
+ N, createSRegPairNode(VecVT, N->getOperand(0), N->getOperand(1)));
+ return;
+ }
assert(NumElts == 4 && "unexpected type for BUILD_VECTOR");
- return createQuadSRegsNode(VecVT, N->getOperand(0), N->getOperand(1),
- N->getOperand(2), N->getOperand(3));
+ ReplaceNode(N,
+ createQuadSRegsNode(VecVT, N->getOperand(0), N->getOperand(1),
+ N->getOperand(2), N->getOperand(3)));
+ return;
}
case ARMISD::VLD2DUP: {
static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
ARM::VLD2DUPd32 };
- return SelectVLDDup(N, false, 2, Opcodes);
+ SelectVLDDup(N, false, 2, Opcodes);
+ return;
}
case ARMISD::VLD3DUP: {
static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo,
ARM::VLD3DUPd16Pseudo,
ARM::VLD3DUPd32Pseudo };
- return SelectVLDDup(N, false, 3, Opcodes);
+ SelectVLDDup(N, false, 3, Opcodes);
+ return;
}
case ARMISD::VLD4DUP: {
static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo,
ARM::VLD4DUPd16Pseudo,
ARM::VLD4DUPd32Pseudo };
- return SelectVLDDup(N, false, 4, Opcodes);
+ SelectVLDDup(N, false, 4, Opcodes);
+ return;
}
case ARMISD::VLD2DUP_UPD: {
static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
ARM::VLD2DUPd16wb_fixed,
ARM::VLD2DUPd32wb_fixed };
- return SelectVLDDup(N, true, 2, Opcodes);
+ SelectVLDDup(N, true, 2, Opcodes);
+ return;
}
case ARMISD::VLD3DUP_UPD: {
static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
ARM::VLD3DUPd16Pseudo_UPD,
ARM::VLD3DUPd32Pseudo_UPD };
- return SelectVLDDup(N, true, 3, Opcodes);
+ SelectVLDDup(N, true, 3, Opcodes);
+ return;
}
case ARMISD::VLD4DUP_UPD: {
static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
ARM::VLD4DUPd16Pseudo_UPD,
ARM::VLD4DUPd32Pseudo_UPD };
- return SelectVLDDup(N, true, 4, Opcodes);
+ SelectVLDDup(N, true, 4, Opcodes);
+ return;
}
case ARMISD::VLD1_UPD: {
@@ -2897,7 +3230,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD1q16wb_fixed,
ARM::VLD1q32wb_fixed,
ARM::VLD1q64wb_fixed };
- return SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr);
+ SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr);
+ return;
}
case ARMISD::VLD2_UPD: {
@@ -2908,7 +3242,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed,
ARM::VLD2q16PseudoWB_fixed,
ARM::VLD2q32PseudoWB_fixed };
- return SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ return;
}
case ARMISD::VLD3_UPD: {
@@ -2922,7 +3257,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD,
ARM::VLD3q16oddPseudo_UPD,
ARM::VLD3q32oddPseudo_UPD };
- return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case ARMISD::VLD4_UPD: {
@@ -2936,7 +3272,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
ARM::VLD4q16oddPseudo_UPD,
ARM::VLD4q32oddPseudo_UPD };
- return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case ARMISD::VLD2LN_UPD: {
@@ -2945,7 +3282,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD2LNd32Pseudo_UPD };
static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD,
ARM::VLD2LNq32Pseudo_UPD };
- return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes);
+ return;
}
case ARMISD::VLD3LN_UPD: {
@@ -2954,7 +3292,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD3LNd32Pseudo_UPD };
static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD,
ARM::VLD3LNq32Pseudo_UPD };
- return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes);
+ return;
}
case ARMISD::VLD4LN_UPD: {
@@ -2963,7 +3302,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD4LNd32Pseudo_UPD };
static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD,
ARM::VLD4LNq32Pseudo_UPD };
- return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes);
+ return;
}
case ARMISD::VST1_UPD: {
@@ -2975,7 +3315,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST1q16wb_fixed,
ARM::VST1q32wb_fixed,
ARM::VST1q64wb_fixed };
- return SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr);
+ SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr);
+ return;
}
case ARMISD::VST2_UPD: {
@@ -2986,7 +3327,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed,
ARM::VST2q16PseudoWB_fixed,
ARM::VST2q32PseudoWB_fixed };
- return SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ return;
}
case ARMISD::VST3_UPD: {
@@ -3000,7 +3342,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD,
ARM::VST3q16oddPseudo_UPD,
ARM::VST3q32oddPseudo_UPD };
- return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case ARMISD::VST4_UPD: {
@@ -3014,7 +3357,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
ARM::VST4q16oddPseudo_UPD,
ARM::VST4q32oddPseudo_UPD };
- return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case ARMISD::VST2LN_UPD: {
@@ -3023,7 +3367,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST2LNd32Pseudo_UPD };
static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD,
ARM::VST2LNq32Pseudo_UPD };
- return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
+ return;
}
case ARMISD::VST3LN_UPD: {
@@ -3032,7 +3377,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST3LNd32Pseudo_UPD };
static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD,
ARM::VST3LNq32Pseudo_UPD };
- return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
+ return;
}
case ARMISD::VST4LN_UPD: {
@@ -3041,7 +3387,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST4LNd32Pseudo_UPD };
static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD,
ARM::VST4LNq32Pseudo_UPD };
- return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
+ return;
}
case ISD::INTRINSIC_VOID:
@@ -3051,12 +3398,44 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
default:
break;
+ case Intrinsic::arm_mrrc:
+ case Intrinsic::arm_mrrc2: {
+ SDLoc dl(N);
+ SDValue Chain = N->getOperand(0);
+ unsigned Opc;
+
+ if (Subtarget->isThumb())
+ Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::t2MRRC : ARM::t2MRRC2);
+ else
+ Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::MRRC : ARM::MRRC2);
+
+ SmallVector<SDValue, 5> Ops;
+ Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(), dl)); /* coproc */
+ Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), dl)); /* opc */
+ Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(4))->getZExtValue(), dl)); /* CRm */
+
+      // The mrrc2 instruction in ARM doesn't allow predicates; the top 4 bits of the
+      // encoded instruction will always be '1111'. It is possible in assembly language
+      // to specify AL as a predicate to mrrc2, but it makes no difference to the encoding.
+ if (Opc != ARM::MRRC2) {
+ Ops.push_back(getAL(CurDAG, dl));
+ Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+ }
+
+ Ops.push_back(Chain);
+
+ // Writes to two registers.
+ const EVT RetType[] = {MVT::i32, MVT::i32, MVT::Other};
+
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, RetType, Ops));
+ return;
+ }
case Intrinsic::arm_ldaexd:
case Intrinsic::arm_ldrexd: {
SDLoc dl(N);
SDValue Chain = N->getOperand(0);
SDValue MemAddr = N->getOperand(2);
- bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
+ bool isThumb = Subtarget->isThumb() && Subtarget->hasV8MBaselineOps();
bool IsAcquire = IntNo == Intrinsic::arm_ldaexd;
unsigned NewOpc = isThumb ? (IsAcquire ? ARM::t2LDAEXD : ARM::t2LDREXD)
@@ -3072,11 +3451,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ResTys.push_back(MVT::Other);
// Place arguments in the right order.
- SmallVector<SDValue, 7> Ops;
- Ops.push_back(MemAddr);
- Ops.push_back(getAL(CurDAG, dl));
- Ops.push_back(CurDAG->getRegister(0, MVT::i32));
- Ops.push_back(Chain);
+ SDValue Ops[] = {MemAddr, getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32), Chain};
SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops);
// Transfer memoperands.
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
@@ -3112,7 +3488,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ReplaceUses(SDValue(N, 1), Result);
}
ReplaceUses(SDValue(N, 2), OutChain);
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
+ return;
}
case Intrinsic::arm_stlexd:
case Intrinsic::arm_strexd: {
@@ -3150,7 +3527,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
- return St;
+ ReplaceNode(N, St);
+ return;
}
case Intrinsic::arm_neon_vld1: {
@@ -3158,7 +3536,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD1d32, ARM::VLD1d64 };
static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16,
ARM::VLD1q32, ARM::VLD1q64};
- return SelectVLD(N, false, 1, DOpcodes, QOpcodes, nullptr);
+ SelectVLD(N, false, 1, DOpcodes, QOpcodes, nullptr);
+ return;
}
case Intrinsic::arm_neon_vld2: {
@@ -3166,7 +3545,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD2d32, ARM::VLD1q64 };
static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo,
ARM::VLD2q32Pseudo };
- return SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr);
+ SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr);
+ return;
}
case Intrinsic::arm_neon_vld3: {
@@ -3180,7 +3560,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo,
ARM::VLD3q16oddPseudo,
ARM::VLD3q32oddPseudo };
- return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case Intrinsic::arm_neon_vld4: {
@@ -3194,7 +3575,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo,
ARM::VLD4q16oddPseudo,
ARM::VLD4q32oddPseudo };
- return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case Intrinsic::arm_neon_vld2lane: {
@@ -3203,7 +3585,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD2LNd32Pseudo };
static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo,
ARM::VLD2LNq32Pseudo };
- return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes);
+ return;
}
case Intrinsic::arm_neon_vld3lane: {
@@ -3212,7 +3595,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD3LNd32Pseudo };
static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo,
ARM::VLD3LNq32Pseudo };
- return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes);
+ return;
}
case Intrinsic::arm_neon_vld4lane: {
@@ -3221,7 +3605,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD4LNd32Pseudo };
static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo,
ARM::VLD4LNq32Pseudo };
- return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes);
+ return;
}
case Intrinsic::arm_neon_vst1: {
@@ -3229,15 +3614,17 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST1d32, ARM::VST1d64 };
static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16,
ARM::VST1q32, ARM::VST1q64 };
- return SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr);
+ SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr);
+ return;
}
case Intrinsic::arm_neon_vst2: {
static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
ARM::VST2d32, ARM::VST1q64 };
- static uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
- ARM::VST2q32Pseudo };
- return SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr);
+ static const uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
+ ARM::VST2q32Pseudo };
+ SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr);
+ return;
}
case Intrinsic::arm_neon_vst3: {
@@ -3251,7 +3638,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo,
ARM::VST3q16oddPseudo,
ARM::VST3q32oddPseudo };
- return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case Intrinsic::arm_neon_vst4: {
@@ -3265,7 +3653,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo,
ARM::VST4q16oddPseudo,
ARM::VST4q32oddPseudo };
- return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case Intrinsic::arm_neon_vst2lane: {
@@ -3274,7 +3663,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST2LNd32Pseudo };
static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo,
ARM::VST2LNq32Pseudo };
- return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
+ return;
}
case Intrinsic::arm_neon_vst3lane: {
@@ -3283,7 +3673,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST3LNd32Pseudo };
static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo,
ARM::VST3LNq32Pseudo };
- return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
+ return;
}
case Intrinsic::arm_neon_vst4lane: {
@@ -3292,7 +3683,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST4LNd32Pseudo };
static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo,
ARM::VST4LNq32Pseudo };
- return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
+ return;
}
}
break;
@@ -3305,18 +3697,24 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
break;
case Intrinsic::arm_neon_vtbl2:
- return SelectVTBL(N, false, 2, ARM::VTBL2);
+ SelectVTBL(N, false, 2, ARM::VTBL2);
+ return;
case Intrinsic::arm_neon_vtbl3:
- return SelectVTBL(N, false, 3, ARM::VTBL3Pseudo);
+ SelectVTBL(N, false, 3, ARM::VTBL3Pseudo);
+ return;
case Intrinsic::arm_neon_vtbl4:
- return SelectVTBL(N, false, 4, ARM::VTBL4Pseudo);
+ SelectVTBL(N, false, 4, ARM::VTBL4Pseudo);
+ return;
case Intrinsic::arm_neon_vtbx2:
- return SelectVTBL(N, true, 2, ARM::VTBX2);
+ SelectVTBL(N, true, 2, ARM::VTBX2);
+ return;
case Intrinsic::arm_neon_vtbx3:
- return SelectVTBL(N, true, 3, ARM::VTBX3Pseudo);
+ SelectVTBL(N, true, 3, ARM::VTBX3Pseudo);
+ return;
case Intrinsic::arm_neon_vtbx4:
- return SelectVTBL(N, true, 4, ARM::VTBX4Pseudo);
+ SelectVTBL(N, true, 4, ARM::VTBX4Pseudo);
+ return;
}
break;
}
@@ -3324,13 +3722,11 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
case ARMISD::VTBL1: {
SDLoc dl(N);
EVT VT = N->getValueType(0);
- SmallVector<SDValue, 6> Ops;
-
- Ops.push_back(N->getOperand(0));
- Ops.push_back(N->getOperand(1));
- Ops.push_back(getAL(CurDAG, dl)); // Predicate
- Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
- return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops);
+ SDValue Ops[] = {N->getOperand(0), N->getOperand(1),
+ getAL(CurDAG, dl), // Predicate
+ CurDAG->getRegister(0, MVT::i32)}; // Predicate Register
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops));
+ return;
}
case ARMISD::VTBL2: {
SDLoc dl(N);
@@ -3341,19 +3737,22 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue V1 = N->getOperand(1);
SDValue RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0);
- SmallVector<SDValue, 6> Ops;
- Ops.push_back(RegSeq);
- Ops.push_back(N->getOperand(2));
- Ops.push_back(getAL(CurDAG, dl)); // Predicate
- Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
- return CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops);
+ SDValue Ops[] = {RegSeq, N->getOperand(2), getAL(CurDAG, dl), // Predicate
+ CurDAG->getRegister(0, MVT::i32)}; // Predicate Register
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops));
+ return;
}
case ISD::CONCAT_VECTORS:
- return SelectConcatVector(N);
+ SelectConcatVector(N);
+ return;
+
+ case ISD::ATOMIC_CMP_SWAP:
+ SelectCMP_SWAP(N);
+ return;
}
- return SelectCode(N);
+ SelectCode(N);
}
// Inspect a register string of the form
@@ -3362,8 +3761,9 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
// and obtain the integer operands from them, adding these operands to the
// provided vector.
static void getIntOperandsFromRegisterString(StringRef RegString,
- SelectionDAG *CurDAG, SDLoc DL,
- std::vector<SDValue>& Ops) {
+ SelectionDAG *CurDAG,
+ const SDLoc &DL,
+ std::vector<SDValue> &Ops) {
SmallVector<StringRef, 5> Fields;
RegString.split(Fields, ':');
@@ -3444,6 +3844,9 @@ static inline int getMClassRegisterSYSmValueMask(StringRef RegString) {
.Case("basepri_max", 0x12)
.Case("faultmask", 0x13)
.Case("control", 0x14)
+ .Case("msplim", 0x0a)
+ .Case("psplim", 0x0b)
+ .Case("sp", 0x18)
.Default(-1);
}
@@ -3473,11 +3876,27 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead,
if (!Subtarget->hasV7Ops() && SYSmvalue >= 0x11 && SYSmvalue <= 0x13)
return -1;
+ if (Subtarget->has8MSecExt() && Flags.lower() == "ns") {
+ Flags = "";
+ SYSmvalue |= 0x80;
+ }
+
+ if (!Subtarget->has8MSecExt() &&
+ (SYSmvalue == 0xa || SYSmvalue == 0xb || SYSmvalue > 0x14))
+ return -1;
+
+ if (!Subtarget->hasV8MMainlineOps() &&
+ (SYSmvalue == 0x8a || SYSmvalue == 0x8b || SYSmvalue == 0x91 ||
+ SYSmvalue == 0x93))
+ return -1;
+
// If it was a read then we won't be expecting flags and so at this point
// we can return the mask.
if (IsRead) {
- assert (Flags.empty() && "Unexpected flags for reading M class register.");
- return SYSmvalue;
+ if (Flags.empty())
+ return SYSmvalue;
+ else
+ return -1;
}
// We know we are now handling a write so need to get the mask for the flags.
@@ -3563,7 +3982,7 @@ static int getARClassRegisterMask(StringRef Reg, StringRef Flags) {
// Lower the read_register intrinsic to ARM specific DAG nodes
// using the supplied metadata string to select the instruction node to use
// and the registers/masks to construct as operands for the node.
-SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
+bool ARMDAGToDAGISel::tryReadRegister(SDNode *N){
const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
bool IsThumb2 = Subtarget->isThumb2();
@@ -3592,7 +4011,8 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
Ops.push_back(getAL(CurDAG, DL));
Ops.push_back(CurDAG->getRegister(0, MVT::i32));
Ops.push_back(N->getOperand(0));
- return CurDAG->getMachineNode(Opcode, DL, ResTypes, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, ResTypes, Ops));
+ return true;
}
std::string SpecialReg = RegString->getString().lower();
@@ -3602,8 +4022,10 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32),
getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
N->getOperand(0) };
- return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSbanked : ARM::MRSbanked,
- DL, MVT::i32, MVT::Other, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSbanked : ARM::MRSbanked,
+ DL, MVT::i32, MVT::Other, Ops));
+ return true;
}
// The VFP registers are read by creating SelectionDAG nodes with opcodes
@@ -3623,27 +4045,37 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
// If an opcode was found then we can lower the read to a VFP instruction.
if (Opcode) {
if (!Subtarget->hasVFP2())
- return nullptr;
+ return false;
if (Opcode == ARM::VMRS_MVFR2 && !Subtarget->hasFPARMv8())
- return nullptr;
+ return false;
Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
N->getOperand(0) };
- return CurDAG->getMachineNode(Opcode, DL, MVT::i32, MVT::Other, Ops);
+ ReplaceNode(N,
+ CurDAG->getMachineNode(Opcode, DL, MVT::i32, MVT::Other, Ops));
+ return true;
}
// If the target is M Class then need to validate that the register string
// is an acceptable value, so check that a mask can be constructed from the
// string.
if (Subtarget->isMClass()) {
- int SYSmValue = getMClassRegisterMask(SpecialReg, "", true, Subtarget);
+ StringRef Flags = "", Reg = SpecialReg;
+ if (Reg.endswith("_ns")) {
+ Flags = "ns";
+ Reg = Reg.drop_back(3);
+ }
+
+ int SYSmValue = getMClassRegisterMask(Reg, Flags, true, Subtarget);
if (SYSmValue == -1)
- return nullptr;
+ return false;
SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32),
getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
N->getOperand(0) };
- return CurDAG->getMachineNode(ARM::t2MRS_M, DL, MVT::i32, MVT::Other, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(ARM::t2MRS_M, DL, MVT::i32, MVT::Other, Ops));
+ return true;
}
// Here we know the target is not M Class so we need to check if it is one
@@ -3651,24 +4083,27 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
if (SpecialReg == "apsr" || SpecialReg == "cpsr") {
Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
N->getOperand(0) };
- return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRS_AR : ARM::MRS, DL,
- MVT::i32, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRS_AR : ARM::MRS,
+ DL, MVT::i32, MVT::Other, Ops));
+ return true;
}
if (SpecialReg == "spsr") {
Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
N->getOperand(0) };
- return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSsys_AR : ARM::MRSsys,
- DL, MVT::i32, MVT::Other, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSsys_AR : ARM::MRSsys, DL,
+ MVT::i32, MVT::Other, Ops));
+ return true;
}
- return nullptr;
+ return false;
}
// Lower the write_register intrinsic to ARM specific DAG nodes
// using the supplied metadata string to select the instruction node to use
// and the registers/masks to use in the nodes
-SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
+bool ARMDAGToDAGISel::tryWriteRegister(SDNode *N){
const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
bool IsThumb2 = Subtarget->isThumb2();
@@ -3698,7 +4133,8 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
Ops.push_back(CurDAG->getRegister(0, MVT::i32));
Ops.push_back(N->getOperand(0));
- return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops));
+ return true;
}
std::string SpecialReg = RegString->getString().lower();
@@ -3707,8 +4143,10 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32), N->getOperand(2),
getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
N->getOperand(0) };
- return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSRbanked : ARM::MSRbanked,
- DL, MVT::Other, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSRbanked : ARM::MSRbanked,
+ DL, MVT::Other, Ops));
+ return true;
}
// The VFP registers are written to by creating SelectionDAG nodes with
@@ -3724,16 +4162,17 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
if (Opcode) {
if (!Subtarget->hasVFP2())
- return nullptr;
+ return false;
Ops = { N->getOperand(2), getAL(CurDAG, DL),
CurDAG->getRegister(0, MVT::i32), N->getOperand(0) };
- return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops));
+ return true;
}
- SmallVector<StringRef, 5> Fields;
- StringRef(SpecialReg).split(Fields, '_', 1, false);
- std::string Reg = Fields[0].str();
- StringRef Flags = Fields.size() == 2 ? Fields[1] : "";
+ std::pair<StringRef, StringRef> Fields;
+ Fields = StringRef(SpecialReg).rsplit('_');
+ std::string Reg = Fields.first.str();
+ StringRef Flags = Fields.second;
// If the target was M Class then need to validate the special register value
// and retrieve the mask for use in the instruction node.
@@ -3745,12 +4184,13 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
}
int SYSmValue = getMClassRegisterMask(Reg, Flags, false, Subtarget);
if (SYSmValue == -1)
- return nullptr;
+ return false;
SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32),
N->getOperand(2), getAL(CurDAG, DL),
CurDAG->getRegister(0, MVT::i32), N->getOperand(0) };
- return CurDAG->getMachineNode(ARM::t2MSR_M, DL, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::t2MSR_M, DL, MVT::Other, Ops));
+ return true;
}
// We then check to see if a valid mask can be constructed for one of the
@@ -3761,14 +4201,15 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
Ops = { CurDAG->getTargetConstant(Mask, DL, MVT::i32), N->getOperand(2),
getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
N->getOperand(0) };
- return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSR_AR : ARM::MSR,
- DL, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSR_AR : ARM::MSR,
+ DL, MVT::Other, Ops));
+ return true;
}
- return nullptr;
+ return false;
}
-SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
+bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){
std::vector<SDValue> AsmNodeOperands;
unsigned Flag, Kind;
bool Changed = false;
@@ -3823,6 +4264,17 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx))
IsTiedToChangedOp = OpChanged[DefIdx];
+ // Memory operands to inline asm in the SelectionDAG are modeled with two
+ // operands: a constant of value InlineAsm::Kind_Mem followed by the input
+ // operand. If we get here and we have a Kind_Mem, skip the next operand (so
+ // it doesn't get misinterpreted), and continue. We do this here because
+ // it's important to update the OpChanged array correctly before moving on.
+ if (Kind == InlineAsm::Kind_Mem) {
+ SDValue op = N->getOperand(++i);
+ AsmNodeOperands.push_back(op);
+ continue;
+ }
+
if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef
&& Kind != InlineAsm::Kind_RegDefEarlyClobber)
continue;
@@ -3912,12 +4364,13 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
if (Glue.getNode())
AsmNodeOperands.push_back(Glue);
if (!Changed)
- return nullptr;
+ return false;
SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
New->setNodeId(-1);
- return New.getNode();
+ ReplaceNode(N, New.getNode());
+ return true;
}
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 978e99cf511e..d6e7caf98a80 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -65,6 +65,13 @@ ARMInterworking("arm-interworking", cl::Hidden,
cl::desc("Enable / disable ARM interworking (for debugging only)"),
cl::init(true));
+// Disabled for causing self-hosting failures once returned-attribute inference
+// was enabled.
+static cl::opt<bool>
+EnableThisRetForwarding("arm-this-return-forwarding", cl::Hidden,
+ cl::desc("Directly forward this return"),
+ cl::init(false));
+
namespace {
class ARMCCState : public CCState {
public:
@@ -240,7 +247,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// Set the correct calling convention for ARMv7k WatchOS. It's just
// AAPCS_VFP for functions as simple as libcalls.
- if (Subtarget->isTargetWatchOS()) {
+ if (Subtarget->isTargetWatchABI()) {
for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
}
@@ -254,7 +261,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// RTLIB
if (Subtarget->isAAPCS_ABI() &&
(Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
- Subtarget->isTargetAndroid())) {
+ Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
static const struct {
const RTLIB::Libcall Op;
const char * const Name;
@@ -390,10 +397,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
{ RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
{ RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
{ RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
- { RTLIB::SDIV_I32, "__rt_sdiv", CallingConv::ARM_AAPCS_VFP },
- { RTLIB::UDIV_I32, "__rt_udiv", CallingConv::ARM_AAPCS_VFP },
- { RTLIB::SDIV_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS_VFP },
- { RTLIB::UDIV_I64, "__rt_udiv64", CallingConv::ARM_AAPCS_VFP },
};
for (const auto &LC : LibraryCalls) {
@@ -410,17 +413,19 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
}
- // The half <-> float conversion functions are always soft-float, but are
- // needed for some targets which use a hard-float calling convention by
- // default.
- if (Subtarget->isAAPCS_ABI()) {
- setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
- setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
- setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
- } else {
- setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
- setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
- setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
+ // The half <-> float conversion functions are always soft-float on
+ // non-watchos platforms, but are needed for some targets which use a
+ // hard-float calling convention by default.
+ if (!Subtarget->isTargetWatchABI()) {
+ if (Subtarget->isAAPCS_ABI()) {
+ setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
+ setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
+ setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
+ } else {
+ setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
+ setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
+ setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
+ }
}
// In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
@@ -581,6 +586,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTPOP, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
// NEON does not have single instruction CTTZ for vectors.
setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
@@ -712,6 +722,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setIndexedStoreAction(im, MVT::i16, Legal);
setIndexedStoreAction(im, MVT::i32, Legal);
}
+ } else {
+ // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
+ setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
+ setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
}
setOperationAction(ISD::SADDO, MVT::i32, Custom);
@@ -758,10 +772,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
setOperationAction(ISD::CTLZ, MVT::i32, Expand);
- // These just redirect to CTTZ and CTLZ on ARM.
- setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand);
-
// @llvm.readcyclecounter requires the Performance Monitors extension.
// Default to the 0 expansion on unsupported platforms.
// FIXME: Technically there are older ARM CPUs that have
@@ -773,19 +783,30 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (!Subtarget->hasV6Ops())
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
- if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) &&
- !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) {
+ bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivide()
+ : Subtarget->hasDivideInARMMode();
+ if (!hasDivide) {
// These are expanded into libcalls if the cpu doesn't have HW divider.
setOperationAction(ISD::SDIV, MVT::i32, LibCall);
setOperationAction(ISD::UDIV, MVT::i32, LibCall);
}
+ if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) {
+ setOperationAction(ISD::SDIV, MVT::i32, Custom);
+ setOperationAction(ISD::UDIV, MVT::i32, Custom);
+
+ setOperationAction(ISD::SDIV, MVT::i64, Custom);
+ setOperationAction(ISD::UDIV, MVT::i64, Custom);
+ }
+
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
// Register based DivRem for AEABI (RTABI 4.2)
- if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) {
+ if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
+ Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI()) {
setOperationAction(ISD::SREM, MVT::i64, Custom);
setOperationAction(ISD::UREM, MVT::i64, Custom);
+ HasStandaloneRem = false;
setLibcallName(RTLIB::SDIVREM_I8, "__aeabi_idivmod");
setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod");
@@ -807,6 +828,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+ setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
} else {
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
@@ -833,21 +856,21 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
// ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
- // the default expansion. If we are targeting a single threaded system,
- // then set them all for expand so we can lower them later into their
- // non-atomic form.
- if (TM.Options.ThreadModel == ThreadModel::Single)
- setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
- else if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
+ // the default expansion.
+ InsertFencesForAtomic = false;
+ if (Subtarget->hasAnyDataBarrier() &&
+ (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
// ATOMIC_FENCE needs custom lowering; the others should have been expanded
// to ldrex/strex loops already.
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+ if (!Subtarget->isThumb() || !Subtarget->isMClass())
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
// On v8, we have particularly efficient implementations of atomic fences
// if they can be combined with nearby atomic loads and stores.
- if (!Subtarget->hasV8Ops()) {
+ if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) {
// Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
- setInsertFencesForAtomic(true);
+ InsertFencesForAtomic = true;
}
} else {
// If there's anything we can use as a barrier, go through custom lowering
@@ -909,6 +932,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+ // Thumb-1 cannot currently select ARMISD::SUBE.
+ if (!Subtarget->isThumb1Only())
+ setOperationAction(ISD::SETCCE, MVT::i32, Custom);
+
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
@@ -956,7 +983,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasSinCos()) {
setLibcallName(RTLIB::SINCOS_F32, "sincosf");
setLibcallName(RTLIB::SINCOS_F64, "sincos");
- if (Subtarget->isTargetWatchOS()) {
+ if (Subtarget->isTargetWatchABI()) {
setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP);
setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP);
}
@@ -1039,7 +1066,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setMinStackArgumentAlignment(4);
// Prefer likely predicted branches to selects on out-of-order cores.
- PredictableSelectIsExpensive = Subtarget->isLikeA9();
+ PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}
@@ -1106,7 +1133,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::CALL: return "ARMISD::CALL";
case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED";
case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK";
- case ARMISD::tCALL: return "ARMISD::tCALL";
case ARMISD::BRCOND: return "ARMISD::BRCOND";
case ARMISD::BR_JT: return "ARMISD::BR_JT";
case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
@@ -1123,6 +1149,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::CMOV: return "ARMISD::CMOV";
+ case ARMISD::SSAT: return "ARMISD::SSAT";
+
case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
case ARMISD::RRX: return "ARMISD::RRX";
@@ -1199,6 +1227,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VTBL2: return "ARMISD::VTBL2";
case ARMISD::VMULLs: return "ARMISD::VMULLs";
case ARMISD::VMULLu: return "ARMISD::VMULLu";
+ case ARMISD::UMAAL: return "ARMISD::UMAAL";
case ARMISD::UMLAL: return "ARMISD::UMLAL";
case ARMISD::SMLAL: return "ARMISD::SMLAL";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
@@ -1373,7 +1402,10 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
case CallingConv::ARM_APCS:
case CallingConv::GHC:
return CC;
+ case CallingConv::PreserveMost:
+ return CallingConv::PreserveMost;
case CallingConv::ARM_AAPCS_VFP:
+ case CallingConv::Swift:
return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
case CallingConv::C:
if (!Subtarget->isAAPCS_ABI())
@@ -1415,18 +1447,18 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
case CallingConv::GHC:
return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
+ case CallingConv::PreserveMost:
+ return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
}
}
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
-SDValue
-ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
- bool isThisReturn, SDValue ThisVal) const {
+SDValue ARMTargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ SDValue ThisVal) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
@@ -1442,7 +1474,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// Pass 'this' value directly from the argument to return value, to avoid
// reg unit interference
- if (i == 0 && isThisReturn) {
+ if (i == 0 && isThisReturn && EnableThisRetForwarding) {
assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
"unexpected return calling convention register assignment");
InVals.push_back(ThisVal);
@@ -1506,23 +1538,21 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
}
/// LowerMemOpCallTo - Store the argument to the stack.
-SDValue
-ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
- SDValue StackPtr, SDValue Arg,
- SDLoc dl, SelectionDAG &DAG,
- const CCValAssign &VA,
- ISD::ArgFlagsTy Flags) const {
+SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
+ SDValue Arg, const SDLoc &dl,
+ SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, PtrOff);
return DAG.getStore(
Chain, dl, Arg, PtrOff,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
- false, false, 0);
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}
-void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
+void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
SDValue Chain, SDValue &Arg,
RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
@@ -1704,7 +1734,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
MachinePointerInfo(),
- false, false, false,
DAG.InferPtrAlignment(AddArg));
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(j, Load));
@@ -1780,20 +1809,27 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
bool isDirect = false;
- bool isARMFunc = false;
+
+ const TargetMachine &TM = getTargetMachine();
+ const Module *Mod = MF.getFunction()->getParent();
+ const GlobalValue *GV = nullptr;
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ GV = G->getGlobal();
+ bool isStub =
+ !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
+
+ bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
bool isLocalARMFunc = false;
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
auto PtrVt = getPointerTy(DAG.getDataLayout());
if (Subtarget->genLongCalls()) {
- assert((Subtarget->isTargetWindows() ||
- getTargetMachine().getRelocationModel() == Reloc::Static) &&
- "long-calls with non-static relocation model!");
+ assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
+ "long-calls codegen is not position independent!");
// Handle a global address or an external symbol. If it's not one of
// those, the target's already in a register, so we don't need to do
// anything extra.
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
- const GlobalValue *GV = G->getGlobal();
+ if (isa<GlobalAddressSDNode>(Callee)) {
// Create a constant pool entry for the callee address
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV =
@@ -1804,8 +1840,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
} else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
@@ -1819,54 +1854,55 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
- }
- } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
- const GlobalValue *GV = G->getGlobal();
- isDirect = true;
- bool isDef = GV->isStrongDefinitionForLinker();
- bool isStub = (!isDef && Subtarget->isTargetMachO()) &&
- getTargetMachine().getRelocationModel() != Reloc::Static;
- isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
- // ARM call to a local ARM function is predicable.
- isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
- // tBX takes a register source operand.
- if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
- assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
- Callee = DAG.getNode(
- ARMISD::WrapperPIC, dl, PtrVt,
- DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
- Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, true, 0);
- } else if (Subtarget->isTargetCOFF()) {
- assert(Subtarget->isTargetWindows() &&
- "Windows is the only supported COFF target");
- unsigned TargetFlags = GV->hasDLLImportStorageClass()
- ? ARMII::MO_DLLIMPORT
- : ARMII::MO_NO_FLAG;
- Callee =
- DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags);
- if (GV->hasDLLImportStorageClass())
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ }
+ } else if (isa<GlobalAddressSDNode>(Callee)) {
+ // If we're optimizing for minimum size and the function is called three or
+ // more times in this block, we can improve codesize by calling indirectly
+ // as BLXr has a 16-bit encoding.
+ auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
+ auto *BB = CLI.CS->getParent();
+ bool PreferIndirect =
+ Subtarget->isThumb() && MF.getFunction()->optForMinSize() &&
+ std::count_if(GV->user_begin(), GV->user_end(), [&BB](const User *U) {
+ return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB;
+ }) > 2;
+
+ if (!PreferIndirect) {
+ isDirect = true;
+ bool isDef = GV->isStrongDefinitionForLinker();
+
+ // ARM call to a local ARM function is predicable.
+ isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
+ // tBX takes a register source operand.
+ if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+ assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
+ Callee = DAG.getNode(
+ ARMISD::WrapperPIC, dl, PtrVt,
+ DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
Callee =
- DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
- DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
+ DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee,
MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
- } else {
- // On ELF targets for PIC code, direct calls should go through the PLT
- unsigned OpFlags = 0;
- if (Subtarget->isTargetELF() &&
- getTargetMachine().getRelocationModel() == Reloc::PIC_)
- OpFlags = ARMII::MO_PLT;
- Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, OpFlags);
+ /* Alignment = */ 0, MachineMemOperand::MOInvariant);
+ } else if (Subtarget->isTargetCOFF()) {
+ assert(Subtarget->isTargetWindows() &&
+ "Windows is the only supported COFF target");
+ unsigned TargetFlags = GV->hasDLLImportStorageClass()
+ ? ARMII::MO_DLLIMPORT
+ : ARMII::MO_NO_FLAG;
+ Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0,
+ TargetFlags);
+ if (GV->hasDLLImportStorageClass())
+ Callee =
+ DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
+ DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ } else {
+ Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
+ }
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
isDirect = true;
- bool isStub = Subtarget->isTargetMachO() &&
- getTargetMachine().getRelocationModel() != Reloc::Static;
- isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
// tBX takes a register source operand.
const char *Sym = S->getSymbol();
if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
@@ -1878,17 +1914,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
} else {
- unsigned OpFlags = 0;
- // On ELF targets for PIC code, direct calls should go through the PLT
- if (Subtarget->isTargetELF() &&
- getTargetMachine().getRelocationModel() == Reloc::PIC_)
- OpFlags = ARMII::MO_PLT;
- Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, OpFlags);
+ Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
}
}
@@ -1898,11 +1928,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
else
- CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
+ CallOpc = ARMISD::CALL;
} else {
if (!isDirect && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
- else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
+ else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
// Emit regular call when code size is the priority
!MF.getFunction()->optForMinSize())
// "mov lr, pc; b _foo" to avoid confusing the RSP
@@ -2042,7 +2072,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
if (!Def)
return false;
if (!Flags.isByVal()) {
- if (!TII->isLoadFromStackSlot(Def, FI))
+ if (!TII->isLoadFromStackSlot(*Def, FI))
return false;
} else {
return false;
@@ -2082,9 +2112,9 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const {
- const Function *CallerF = DAG.getMachineFunction().getFunction();
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF->getCallingConv();
- bool CCMatch = CallerCC == CalleeCC;
assert(Subtarget->supportsTailCall());
@@ -2122,41 +2152,25 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
return false;
}
- // If the calling conventions do not match, then we'd better make sure the
- // results are returned in the same way as what the caller expects.
- if (!CCMatch) {
- SmallVector<CCValAssign, 16> RVLocs1;
- ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
- *DAG.getContext(), Call);
- CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));
-
- SmallVector<CCValAssign, 16> RVLocs2;
- ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
- *DAG.getContext(), Call);
- CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));
-
- if (RVLocs1.size() != RVLocs2.size())
+ // Check that the call results are passed in the same way.
+ LLVMContext &C = *DAG.getContext();
+ if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
+ CCAssignFnForNode(CalleeCC, true, isVarArg),
+ CCAssignFnForNode(CallerCC, true, isVarArg)))
+ return false;
+ // The callee has to preserve all registers the caller needs to preserve.
+ const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+ if (CalleeCC != CallerCC) {
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
- for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
- if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
- return false;
- if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
- return false;
- if (RVLocs1[i].isRegLoc()) {
- if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
- return false;
- } else {
- if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
- return false;
- }
- }
}
// If Caller's vararg or byval argument has been split between registers and
// stack, do not perform tail call, since part of the argument is in caller's
// local frame.
- const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction().
- getInfo<ARMFunctionInfo>();
+ const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
if (AFI_Caller->getArgRegsSaveSize())
return false;
@@ -2166,13 +2180,10 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
- ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext(), Call);
+ ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call);
CCInfo.AnalyzeCallOperands(Outs,
CCAssignFnForNode(CalleeCC, false, isVarArg));
if (CCInfo.getNextStackOffset()) {
- MachineFunction &MF = DAG.getMachineFunction();
-
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -2209,6 +2220,10 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
}
}
}
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+ return false;
}
return true;
@@ -2226,7 +2241,7 @@ ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
}
static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
- SDLoc DL, SelectionDAG &DAG) {
+ const SDLoc &DL, SelectionDAG &DAG) {
const MachineFunction &MF = DAG.getMachineFunction();
const Function *F = MF.getFunction();
@@ -2259,11 +2274,11 @@ static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
}
SDValue
-ARMTargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
+ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const {
+ const SDLoc &dl, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of the return value to a location.
SmallVector<CCValAssign, 16> RVLocs;
@@ -2521,9 +2536,9 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
SDLoc DL(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
- Reloc::Model RelocM = getTargetMachine().getRelocationModel();
SDValue CPAddr;
- if (RelocM == Reloc::Static) {
+ bool IsPositionIndependent = isPositionIndependent();
+ if (!IsPositionIndependent) {
CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
} else {
unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
@@ -2534,11 +2549,10 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
}
CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
- SDValue Result =
- DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, 0);
- if (RelocM == Reloc::Static)
+ SDValue Result = DAG.getLoad(
+ PtrVT, DL, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ if (!IsPositionIndependent)
return Result;
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
@@ -2584,7 +2598,8 @@ ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
SDValue FuncTLVGet =
DAG.getLoad(MVT::i32, DL, Chain, DescAddr,
MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, true, true, 4);
+ /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MOInvariant);
Chain = FuncTLVGet.getValue(1);
MachineFunction &F = DAG.getMachineFunction();
@@ -2610,6 +2625,61 @@ ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
}
+SDValue
+ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
+
+ SDValue Chain = DAG.getEntryNode();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDLoc DL(Op);
+
+ // Load the current TEB (thread environment block)
+ SDValue Ops[] = {Chain,
+ DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
+ DAG.getConstant(15, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(13, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(2, DL, MVT::i32)};
+ SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+ DAG.getVTList(MVT::i32, MVT::Other), Ops);
+
+ SDValue TEB = CurrentTEB.getValue(0);
+ Chain = CurrentTEB.getValue(1);
+
+ // Load the ThreadLocalStoragePointer from the TEB
+ // A pointer to the TLS array is located at offset 0x2c from the TEB.
+ SDValue TLSArray =
+ DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
+ TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
+
+ // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
+ // offset into the TLSArray.
+
+ // Load the TLS index from the C runtime
+ SDValue TLSIndex =
+ DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
+ TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
+ TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
+
+ SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
+ DAG.getConstant(2, DL, MVT::i32));
+ SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
+ DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
+ MachinePointerInfo());
+
+ // Get the offset of the start of the .tls section (section base)
+ const auto *GA = cast<GlobalAddressSDNode>(Op);
+ auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
+ SDValue Offset = DAG.getLoad(
+ PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
+ DAG.getTargetConstantPool(CPV, PtrVT, 4)),
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+
+ return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
+}
+
// Lower ISD::GlobalTLSAddress using the "general dynamic" model
SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
@@ -2625,10 +2695,9 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
- Argument =
- DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, 0);
+ Argument = DAG.getLoad(
+ PtrVT, dl, DAG.getEntryNode(), Argument,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
SDValue Chain = Argument.getValue(1);
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
@@ -2645,8 +2714,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain)
.setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
- DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args),
- 0);
+ DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
@@ -2680,8 +2748,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
Offset = DAG.getLoad(
PtrVT, dl, Chain, Offset,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
Chain = Offset.getValue(1);
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
@@ -2689,8 +2756,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
Offset = DAG.getLoad(
PtrVT, dl, Chain, Offset,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
} else {
// local exec model
assert(model == TLSModel::LocalExec);
@@ -2700,8 +2766,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
Offset = DAG.getLoad(
PtrVT, dl, Chain, Offset,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
}
// The address of the thread local variable is the add of the thread
@@ -2714,6 +2779,9 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->isTargetDarwin())
return LowerGlobalTLSAddressDarwin(Op, DAG);
+ if (Subtarget->isTargetWindows())
+ return LowerGlobalTLSAddressWindows(Op, DAG);
+
// TODO: implement the "local dynamic" model
assert(Subtarget->isTargetELF() && "Only ELF implemented here");
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
@@ -2738,9 +2806,9 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc dl(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
- bool UseGOT_PREL =
- !(GV->hasHiddenVisibility() || GV->hasLocalLinkage());
+ const TargetMachine &TM = getTargetMachine();
+ if (isPositionIndependent()) {
+ bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -2756,15 +2824,14 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
SDValue Result = DAG.getLoad(
PtrVT, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
SDValue Chain = Result.getValue(1);
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
if (UseGOT_PREL)
- Result = DAG.getLoad(PtrVT, dl, Chain, Result,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ Result =
+ DAG.getLoad(PtrVT, dl, Chain, Result,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
return Result;
}
@@ -2781,8 +2848,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
return DAG.getLoad(
PtrVT, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
}
}
@@ -2791,7 +2857,6 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc dl(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- Reloc::Model RelocM = getTargetMachine().getRelocationModel();
if (Subtarget->useMovt(DAG.getMachineFunction()))
++NumMovwMovt;
@@ -2799,15 +2864,14 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
// FIXME: Once remat is capable of dealing with instructions with register
// operands, expand this into multiple nodes
unsigned Wrapper =
- RelocM == Reloc::PIC_ ? ARMISD::WrapperPIC : ARMISD::Wrapper;
+ isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
- if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
+ if (Subtarget->isGVIndirectSymbol(GV))
Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
return Result;
}
@@ -2833,8 +2897,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
TargetFlags));
if (GV->hasDLLImportStorageClass())
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
return Result;
}
@@ -2873,7 +2936,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
"RBIT intrinsic must have i32 type!");
return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1));
}
- case Intrinsic::arm_thread_pointer: {
+ case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
}
@@ -2882,10 +2945,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
- Reloc::Model RelocM = getTargetMachine().getRelocationModel();
SDValue CPAddr;
- unsigned PCAdj = (RelocM != Reloc::PIC_)
- ? 0 : (Subtarget->isThumb() ? 4 : 8);
+ bool IsPositionIndependent = isPositionIndependent();
+ unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
ARMConstantPoolValue *CPV =
ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
ARMCP::CPLSDA, PCAdj);
@@ -2893,10 +2955,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
SDValue Result = DAG.getLoad(
PtrVT, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
- if (RelocM == Reloc::PIC_) {
+ if (IsPositionIndependent) {
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
}
@@ -2962,7 +3023,8 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
if (Subtarget->isMClass()) {
// Only a full system barrier exists in the M-class architectures.
Domain = ARM_MB::SY;
- } else if (Subtarget->isSwift() && Ord == Release) {
+ } else if (Subtarget->preferISHSTBarriers() &&
+ Ord == AtomicOrdering::Release) {
// Swift happens to implement ISHST barriers in a way that's compatible with
// Release semantics but weaker than ISH so we'd be fools not to use
// it. Beware: other processors probably don't!
@@ -3012,13 +3074,14 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
- MachinePointerInfo(SV), false, false, 0);
+ MachinePointerInfo(SV));
}
-SDValue
-ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
- SDValue &Root, SelectionDAG &DAG,
- SDLoc dl) const {
+SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
+ CCValAssign &NextVA,
+ SDValue &Root,
+ SelectionDAG &DAG,
+ const SDLoc &dl) const {
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -3041,8 +3104,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
ArgValue2 = DAG.getLoad(
MVT::i32, dl, Root, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
- false, false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
} else {
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
@@ -3060,13 +3122,11 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
// these values; otherwise, this reassembles a (byval) structure that
// was split between registers and memory.
// Return: The frame index registers were stored into.
-int
-ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
- SDLoc dl, SDValue &Chain,
- const Value *OrigArg,
- unsigned InRegsParamRecordIdx,
- int ArgOffset,
- unsigned ArgSize) const {
+int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
+ const SDLoc &dl, SDValue &Chain,
+ const Value *OrigArg,
+ unsigned InRegsParamRecordIdx,
+ int ArgOffset, unsigned ArgSize) const {
// Currently, two use-cases possible:
// Case #1. Non-var-args function, and we meet first byval parameter.
// Setup first unallocated register as first byval register;
@@ -3104,9 +3164,8 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
unsigned VReg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
- SDValue Store =
- DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(OrigArg, 4 * i), false, false, 0);
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo(OrigArg, 4 * i));
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
}
@@ -3117,17 +3176,16 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
}
// Setup stack frame, the va_list pointer will start from.
-void
-ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
- SDLoc dl, SDValue &Chain,
- unsigned ArgOffset,
- unsigned TotalArgRegsSaveSize,
- bool ForceMutable) const {
+void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
+ const SDLoc &dl, SDValue &Chain,
+ unsigned ArgOffset,
+ unsigned TotalArgRegsSaveSize,
+ bool ForceMutable) const {
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
// Try to store any remaining integer argument regs
- // to their spots on the stack so that they may be loaded by deferencing
+ // to their spots on the stack so that they may be loaded by dereferencing
// the result of va_next.
// If there is no regs to be stored, just point address after last
// argument passed via stack.
@@ -3137,14 +3195,10 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
AFI->setVarArgsFrameIndex(FrameIndex);
}
-SDValue
-ARMTargetLowering::LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg>
- &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals)
- const {
+SDValue ARMTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -3226,10 +3280,9 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
if (VA.isMemLoc()) {
int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- ArgValue2 = DAG.getLoad(
- MVT::f64, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
- false, false, false, 0);
+ ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FI));
} else {
ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
Chain, DAG, dl);
@@ -3322,10 +3375,9 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- InVals.push_back(DAG.getLoad(
- VA.getValVT(), dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
- false, false, false, 0));
+ InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FI)));
}
lastInsIndex = index;
}
@@ -3369,10 +3421,9 @@ static bool isFloatingPointZero(SDValue Op) {
/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands.
-SDValue
-ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDValue &ARMcc, SelectionDAG &DAG,
- SDLoc dl) const {
+SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ SDValue &ARMcc, SelectionDAG &DAG,
+ const SDLoc &dl) const {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
unsigned C = RHSC->getZExtValue();
if (!isLegalICmpImmediate(C)) {
@@ -3428,9 +3479,8 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
/// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
-SDValue
-ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
- SDLoc dl) const {
+SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
+ SelectionDAG &DAG, const SDLoc &dl) const {
assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
SDValue Cmp;
if (!isFloatingPointZero(RHS))
@@ -3647,7 +3697,7 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
}
}
-SDValue ARMTargetLowering::getCMOV(SDLoc dl, EVT VT, SDValue FalseVal,
+SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
SDValue TrueVal, SDValue ARMcc, SDValue CCR,
SDValue Cmp, SelectionDAG &DAG) const {
if (Subtarget->isFPOnlySP() && VT == MVT::f64) {
@@ -3673,14 +3723,149 @@ SDValue ARMTargetLowering::getCMOV(SDLoc dl, EVT VT, SDValue FalseVal,
}
}
+// True if CC is a signed greater-than (SETGT) or greater-or-equal (SETGE)
+// integer condition code; used below to recognize saturation patterns.
+static bool isGTorGE(ISD::CondCode CC) {
+  return CC == ISD::SETGT || CC == ISD::SETGE;
+}
+
+// True if CC is a signed less-than (SETLT) or less-or-equal (SETLE)
+// integer condition code; counterpart of isGTorGE().
+static bool isLTorLE(ISD::CondCode CC) {
+  return CC == ISD::SETLT || CC == ISD::SETLE;
+}
+
+// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
+// All of these conditions (and their <= and >= counterparts) will do:
+// x < k ? k : x
+// x > k ? x : k
+// k < x ? x : k
+// k > x ? k : x
+//
+// K is the SDValue of the constant k; the caller (isSaturatingConditional)
+// guarantees K aliases either LHS or RHS. The pattern matches exactly when
+// the constant is selected on the branch that the comparison proves to be
+// the smaller side, i.e. the select clamps the variable from below.
+static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
+                            const SDValue TrueVal, const SDValue FalseVal,
+                            const ISD::CondCode CC, const SDValue K) {
+  return (isGTorGE(CC) &&
+          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
+         (isLTorLE(CC) &&
+          ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
+}
+
+// Similar to isLowerSaturate(), but checks for upper-saturating conditions,
+// e.g. (x > k ? k : x) and (x < k ? x : k) and their <= / >= counterparts:
+// the constant K must be selected exactly when the comparison proves the
+// variable is on the larger side, i.e. the select clamps from above.
+static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
+                            const SDValue TrueVal, const SDValue FalseVal,
+                            const ISD::CondCode CC, const SDValue K) {
+  return (isGTorGE(CC) &&
+          ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
+         (isLTorLE(CC) &&
+          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
+}
+
+// Check if two chained conditionals could be converted into SSAT.
+//
+// SSAT can replace a set of two conditional selectors that bound a number to an
+// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
+//
+// x < -k ? -k : (x > k ? k : x)
+// x < -k ? -k : (x < k ? x : k)
+// x > -k ? (x > k ? k : x) : -k
+// x < k ? (x < -k ? -k : x) : k
+// etc.
+//
+// It returns true if the conversion can be done, false otherwise.
+// Additionally, the variable is returned in parameter V and the constant in K.
+static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
+                                    uint64_t &K) {
+
+  // Operands of the outer SELECT_CC.
+  SDValue LHS1 = Op.getOperand(0);
+  SDValue RHS1 = Op.getOperand(1);
+  SDValue TrueVal1 = Op.getOperand(2);
+  SDValue FalseVal1 = Op.getOperand(3);
+  ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+  // The inner select must sit on the non-constant arm of the outer select.
+  const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
+  if (Op2.getOpcode() != ISD::SELECT_CC)
+    return false;
+
+  SDValue LHS2 = Op2.getOperand(0);
+  SDValue RHS2 = Op2.getOperand(1);
+  SDValue TrueVal2 = Op2.getOperand(2);
+  SDValue FalseVal2 = Op2.getOperand(3);
+  ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
+
+  // Find out which are the constants and which are the variables
+  // in each conditional
+  SDValue *K1 = isa<ConstantSDNode>(LHS1)
+                    ? &LHS1
+                    : isa<ConstantSDNode>(RHS1) ? &RHS1 : nullptr;
+  SDValue *K2 = isa<ConstantSDNode>(LHS2)
+                    ? &LHS2
+                    : isa<ConstantSDNode>(RHS2) ? &RHS2 : nullptr;
+  SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
+  SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
+  SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
+  SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;
+
+  // We must detect cases where the original operations worked with 16- or
+  // 8-bit values. In such case, V2Tmp != V2 because the comparison operations
+  // must work with sign-extended values but the select operations return
+  // the original non-extended value.
+  SDValue V2TmpReg = V2Tmp;
+  if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
+    V2TmpReg = V2Tmp->getOperand(0);
+
+  // Check that the registers and the constants have the correct values
+  // in both conditionals
+  if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
+      V2TmpReg != V2)
+    return false;
+
+  // Figure out which conditional is saturating the lower/upper bound.
+  const SDValue *LowerCheckOp =
+      isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
+          ? &Op
+          : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
+                ? &Op2
+                : nullptr;
+  const SDValue *UpperCheckOp =
+      isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
+          ? &Op
+          : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
+                ? &Op2
+                : nullptr;
+
+  // Both bounds must be present and must come from distinct selects.
+  if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
+    return false;
+
+  // Check that the constant in the lower-bound check is
+  // the opposite of the constant in the upper-bound check
+  // in 1's complement.
+  int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
+  int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
+  int64_t PosVal = std::max(Val1, Val2);
+
+  if (((Val1 > Val2 && UpperCheckOp == &Op) ||
+       (Val1 < Val2 && UpperCheckOp == &Op2)) &&
+      Val1 == ~Val2 && isPowerOf2_64(PosVal + 1)) {
+
+    V = V2;
+    K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
+    return true;
+  }
+
+  return false;
+}
+
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+
EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+
+ // Try to convert two saturating conditional selects into a single SSAT
+ SDValue SatValue;
+ uint64_t SatConstant;
+ if (isSaturatingConditional(Op, SatValue, SatConstant))
+ return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
+ DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
+
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDValue TrueVal = Op.getOperand(2);
SDValue FalseVal = Op.getOperand(3);
- SDLoc dl(Op);
if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
@@ -3781,10 +3966,9 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
return DAG.getConstant(0, SDLoc(Op), MVT::i32);
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
- return DAG.getLoad(MVT::i32, SDLoc(Op),
- Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
- Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->isInvariant(), Ld->getAlignment());
+ return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
llvm_unreachable("Unknown VFP cmp argument!");
}
@@ -3801,21 +3985,17 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
SDValue Ptr = Ld->getBasePtr();
- RetVal1 = DAG.getLoad(MVT::i32, dl,
- Ld->getChain(), Ptr,
- Ld->getPointerInfo(),
- Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->isInvariant(), Ld->getAlignment());
+ RetVal1 =
+ DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+ Ld->getAlignment(), Ld->getMemOperand()->getFlags());
EVT PtrType = Ptr.getValueType();
unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
- RetVal2 = DAG.getLoad(MVT::i32, dl,
- Ld->getChain(), NewPtr,
- Ld->getPointerInfo().getWithOffset(4),
- Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->isInvariant(), NewAlign);
+ RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
+ Ld->getPointerInfo().getWithOffset(4), NewAlign,
+ Ld->getMemOperand()->getFlags());
return;
}
@@ -3908,8 +4088,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
if (getTargetMachine().Options.UnsafeFPMath &&
(CC == ISD::SETEQ || CC == ISD::SETOEQ ||
CC == ISD::SETNE || CC == ISD::SETUNE)) {
- SDValue Result = OptimizeVFPBrcond(Op, DAG);
- if (Result.getNode())
+ if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
return Result;
}
@@ -3950,19 +4129,17 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
Addr, Op.getOperand(2), JTI);
}
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+ if (isPositionIndependent()) {
Addr =
DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
- MachinePointerInfo::getJumpTable(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
Chain = Addr.getValue(1);
Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
} else {
Addr =
DAG.getLoad(PTy, dl, Chain, Addr,
- MachinePointerInfo::getJumpTable(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
Chain = Addr.getValue(1);
return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
}
@@ -4156,7 +4333,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
return DAG.getLoad(VT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
- MachinePointerInfo(), false, false, false, 0);
+ MachinePointerInfo());
}
// Return LR, which contains the return address. Mark it an implicit live-in.
@@ -4178,8 +4355,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
- MachinePointerInfo(),
- false, false, false, 0);
+ MachinePointerInfo());
return FrameAddr;
}
@@ -4322,7 +4498,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
/// not support i64 elements, so sometimes the zero vectors will need to be
/// explicitly constructed. Regardless, use a canonical VMOV to create the
/// zero vector.
-static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) {
+static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert(VT.isVector() && "Expected a vector type");
// The canonical modified immediate encoding of a zero vector is....0!
SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
@@ -4826,12 +5002,36 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
return Result;
}
+// Lower ISD::SETCCE (an integer compare that also consumes an incoming
+// carry/borrow) by emitting a subtract-with-carry (ARMISD::SUBE) and then
+// selecting 1 or 0 from the resulting flags with a conditional move.
+static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Carry = Op.getOperand(2);
+  SDValue Cond = Op.getOperand(3);
+  SDLoc DL(Op);
+
+  assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+
+  // Callers never pass the constant-false carry placeholder here.
+  assert(Carry.getOpcode() != ISD::CARRY_FALSE);
+  // SUBE yields the difference plus an i32 flags result (second value).
+  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+  SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
+
+  SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
+  SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
+  SDValue ARMcc = DAG.getConstant(
+      IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
+  // Route the SUBE flags through CPSR so the CMOV below can test them.
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
+                                   Cmp.getValue(1), SDValue());
+  return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
+                     CCR, Chain.getValue(1));
+}
+
/// isNEONModifiedImm - Check if the specified splat value corresponds to a
/// valid vector constant for a NEON instruction with a "modified immediate"
/// operand (e.g., VMOV). If so, return the encoded value.
static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
unsigned SplatBitSize, SelectionDAG &DAG,
- SDLoc dl, EVT &VT, bool is128Bits,
+ const SDLoc &dl, EVT &VT, bool is128Bits,
NEONModImmType type) {
unsigned OpCmode, Imm;
@@ -4979,7 +5179,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
return SDValue();
// Try splatting with a VMOV.f32...
- APFloat FPVal = CFP->getValueAPF();
+ const APFloat &FPVal = CFP->getValueAPF();
int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
if (ImmVal != -1) {
@@ -5421,7 +5621,7 @@ static bool isReverseMask(ArrayRef<int> M, EVT VT) {
// instruction, return an SDValue of such a constant (will become a MOV
// instruction). Otherwise return null.
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
- const ARMSubtarget *ST, SDLoc dl) {
+ const ARMSubtarget *ST, const SDLoc &dl) {
uint64_t Val;
if (!isa<ConstantSDNode>(N))
return SDValue();
@@ -5502,7 +5702,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
SDValue Value;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
if (i > 0)
isOnlyLowElement = false;
@@ -5585,7 +5785,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
Op.getOperand(i)));
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
- SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
+ SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
Val = LowerBUILD_VECTOR(Val, DAG, ST);
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
@@ -5635,7 +5835,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
SDValue Vec = DAG.getUNDEF(VT);
for (unsigned i = 0 ; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
@@ -5681,7 +5881,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
// A shuffle can only come from building a vector from various
@@ -5808,7 +6008,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
- if (Entry.getOpcode() == ISD::UNDEF)
+ if (Entry.isUndef())
continue;
auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
@@ -5845,7 +6045,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
ShuffleOps[i] = Sources[i].ShuffleVec;
SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
- ShuffleOps[1], &Mask[0]);
+ ShuffleOps[1], Mask);
return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
@@ -5895,7 +6095,7 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
- SDLoc dl) {
+ const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
@@ -5982,12 +6182,12 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));
- if (V2.getNode()->getOpcode() == ISD::UNDEF)
+ if (V2.getNode()->isUndef())
return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask));
+ DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask));
+ DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
}
static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
@@ -6024,7 +6224,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
if (EltSize <= 32) {
- if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
+ if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
// If this is undef splat, generate it via "just" vdup, if possible.
if (Lane == -1) Lane = 0;
@@ -6040,7 +6240,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
!isa<ConstantSDNode>(V1.getOperand(0))) {
bool IsScalarToVector = true;
for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
- if (V1.getOperand(i).getOpcode() != ISD::UNDEF) {
+ if (!V1.getOperand(i).isUndef()) {
IsScalarToVector = false;
break;
}
@@ -6067,8 +6267,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
if (isVREVMask(ShuffleMask, VT, 16))
return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
- if (V2->getOpcode() == ISD::UNDEF &&
- isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
+ if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
DAG.getConstant(Imm, dl, MVT::i32));
}
@@ -6103,8 +6302,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
// ->
// concat(VZIP(v1, v2):0, :1)
//
- if (V1->getOpcode() == ISD::CONCAT_VECTORS &&
- V2->getOpcode() == ISD::UNDEF) {
+ if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
SDValue SubV1 = V1->getOperand(0);
SDValue SubV2 = V1->getOperand(1);
EVT SubVT = SubV1.getValueType();
@@ -6175,11 +6373,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
- if (VT == MVT::v8i8) {
- SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
- if (NewOp.getNode())
+ if (VT == MVT::v8i8)
+ if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
return NewOp;
- }
return SDValue();
}
@@ -6218,11 +6414,11 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
SDValue Val = DAG.getUNDEF(MVT::v2f64);
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- if (Op0.getOpcode() != ISD::UNDEF)
+ if (!Op0.isUndef())
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
DAG.getIntPtrConstant(0, dl));
- if (Op1.getOpcode() != ISD::UNDEF)
+ if (!Op1.isUndef())
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
DAG.getIntPtrConstant(1, dl));
@@ -6351,17 +6547,16 @@ static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
// The load already has the right type.
if (ExtendedTy == LD->getMemoryVT())
return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
- LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
- LD->isNonTemporal(), LD->isInvariant(),
- LD->getAlignment());
+ LD->getBasePtr(), LD->getPointerInfo(),
+ LD->getAlignment(), LD->getMemOperand()->getFlags());
// We need to create a zextload/sextload. We cannot just create a load
// followed by a zext/zext node because LowerMUL is also run during normal
// operation legalization where we can't create illegal types.
return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
- LD->getMemoryVT(), LD->isVolatile(), LD->isInvariant(),
- LD->isNonTemporal(), LD->getAlignment());
+ LD->getMemoryVT(), LD->getAlignment(),
+ LD->getMemOperand()->getFlags());
}
/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
@@ -6387,8 +6582,9 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32,
- BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
+ return DAG.getBuildVector(
+ MVT::v2i32, SDLoc(N),
+ {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
}
// Construct a new BUILD_VECTOR with elements truncated to half the size.
assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
@@ -6405,8 +6601,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
// The values are implicitly truncated so sext vs. zext doesn't matter.
Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::getVectorVT(TruncVT, NumElts), Ops);
+ return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
@@ -6506,8 +6701,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
-static SDValue
-LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
+static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
+ SelectionDAG &DAG) {
// TODO: Should this propagate fast-math-flags?
// Convert to float
@@ -6528,8 +6723,7 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
// float4 result = as_float4(as_int4(xf*recip) + 0xb000);
X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
- Y = DAG.getConstant(0xb000, dl, MVT::i32);
- Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y);
+ Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
// Convert back to short.
@@ -6538,8 +6732,8 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
return X;
}
-static SDValue
-LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
+static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
+ SelectionDAG &DAG) {
// TODO: Should this propagate fast-math-flags?
SDValue N2;
@@ -6567,8 +6761,7 @@ LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
// float4 result = as_float4(as_int4(xf*recip) + 0x89);
N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
- N1 = DAG.getConstant(0x89, dl, MVT::i32);
- N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
+ N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
// Convert back to integer and return.
@@ -6679,8 +6872,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
// float4 result = as_float4(as_int4(xf*recip) + 2);
N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
- N1 = DAG.getConstant(2, dl, MVT::i32);
- N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
+ N1 = DAG.getConstant(2, dl, MVT::v4i32);
N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
// Convert back to integer and return.
@@ -6766,21 +6958,21 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(DAG.getEntryNode())
- .setCallee(CC, RetTy, Callee, std::move(Args), 0)
+ .setCallee(CC, RetTy, Callee, std::move(Args))
.setDiscardResult(ShouldUseSRet);
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
if (!ShouldUseSRet)
return CallResult.first;
- SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet,
- MachinePointerInfo(), false, false, false, 0);
+ SDValue LoadSin =
+ DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
// Address of cos field.
SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
- SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add,
- MachinePointerInfo(), false, false, false, 0);
+ SDValue LoadCos =
+ DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
@@ -6819,7 +7011,7 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
CLI.setDebugLoc(dl)
.setChain(Chain)
.setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
- ES, std::move(Args), 0);
+ ES, std::move(Args));
return LowerCallTo(CLI).first;
}
@@ -6867,13 +7059,13 @@ void ARMTargetLowering::ExpandDIV_Windows(
}
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
- // Monotonic load/store is legal for all targets
- if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
- return Op;
+ if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
+ // Acquire/Release load/store is not legal for targets without a dmb or
+ // equivalent available.
+ return SDValue();
- // Acquire/Release load/store is not legal for targets without a
- // dmb or equivalent available.
- return SDValue();
+ // Monotonic load/store is legal for all targets.
+ return Op;
}
static void ReplaceREADCYCLECOUNTER(SDNode *N,
@@ -6899,6 +7091,46 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N,
Results.push_back(Cycles32.getValue(1));
}
+static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
+ SDLoc dl(V.getNode());
+ SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
+ SDValue VHi = DAG.getAnyExtOrTrunc(
+ DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
+ dl, MVT::i32);
+ SDValue RegClass =
+ DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
+ SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
+ SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
+ const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
+ return SDValue(
+ DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
+}
+
+static void ReplaceCMP_SWAP_64Results(SDNode *N,
+ SmallVectorImpl<SDValue> & Results,
+ SelectionDAG &DAG) {
+ assert(N->getValueType(0) == MVT::i64 &&
+ "AtomicCmpSwap on types less than 64 should be legal");
+ SDValue Ops[] = {N->getOperand(1),
+ createGPRPairNode(DAG, N->getOperand(2)),
+ createGPRPairNode(DAG, N->getOperand(3)),
+ N->getOperand(0)};
+ SDNode *CmpSwap = DAG.getMachineNode(
+ ARM::CMP_SWAP_64, SDLoc(N),
+ DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+ Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32,
+ SDValue(CmpSwap, 0)));
+ Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32,
+ SDValue(CmpSwap, 0)));
+ Results.push_back(SDValue(CmpSwap, 2));
+}
+
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Don't know how to custom lower this!");
@@ -6948,6 +7180,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG);
+ case ISD::SETCCE: return LowerSETCCE(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
@@ -6956,8 +7189,14 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
- case ISD::SDIV: return LowerSDIV(Op, DAG);
- case ISD::UDIV: return LowerUDIV(Op, DAG);
+ case ISD::SDIV:
+ if (Subtarget->isTargetWindows())
+ return LowerDIV_Windows(Op, DAG, /* Signed */ true);
+ return LowerSDIV(Op, DAG);
+ case ISD::UDIV:
+ if (Subtarget->isTargetWindows())
+ return LowerDIV_Windows(Op, DAG, /* Signed */ false);
+ return LowerUDIV(Op, DAG);
case ISD::ADDC:
case ISD::ADDE:
case ISD::SUBC:
@@ -7005,6 +7244,13 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::UREM:
Res = LowerREM(N, DAG);
break;
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ Res = LowerDivRem(SDValue(N, 0), DAG);
+ assert(Res.getNumOperands() == 2 && "DivRem needs two values");
+ Results.push_back(Res.getValue(0));
+ Results.push_back(Res.getValue(1));
+ return;
case ISD::READCYCLECOUNTER:
ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
return;
@@ -7013,6 +7259,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
Results);
+ case ISD::ATOMIC_CMP_SWAP:
+ ReplaceCMP_SWAP_64Results(N, Results, DAG);
+ return;
}
if (Res.getNode())
Results.push_back(Res);
@@ -7024,11 +7273,12 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
/// registers the function context.
-void ARMTargetLowering::
-SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
- MachineBasicBlock *DispatchBB, int FI) const {
+void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB,
+ int FI) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
MachineConstantPool *MCP = MF->getConstantPool();
@@ -7139,10 +7389,10 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
}
}
-void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
+void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *MBB) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
MachineFrameInfo *MFI = MF->getFrameInfo();
@@ -7182,7 +7432,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
// Get an ordered list of the machine basic blocks for the jump table.
std::vector<MachineBasicBlock*> LPadList;
- SmallPtrSet<MachineBasicBlock*, 64> InvokeBBs;
+ SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
LPadList.reserve(CallSiteNumToLPad.size());
for (unsigned I = 1; I <= MaxCSNum; ++I) {
SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
@@ -7200,7 +7450,6 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
MachineJumpTableInfo *JTI =
MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
unsigned MJTI = JTI->createJumpTableIndex(LPadList);
- Reloc::Model RelocM = getTargetMachine().getRelocationModel();
// Create the MBBs for the dispatch code.
@@ -7244,6 +7493,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
// registers being marked as clobbered.
MIB.addRegMask(RI.getNoPreservedMask());
+ bool IsPositionIndependent = isPositionIndependent();
unsigned NumLPads = LPadList.size();
if (Subtarget->isThumb2()) {
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
@@ -7357,7 +7607,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
.addMemOperand(JTMMOLd));
unsigned NewVReg6 = NewVReg5;
- if (RelocM == Reloc::PIC_) {
+ if (IsPositionIndependent) {
NewVReg6 = MRI->createVirtualRegister(TRC);
AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
.addReg(ARM::CPSR, RegState::Define)
@@ -7440,7 +7690,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
.addImm(0)
.addMemOperand(JTMMOLd));
- if (RelocM == Reloc::PIC_) {
+ if (IsPositionIndependent) {
BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
.addReg(NewVReg5, RegState::Kill)
.addReg(NewVReg4)
@@ -7524,7 +7774,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
(*I)->setIsEHPad(false);
// The instruction is gone now.
- MI->eraseFromParent();
+ MI.eraseFromParent();
}
static
@@ -7576,8 +7826,8 @@ static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
/// Emit a post-increment load operation with given size. The instructions
/// will be added to BB at Pos.
-static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos,
- const TargetInstrInfo *TII, DebugLoc dl,
+static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
+ const TargetInstrInfo *TII, const DebugLoc &dl,
unsigned LdSize, unsigned Data, unsigned AddrIn,
unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
@@ -7608,8 +7858,8 @@ static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos,
/// Emit a post-increment store operation with given size. The instructions
/// will be added to BB at Pos.
-static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos,
- const TargetInstrInfo *TII, DebugLoc dl,
+static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
+ const TargetInstrInfo *TII, const DebugLoc &dl,
unsigned StSize, unsigned Data, unsigned AddrIn,
unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
@@ -7637,7 +7887,7 @@ static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos,
}
MachineBasicBlock *
-ARMTargetLowering::EmitStructByval(MachineInstr *MI,
+ARMTargetLowering::EmitStructByval(MachineInstr &MI,
MachineBasicBlock *BB) const {
// This pseudo instruction has 3 operands: dst, src, size
// We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
@@ -7646,11 +7896,11 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator It = ++BB->getIterator();
- unsigned dest = MI->getOperand(0).getReg();
- unsigned src = MI->getOperand(1).getReg();
- unsigned SizeVal = MI->getOperand(2).getImm();
- unsigned Align = MI->getOperand(3).getImm();
- DebugLoc dl = MI->getDebugLoc();
+ unsigned dest = MI.getOperand(0).getReg();
+ unsigned src = MI.getOperand(1).getReg();
+ unsigned SizeVal = MI.getOperand(2).getImm();
+ unsigned Align = MI.getOperand(3).getImm();
+ DebugLoc dl = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -7722,7 +7972,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
srcIn = srcOut;
destIn = destOut;
}
- MI->eraseFromParent(); // The instruction is gone now.
+ MI.eraseFromParent(); // The instruction is gone now.
return BB;
}
@@ -7848,7 +8098,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
// Add epilogue to handle BytesLeft.
BB = exitMBB;
- MachineInstr *StartOfExit = exitMBB->begin();
+ auto StartOfExit = exitMBB->begin();
// [scratch, srcOut] = LDRB_POST(srcLoop, 1)
// [destOut] = STRB_POST(scratch, destLoop, 1)
@@ -7866,16 +8116,16 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
destIn = destOut;
}
- MI->eraseFromParent(); // The instruction is gone now.
+ MI.eraseFromParent(); // The instruction is gone now.
return BB;
}
MachineBasicBlock *
-ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
+ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
MachineBasicBlock *MBB) const {
const TargetMachine &TM = getTargetMachine();
const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
assert(Subtarget->isTargetWindows() &&
"__chkstk is only supported on Windows");
@@ -7930,24 +8180,26 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr),
ARM::SP)
- .addReg(ARM::SP).addReg(ARM::R4)));
+ .addReg(ARM::SP, RegState::Kill)
+ .addReg(ARM::R4, RegState::Kill)
+ .setMIFlags(MachineInstr::FrameSetup)));
- MI->eraseFromParent();
+ MI.eraseFromParent();
return MBB;
}
MachineBasicBlock *
-ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI,
+ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
- MF->push_back(ContBB);
+ MF->insert(++MBB->getIterator(), ContBB);
ContBB->splice(ContBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
- MBB->addSuccessor(ContBB);
+ ContBB->transferSuccessorsAndUpdatePHIs(MBB);
MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
MF->push_back(TrapBB);
@@ -7955,74 +8207,89 @@ ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI,
MBB->addSuccessor(TrapBB);
BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ))
- .addReg(MI->getOperand(0).getReg())
+ .addReg(MI.getOperand(0).getReg())
.addMBB(TrapBB);
+ AddDefaultPred(BuildMI(*MBB, MI, DL, TII->get(ARM::t2B)).addMBB(ContBB));
+ MBB->addSuccessor(ContBB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return ContBB;
}
MachineBasicBlock *
-ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
bool isThumb2 = Subtarget->isThumb2();
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: {
- MI->dump();
+ MI.dump();
llvm_unreachable("Unexpected instr type to insert");
}
+
+ // Thumb1 post-indexed loads are really just single-register LDMs.
+ case ARM::tLDR_postidx: {
+ BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
+ .addOperand(MI.getOperand(1)) // Rn_wb
+ .addOperand(MI.getOperand(2)) // Rn
+ .addOperand(MI.getOperand(3)) // PredImm
+ .addOperand(MI.getOperand(4)) // PredReg
+ .addOperand(MI.getOperand(0)); // Rt
+ MI.eraseFromParent();
+ return BB;
+ }
+
// The Thumb2 pre-indexed stores have the same MI operands, they just
// define them differently in the .td files from the isel patterns, so
// they need pseudos.
case ARM::t2STR_preidx:
- MI->setDesc(TII->get(ARM::t2STR_PRE));
+ MI.setDesc(TII->get(ARM::t2STR_PRE));
return BB;
case ARM::t2STRB_preidx:
- MI->setDesc(TII->get(ARM::t2STRB_PRE));
+ MI.setDesc(TII->get(ARM::t2STRB_PRE));
return BB;
case ARM::t2STRH_preidx:
- MI->setDesc(TII->get(ARM::t2STRH_PRE));
+ MI.setDesc(TII->get(ARM::t2STRH_PRE));
return BB;
case ARM::STRi_preidx:
case ARM::STRBi_preidx: {
- unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ?
- ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM;
+ unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
+ : ARM::STRB_PRE_IMM;
// Decode the offset.
- unsigned Offset = MI->getOperand(4).getImm();
+ unsigned Offset = MI.getOperand(4).getImm();
bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
Offset = ARM_AM::getAM2Offset(Offset);
if (isSub)
Offset = -Offset;
- MachineMemOperand *MMO = *MI->memoperands_begin();
+ MachineMemOperand *MMO = *MI.memoperands_begin();
BuildMI(*BB, MI, dl, TII->get(NewOpc))
- .addOperand(MI->getOperand(0)) // Rn_wb
- .addOperand(MI->getOperand(1)) // Rt
- .addOperand(MI->getOperand(2)) // Rn
- .addImm(Offset) // offset (skip GPR==zero_reg)
- .addOperand(MI->getOperand(5)) // pred
- .addOperand(MI->getOperand(6))
- .addMemOperand(MMO);
- MI->eraseFromParent();
+ .addOperand(MI.getOperand(0)) // Rn_wb
+ .addOperand(MI.getOperand(1)) // Rt
+ .addOperand(MI.getOperand(2)) // Rn
+ .addImm(Offset) // offset (skip GPR==zero_reg)
+ .addOperand(MI.getOperand(5)) // pred
+ .addOperand(MI.getOperand(6))
+ .addMemOperand(MMO);
+ MI.eraseFromParent();
return BB;
}
case ARM::STRr_preidx:
case ARM::STRBr_preidx:
case ARM::STRH_preidx: {
unsigned NewOpc;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("unexpected opcode!");
case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
}
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
- for (unsigned i = 0; i < MI->getNumOperands(); ++i)
- MIB.addOperand(MI->getOperand(i));
- MI->eraseFromParent();
+ for (unsigned i = 0; i < MI.getNumOperands(); ++i)
+ MIB.addOperand(MI.getOperand(i));
+ MI.eraseFromParent();
return BB;
}
@@ -8055,8 +8322,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
- BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB)
- .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg());
+ BuildMI(BB, dl, TII->get(ARM::tBcc))
+ .addMBB(sinkMBB)
+ .addImm(MI.getOperand(3).getImm())
+ .addReg(MI.getOperand(4).getReg());
// copy0MBB:
// %FalseValue = ...
@@ -8070,12 +8339,13 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
BB = sinkMBB;
- BuildMI(*BB, BB->begin(), dl,
- TII->get(ARM::PHI), MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
- .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+ BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(copy0MBB)
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(thisMBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
@@ -8086,10 +8356,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Compare both parts that make up the double comparison separately for
// equality.
- bool RHSisZero = MI->getOpcode() == ARM::BCCZi64;
+ bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
- unsigned LHS1 = MI->getOperand(1).getReg();
- unsigned LHS2 = MI->getOperand(2).getReg();
+ unsigned LHS1 = MI.getOperand(1).getReg();
+ unsigned LHS2 = MI.getOperand(2).getReg();
if (RHSisZero) {
AddDefaultPred(BuildMI(BB, dl,
TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
@@ -8098,8 +8368,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
.addReg(LHS2).addImm(0)
.addImm(ARMCC::EQ).addReg(ARM::CPSR);
} else {
- unsigned RHS1 = MI->getOperand(3).getReg();
- unsigned RHS2 = MI->getOperand(4).getReg();
+ unsigned RHS1 = MI.getOperand(3).getReg();
+ unsigned RHS2 = MI.getOperand(4).getReg();
AddDefaultPred(BuildMI(BB, dl,
TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
.addReg(LHS1).addReg(RHS1));
@@ -8108,9 +8378,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
.addImm(ARMCC::EQ).addReg(ARM::CPSR);
}
- MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB();
+ MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
- if (MI->getOperand(0).getImm() == ARMCC::NE)
+ if (MI.getOperand(0).getImm() == ARMCC::NE)
std::swap(destMBB, exitMBB);
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
@@ -8120,7 +8390,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
else
BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
@@ -8157,9 +8427,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
Fn->insert(BBI, RSBBB);
Fn->insert(BBI, SinkBB);
- unsigned int ABSSrcReg = MI->getOperand(1).getReg();
- unsigned int ABSDstReg = MI->getOperand(0).getReg();
- bool ABSSrcKIll = MI->getOperand(1).isKill();
+ unsigned int ABSSrcReg = MI.getOperand(1).getReg();
+ unsigned int ABSDstReg = MI.getOperand(0).getReg();
+ bool ABSSrcKIll = MI.getOperand(1).isKill();
bool isThumb2 = Subtarget->isThumb2();
MachineRegisterInfo &MRI = Fn->getRegInfo();
// In Thumb mode S must not be specified if source register is the SP or
@@ -8204,7 +8474,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
.addReg(ABSSrcReg).addMBB(BB);
// remove ABS instruction
- MI->eraseFromParent();
+ MI.eraseFromParent();
// return last added BB
return SinkBB;
@@ -8223,38 +8493,38 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
/// when it is expanded into LDM/STM. This is done as a post-isel lowering
/// instead of as a custom inserter because we need the use list from the SDNode.
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
- MachineInstr *MI, const SDNode *Node) {
+ MachineInstr &MI, const SDNode *Node) {
bool isThumb1 = Subtarget->isThumb1Only();
- DebugLoc DL = MI->getDebugLoc();
- MachineFunction *MF = MI->getParent()->getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MI.getParent()->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineInstrBuilder MIB(*MF, MI);
// If the new dst/src is unused mark it as dead.
if (!Node->hasAnyUseOfValue(0)) {
- MI->getOperand(0).setIsDead(true);
+ MI.getOperand(0).setIsDead(true);
}
if (!Node->hasAnyUseOfValue(1)) {
- MI->getOperand(1).setIsDead(true);
+ MI.getOperand(1).setIsDead(true);
}
// The MEMCPY both defines and kills the scratch registers.
- for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) {
+ for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
: &ARM::GPRRegClass);
MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
}
}
-void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
- if (MI->getOpcode() == ARM::MEMCPY) {
+ if (MI.getOpcode() == ARM::MEMCPY) {
attachMEMCPYScratchRegs(Subtarget, MI, Node);
return;
}
- const MCInstrDesc *MCID = &MI->getDesc();
+ const MCInstrDesc *MCID = &MI.getDesc();
// Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
// RSC. Coming out of isel, they have an implicit CPSR def, but the optional
// operand is still set to noreg. If needed, set the optional operand's
@@ -8263,24 +8533,24 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
// e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
// Rename pseudo opcodes.
- unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
+ unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
if (NewOpc) {
const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
MCID = &TII->get(NewOpc);
- assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 &&
+ assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 1 &&
"converted opcode should be the same except for cc_out");
- MI->setDesc(*MCID);
+ MI.setDesc(*MCID);
// Add the optional cc_out operand
- MI->addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
+ MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
}
unsigned ccOutIdx = MCID->getNumOperands() - 1;
// Any ARM instruction that sets the 's' bit should specify an optional
// "cc_out" operand in the last operand position.
- if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
+ if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
assert(!NewOpc && "Optional cc_out operand required");
return;
}
@@ -8288,14 +8558,14 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
// since we already have an optional CPSR def.
bool definesCPSR = false;
bool deadCPSR = false;
- for (unsigned i = MCID->getNumOperands(), e = MI->getNumOperands();
- i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
+ ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
definesCPSR = true;
if (MO.isDead())
deadCPSR = true;
- MI->RemoveOperand(i);
+ MI.RemoveOperand(i);
break;
}
}
@@ -8305,14 +8575,14 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
}
assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
if (deadCPSR) {
- assert(!MI->getOperand(ccOutIdx).getReg() &&
+ assert(!MI.getOperand(ccOutIdx).getReg() &&
"expect uninitialized optional cc_out operand");
return;
}
// If this instruction was defined with an optional CPSR def and its dag node
// had a live implicit CPSR def, then activate the optional CPSR def.
- MachineOperand &MO = MI->getOperand(ccOutIdx);
+ MachineOperand &MO = MI.getOperand(ccOutIdx);
MO.setReg(ARM::CPSR);
MO.setIsDef(true);
}
@@ -8442,16 +8712,12 @@ SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- if (N0.getNode()->hasOneUse()) {
- SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes);
- if (Result.getNode())
+ if (N0.getNode()->hasOneUse())
+ if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
return Result;
- }
- if (N1.getNode()->hasOneUse()) {
- SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes);
- if (Result.getNode())
+ if (N1.getNode()->hasOneUse())
+ if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
return Result;
- }
return SDValue();
}
@@ -8533,7 +8799,7 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
// Get widened type and narrowed type.
MVT widenType;
unsigned numElem = VT.getVectorNumElements();
-
+
EVT inputLaneType = Vec.getValueType().getVectorElementType();
switch (inputLaneType.getSimpleVT().SimpleTy) {
case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
@@ -8559,11 +8825,6 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
- if (Subtarget->isThumb1Only()) return SDValue();
-
- // Only perform the checks after legalize when the pattern is available.
- if (DCI.isBeforeLegalize()) return SDValue();
-
// Look for multiply add opportunities.
// The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
// each add nodes consumes a value from ISD::UMUL_LOHI and there is
@@ -8691,14 +8952,97 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
return resNode;
}
+static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ // UMAAL is similar to UMLAL except that it adds two unsigned values.
+ // While trying to combine for the other MLAL nodes, first search for the
+ // chance to use UMAAL. Check if Addc uses another addc node which can first
+ // be combined into a UMLAL. The other pattern is AddcNode being combined
+ // into an UMLAL and then using another addc is handled in ISelDAGToDAG.
+
+ if (!Subtarget->hasV6Ops())
+ return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+
+ SDNode *PrevAddc = nullptr;
+ if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC)
+ PrevAddc = AddcNode->getOperand(0).getNode();
+ else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC)
+ PrevAddc = AddcNode->getOperand(1).getNode();
+
+ // If there's no addc chains, just return a search for any MLAL.
+ if (PrevAddc == nullptr)
+ return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+
+ // Try to convert the addc operand to an MLAL and if that fails try to
+ // combine AddcNode.
+ SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget);
+ if (MLAL != SDValue(PrevAddc, 0))
+ return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+
+ // Find the converted UMAAL or quit if it doesn't exist.
+ SDNode *UmlalNode = nullptr;
+ SDValue AddHi;
+ if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
+ UmlalNode = AddcNode->getOperand(0).getNode();
+ AddHi = AddcNode->getOperand(1);
+ } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
+ UmlalNode = AddcNode->getOperand(1).getNode();
+ AddHi = AddcNode->getOperand(0);
+ } else {
+ return SDValue();
+ }
+
+ // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
+ // the ADDC as well as Zero.
+ auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3));
+
+ if (!Zero || Zero->getZExtValue() != 0)
+ return SDValue();
+
+ // Check that we have a glued ADDC node.
+ if (AddcNode->getValueType(1) != MVT::Glue)
+ return SDValue();
+
+ // Look for the glued ADDE.
+ SDNode* AddeNode = AddcNode->getGluedUser();
+ if (!AddeNode)
+ return SDValue();
+
+ if ((AddeNode->getOperand(0).getNode() == Zero &&
+ AddeNode->getOperand(1).getNode() == UmlalNode) ||
+ (AddeNode->getOperand(0).getNode() == UmlalNode &&
+ AddeNode->getOperand(1).getNode() == Zero)) {
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
+ UmlalNode->getOperand(2), AddHi };
+ SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
+ DAG.getVTList(MVT::i32, MVT::i32), Ops);
+
+ // Replace the ADDs' nodes uses by the UMAAL node's values.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
+
+ // Return original node to notify the driver to stop replacing.
+ return SDValue(AddcNode, 0);
+ }
+ return SDValue();
+}
+
/// PerformADDCCombine - Target-specific dag combine transform from
-/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL.
+/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL or
+/// ISD::ADDC, ISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
static SDValue PerformADDCCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
- return AddCombineTo64bitMLAL(N, DCI, Subtarget);
+ if (Subtarget->isThumb1Only()) return SDValue();
+ // Only perform the checks after legalize when the pattern is available.
+ if (DCI.isBeforeLegalize()) return SDValue();
+
+ return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
}
/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
@@ -8710,15 +9054,13 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
const ARMSubtarget *Subtarget){
// Attempt to create vpaddl for this add.
- SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget);
- if (Result.getNode())
+ if (SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget))
return Result;
// fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
- if (N0.getNode()->hasOneUse()) {
- SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
- if (Result.getNode()) return Result;
- }
+ if (N0.getNode()->hasOneUse())
+ if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
+ return Result;
return SDValue();
}
@@ -8731,8 +9073,7 @@ static SDValue PerformADDCombine(SDNode *N,
SDValue N1 = N->getOperand(1);
// First try with the default operand order.
- SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget);
- if (Result.getNode())
+ if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
return Result;
// If that didn't work, try again with the operands commuted.
@@ -8747,10 +9088,9 @@ static SDValue PerformSUBCombine(SDNode *N,
SDValue N1 = N->getOperand(1);
// fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
- if (N1.getNode()->hasOneUse()) {
- SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
- if (Result.getNode()) return Result;
- }
+ if (N1.getNode()->hasOneUse())
+ if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
+ return Result;
return SDValue();
}
@@ -8920,8 +9260,7 @@ static SDValue PerformANDCombine(SDNode *N,
if (!Subtarget->isThumb1Only()) {
// fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
- SDValue Result = combineSelectAndUseCommutative(N, true, DCI);
- if (Result.getNode())
+ if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
return Result;
}
@@ -8963,8 +9302,7 @@ static SDValue PerformORCombine(SDNode *N,
if (!Subtarget->isThumb1Only()) {
// fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
- SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
- if (Result.getNode())
+ if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
return Result;
}
@@ -9137,8 +9475,7 @@ static SDValue PerformXORCombine(SDNode *N,
if (!Subtarget->isThumb1Only()) {
// fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
- SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
- if (Result.getNode())
+ if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
return Result;
}
@@ -9300,17 +9637,15 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(LD);
SDValue BasePtr = LD->getBasePtr();
- SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
- LD->getPointerInfo(), LD->isVolatile(),
- LD->isNonTemporal(), LD->isInvariant(),
- LD->getAlignment());
+ SDValue NewLD1 =
+ DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
+ LD->getAlignment(), LD->getMemOperand()->getFlags());
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
DAG.getConstant(4, DL, MVT::i32));
- SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
- LD->getPointerInfo(), LD->isVolatile(),
- LD->isNonTemporal(), LD->isInvariant(),
- std::min(4U, LD->getAlignment() / 2));
+ SDValue NewLD2 = DAG.getLoad(
+ MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(),
+ std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
if (DCI.DAG.getDataLayout().isBigEndian())
@@ -9364,11 +9699,9 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
// into a pair of GPRs, which is fine when the value is used as a scalar,
// but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
SelectionDAG &DAG = DCI.DAG;
- if (N->getNumOperands() == 2) {
- SDValue RV = PerformVMOVDRRCombine(N, DAG);
- if (RV.getNode())
+ if (N->getNumOperands() == 2)
+ if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
return RV;
- }
// Load i64 elements as f64 values so that type legalization does not split
// them up into i32 values.
@@ -9385,7 +9718,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
DCI.AddToWorklist(V.getNode());
}
EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops);
+ SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}
@@ -9434,7 +9767,7 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// Assume only bit cast to i32 will go away.
if (Elt->getOperand(0).getValueType() == MVT::i32)
++NumOfBitCastedElts;
- } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt))
+ } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
// Constants are statically casted, thus do not count them as
// relevant operands.
--NumOfRelevantElts;
@@ -9461,7 +9794,7 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
SDLoc dl(N);
for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
SDValue V = N->getOperand(Idx);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
if (V.getOpcode() == ISD::BITCAST &&
V->getOperand(0).getValueType() == MVT::i32)
@@ -9529,8 +9862,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
SDValue Concat0Op1 = Op0.getOperand(1);
SDValue Concat1Op1 = Op1.getOperand(1);
- if (Concat0Op1.getOpcode() != ISD::UNDEF ||
- Concat1Op1.getOpcode() != ISD::UNDEF)
+ if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
return SDValue();
// Skip the transformation if any of the types are illegal.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -9557,7 +9889,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
NewMask.push_back(NewElt);
}
return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
- DAG.getUNDEF(VT), NewMask.data());
+ DAG.getUNDEF(VT), NewMask);
}
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
@@ -9953,7 +10285,7 @@ static SDValue PerformSTORECombine(SDNode *N,
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
DAG.getUNDEF(WideVec.getValueType()),
- ShuffleVec.data());
+ ShuffleVec);
// At this point all of the data is stored at the bottom of the
// register. We now need to save it to mem.
@@ -9984,8 +10316,8 @@ static SDValue PerformSTORECombine(SDNode *N,
StoreType, ShuffWide,
DAG.getIntPtrConstant(I, DL));
SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), St->getAlignment());
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
Increment);
Chains.push_back(Ch);
@@ -10004,18 +10336,18 @@ static SDValue PerformSTORECombine(SDNode *N,
bool isBigEndian = DAG.getDataLayout().isBigEndian();
SDLoc DL(St);
SDValue BasePtr = St->getBasePtr();
- SDValue NewST1 = DAG.getStore(St->getChain(), DL,
- StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ),
- BasePtr, St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), St->getAlignment());
+ SDValue NewST1 = DAG.getStore(
+ St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
+ BasePtr, St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
DAG.getConstant(4, DL, MVT::i32));
return DAG.getStore(NewST1.getValue(0), DL,
StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
- OffsetPtr, St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(),
- std::min(4U, St->getAlignment() / 2));
+ OffsetPtr, St->getPointerInfo(),
+ std::min(4U, St->getAlignment() / 2),
+ St->getMemOperand()->getFlags());
}
if (StVal.getValueType() == MVT::i64 &&
@@ -10038,9 +10370,8 @@ static SDValue PerformSTORECombine(SDNode *N,
DCI.AddToWorklist(ExtElt.getNode());
DCI.AddToWorklist(V.getNode());
return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), St->getAlignment(),
- St->getAAInfo());
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags(), St->getAAInfo());
}
// If this is a legal vector store, try to combine it into a VST1_UPD.
@@ -10066,7 +10397,8 @@ static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
SDValue Op = N->getOperand(0);
- if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL)
+ if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
+ Op.getOpcode() != ISD::FMUL)
return SDValue();
SDValue ConstVec = Op->getOperand(1);
@@ -10123,7 +10455,7 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
SDValue Op = N->getOperand(0);
unsigned OpOpcode = Op.getNode()->getOpcode();
- if (!N->getValueType(0).isVector() ||
+ if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
(OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
return SDValue();
@@ -10464,7 +10796,7 @@ static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero,
// The operand to BFI is already a mask suitable for removing the bits it
// sets.
ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
- APInt Mask = CI->getAPIntValue();
+ const APInt &Mask = CI->getAPIntValue();
KnownZero &= Mask;
KnownOne &= Mask;
return;
@@ -10522,7 +10854,7 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
} else {
assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
}
-
+
if (Op1->getOpcode() != ISD::OR)
return SDValue();
@@ -10552,7 +10884,7 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
SDLoc dl(X);
EVT VT = X.getValueType();
unsigned BitInX = AndC->getAPIntValue().logBase2();
-
+
if (BitInX != 0) {
// We must shift X first.
X = DAG.getNode(ISD::SRL, dl, VT, X,
@@ -10573,6 +10905,46 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
return V;
}
+/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
+SDValue
+ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
+ SDValue Cmp = N->getOperand(4);
+ if (Cmp.getOpcode() != ARMISD::CMPZ)
+ // Only looking at NE cases.
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ SDValue LHS = Cmp.getOperand(0);
+ SDValue RHS = Cmp.getOperand(1);
+ SDValue Chain = N->getOperand(0);
+ SDValue BB = N->getOperand(1);
+ SDValue ARMcc = N->getOperand(2);
+ ARMCC::CondCodes CC =
+ (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+
+ // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
+ // -> (brcond Chain BB CC CPSR Cmp)
+ if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
+ LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
+ LHS->getOperand(0)->hasOneUse()) {
+ auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
+ auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
+ auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+ auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
+ if ((LHS00C && LHS00C->getZExtValue() == 0) &&
+ (LHS01C && LHS01C->getZExtValue() == 1) &&
+ (LHS1C && LHS1C->getZExtValue() == 1) &&
+ (RHSC && RHSC->getZExtValue() == 0)) {
+ return DAG.getNode(
+ ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
+ LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
+ }
+ }
+
+ return SDValue();
+}
+
/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
SDValue
ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
@@ -10626,6 +10998,21 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
N->getOperand(3), NewCmp);
}
+ // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
+ // -> (cmov F T CC CPSR Cmp)
+ if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
+ auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
+ auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+ auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
+ if ((LHS0C && LHS0C->getZExtValue() == 0) &&
+ (LHS1C && LHS1C->getZExtValue() == 1) &&
+ (RHSC && RHSC->getZExtValue() == 0)) {
+ return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
+ LHS->getOperand(2), LHS->getOperand(3),
+ LHS->getOperand(4));
+ }
+ }
+
if (Res.getNode()) {
APInt KnownZero, KnownOne;
DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne);
@@ -10676,6 +11063,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
+ case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
case ISD::LOAD: return PerformLOADCombine(N, DCI);
case ARMISD::VLD2DUP:
case ARMISD::VLD3DUP:
@@ -11198,22 +11586,37 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const {
- if (Subtarget->isThumb1Only())
- return false;
-
EVT VT;
SDValue Ptr;
- bool isSEXTLoad = false;
+ bool isSEXTLoad = false, isNonExt;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+ isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
+ isNonExt = !ST->isTruncatingStore();
} else
return false;
+ if (Subtarget->isThumb1Only()) {
+ // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
+ // must be non-extending/truncating, i32, with an offset of 4.
+ assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
+ if (Op->getOpcode() != ISD::ADD || !isNonExt)
+ return false;
+ auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!RHS || RHS->getZExtValue() != 4)
+ return false;
+
+ Offset = Op->getOperand(1);
+ Base = Op->getOperand(0);
+ AM = ISD::POST_INC;
+ return true;
+ }
+
bool isInc;
bool isLegal = false;
if (Subtarget->isThumb2())
@@ -11322,6 +11725,26 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
return false;
}
+const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
+ // At this point, we have to lower this constraint to something else, so we
+ // lower it to an "r" or "w". However, by doing this we will force the result
+ // to be in register, while the X constraint is much more permissive.
+ //
+ // Although we are correct (we are free to emit anything, without
+ // constraints), we might break use cases that would expect us to be more
+ // efficient and emit something else.
+ if (!Subtarget->hasVFP2())
+ return "r";
+ if (ConstraintVT.isFloatingPoint())
+ return "w";
+ if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
+ (ConstraintVT.getSizeInBits() == 64 ||
+ ConstraintVT.getSizeInBits() == 128))
+ return "w";
+
+ return "r";
+}
+
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
ARMTargetLowering::ConstraintType
@@ -11640,7 +12063,8 @@ static TargetLowering::ArgListTy getDivRemArgList(
}
SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
- assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) &&
+ assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
+ Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI()) &&
"Register-based DivRem lowering only");
unsigned Opcode = Op->getOpcode();
assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
@@ -11664,7 +12088,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(InChain)
- .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
+ .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
.setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
@@ -11702,7 +12126,7 @@ SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
// Lower call
CallLoweringInfo CLI(DAG);
CLI.setChain(InChain)
- .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args), 0)
+ .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
.setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
@@ -11950,23 +12374,20 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
AtomicOrdering Ord, bool IsStore,
bool IsLoad) const {
- if (!getInsertFencesForAtomic())
- return nullptr;
-
switch (Ord) {
- case NotAtomic:
- case Unordered:
+ case AtomicOrdering::NotAtomic:
+ case AtomicOrdering::Unordered:
llvm_unreachable("Invalid fence: unordered/non-atomic");
- case Monotonic:
- case Acquire:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Acquire:
return nullptr; // Nothing to do
- case SequentiallyConsistent:
+ case AtomicOrdering::SequentiallyConsistent:
if (!IsStore)
return nullptr; // Nothing to do
/*FALLTHROUGH*/
- case Release:
- case AcquireRelease:
- if (Subtarget->isSwift())
+ case AtomicOrdering::Release:
+ case AtomicOrdering::AcquireRelease:
+ if (Subtarget->preferISHSTBarriers())
return makeDMB(Builder, ARM_MB::ISHST);
// FIXME: add a comment with a link to documentation justifying this.
else
@@ -11978,19 +12399,16 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
AtomicOrdering Ord, bool IsStore,
bool IsLoad) const {
- if (!getInsertFencesForAtomic())
- return nullptr;
-
switch (Ord) {
- case NotAtomic:
- case Unordered:
+ case AtomicOrdering::NotAtomic:
+ case AtomicOrdering::Unordered:
llvm_unreachable("Invalid fence: unordered/not-atomic");
- case Monotonic:
- case Release:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Release:
return nullptr; // Nothing to do
- case Acquire:
- case AcquireRelease:
- case SequentiallyConsistent:
+ case AtomicOrdering::Acquire:
+ case AtomicOrdering::AcquireRelease:
+ case AtomicOrdering::SequentiallyConsistent:
return makeDMB(Builder, ARM_MB::ISH);
}
llvm_unreachable("Unknown fence ordering in emitTrailingFence");
@@ -12031,7 +12449,17 @@ ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *AI) const {
- return true;
+ // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+ // implement cmpxchg without spilling. If the address being exchanged is also
+ // on the stack and close enough to the spill slot, this can lead to a
+ // situation where the monitor always gets cleared and the atomic operation
+ // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
+ return getTargetMachine().getOptLevel() != 0;
+}
+
+bool ARMTargetLowering::shouldInsertFencesForAtomic(
+ const Instruction *I) const {
+ return InsertFencesForAtomic;
}
// This has so far only been implemented for MachO.
@@ -12080,7 +12508,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
- bool IsAcquire = isAtLeastAcquire(Ord);
+ bool IsAcquire = isAcquireOrStronger(Ord);
// Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
// intrinsic must return {i32, i32} and we have to recombine them into a
@@ -12124,7 +12552,7 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- bool IsRelease = isAtLeastRelease(Ord);
+ bool IsRelease = isReleaseOrStronger(Ord);
// Since the intrinsics must have legal type, the i64 intrinsics take two
// parameters: "i32, i32". We must marshal Val into the appropriate form
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 96b56c3ec330..4906686616bc 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -43,7 +43,6 @@ namespace llvm {
CALL, // Function call.
CALL_PRED, // Function call that's predicable.
CALL_NOLINK, // Function call with branch not branch-and-link.
- tCALL, // Thumb function call.
BRCOND, // Conditional branch.
BR_JT, // Jumptable branch.
BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump).
@@ -61,6 +60,8 @@ namespace llvm {
CMOV, // ARM conditional move instructions.
+ SSAT, // Signed saturation
+
BCC_i64,
SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out.
@@ -164,6 +165,7 @@ namespace llvm {
UMLAL, // 64bit Unsigned Accumulate Multiply
SMLAL, // 64bit Signed Accumulate Multiply
+ UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply
// Operands of the standard BUILD_VECTOR node are not legalized, which
// is fine if BUILD_VECTORs are always lowered to shuffles or other
@@ -251,13 +253,14 @@ namespace llvm {
EVT VT) const override;
MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const override;
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
- void AdjustInstrPostInstrSelection(MachineInstr *MI,
+ void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const override;
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
+ SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
@@ -335,6 +338,8 @@ namespace llvm {
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
+ const char *LowerXConstraint(EVT ConstraintVT) const override;
+
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops. If hasMemory is
/// true it means one of the asm constraint of the inline asm instruction
@@ -453,6 +458,7 @@ namespace llvm {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
@@ -468,6 +474,14 @@ namespace llvm {
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
+ bool supportSwiftError() const override {
+ return true;
+ }
+
+ bool hasStandaloneRem(EVT VT) const override {
+ return HasStandaloneRem;
+ }
+
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -486,29 +500,34 @@ namespace llvm {
///
unsigned ARMPCLabelIndex;
+ // TODO: remove this, and have shouldInsertFencesForAtomic do the proper
+ // check.
+ bool InsertFencesForAtomic;
+
+ bool HasStandaloneRem = true;
+
void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const;
typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
- void PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
- SDValue Chain, SDValue &Arg,
- RegsToPassVector &RegsToPass,
+ void PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain,
+ SDValue &Arg, RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
SDValue &StackPtr,
SmallVectorImpl<SDValue> &MemOpChains,
ISD::ArgFlagsTy Flags) const;
SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG,
- SDLoc dl) const;
+ const SDLoc &dl) const;
CallingConv::ID getEffectiveCallingConv(CallingConv::ID CC,
bool isVarArg) const;
CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return,
bool isVarArg) const;
SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
- SDLoc dl, SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const;
SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
@@ -527,6 +546,7 @@ namespace llvm {
SelectionDAG &DAG,
TLSModel::Model model) const;
SDValue LowerGlobalTLSAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddressWindows(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
@@ -576,9 +596,9 @@ namespace llvm {
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
- bool isThisReturn, SDValue ThisVal) const;
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ SDValue ThisVal) const;
bool supportSplitCSR(MachineFunction *MF) const override {
return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
@@ -590,23 +610,19 @@ namespace llvm {
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
SDValue
- LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
-
- int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
- SDLoc dl, SDValue &Chain,
- const Value *OrigArg,
- unsigned InRegsParamRecordIdx,
- int ArgOffset,
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &dl,
+ SDValue &Chain, const Value *OrigArg,
+ unsigned InRegsParamRecordIdx, int ArgOffset,
unsigned ArgSize) const;
void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
- SDLoc dl, SDValue &Chain,
- unsigned ArgOffset,
- unsigned TotalArgRegsSaveSize,
+ const SDLoc &dl, SDValue &Chain,
+ unsigned ArgOffset, unsigned TotalArgRegsSaveSize,
bool ForceMutable = false) const;
SDValue
@@ -634,42 +650,39 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;
- SDValue
- LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const override;
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
bool mayBeEmittedAsTailCall(CallInst *CI) const override;
- SDValue getCMOV(SDLoc dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
+ SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
SDValue ARMcc, SDValue CCR, SDValue Cmp,
SelectionDAG &DAG) const;
SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDValue &ARMcc, SelectionDAG &DAG, SDLoc dl) const;
- SDValue getVFPCmp(SDValue LHS, SDValue RHS,
- SelectionDAG &DAG, SDLoc dl) const;
+ SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const;
+ SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
+ const SDLoc &dl) const;
SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
- void SetupEntryBlockForSjLj(MachineInstr *MI,
- MachineBasicBlock *MBB,
+ void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB, int FI) const;
- void EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const;
+ void EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const;
- bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const;
+ bool RemapAddSubWithFlags(MachineInstr &MI, MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitStructByval(MachineInstr *MI,
+ MachineBasicBlock *EmitStructByval(MachineInstr &MI,
MachineBasicBlock *MBB) const;
- MachineBasicBlock *EmitLowered__chkstk(MachineInstr *MI,
+ MachineBasicBlock *EmitLowered__chkstk(MachineInstr &MI,
MachineBasicBlock *MBB) const;
- MachineBasicBlock *EmitLowered__dbzchk(MachineInstr *MI,
+ MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI,
MachineBasicBlock *MBB) const;
};
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index e79608d360ca..37a83f70a1fb 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -246,23 +246,33 @@ def shr_imm64 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 64; }]> {
let ParserMatchClass = shr_imm64_asm_operand;
}
+
+// ARM Assembler operand for ldr Rd, =expression which generates an offset
+// to a constant pool entry or a MOV depending on the value of expression
+def const_pool_asm_operand : AsmOperandClass { let Name = "ConstPoolAsmImm"; }
+def const_pool_asm_imm : Operand<i32> {
+ let ParserMatchClass = const_pool_asm_operand;
+}
+
+
//===----------------------------------------------------------------------===//
// ARM Assembler alias templates.
//
-class ARMInstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit>, Requires<[IsARM]>;
-class tInstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit>, Requires<[IsThumb]>;
-class t2InstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit>, Requires<[IsThumb2]>;
-class VFP2InstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2]>;
-class VFP2DPInstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2,HasDPVFP]>;
-class VFP3InstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit>, Requires<[HasVFP3]>;
-class NEONInstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit>, Requires<[HasNEON]>;
+// Note: When EmitPriority == 1, the alias will be used for printing
+class ARMInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[IsARM]>;
+class tInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[IsThumb]>;
+class t2InstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[IsThumb2]>;
+class VFP2InstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP2]>;
+class VFP2DPInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP2,HasDPVFP]>;
+class VFP3InstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP3]>;
+class NEONInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[HasNEON]>;
class VFP2MnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>,
@@ -563,12 +573,12 @@ class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
class AIldaex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: AIldr_ex_or_acq<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
- Requires<[IsARM, HasV8]>;
+ Requires<[IsARM, HasAcquireRelease, HasV7Clrex]>;
class AIstlex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: AIstr_ex_or_rel<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
- Requires<[IsARM, HasV8]> {
+ Requires<[IsARM, HasAcquireRelease, HasV7Clrex]> {
bits<4> Rd;
let Inst{15-12} = Rd;
}
@@ -593,12 +603,12 @@ class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern>
class AIldracq<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: AIldr_ex_or_acq<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
- Requires<[IsARM, HasV8]>;
+ Requires<[IsARM, HasAcquireRelease]>;
class AIstrrel<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: AIstr_ex_or_rel<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
- Requires<[IsARM, HasV8]> {
+ Requires<[IsARM, HasAcquireRelease]> {
let Inst{15-12} = 0b1111;
}
@@ -1379,11 +1389,6 @@ class T2Ipostldst<bit signed, bits<2> opcod, bit load, bit pre,
let DecoderMethod = "DecodeT2LdStPre";
}
-// Tv5Pat - Same as Pat<>, but requires V5T Thumb mode.
-class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> {
- list<Predicate> Predicates = [IsThumb, IsThumb1Only, HasV5T];
-}
-
// T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode.
class T1Pat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb, IsThumb1Only];
@@ -1495,6 +1500,32 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
let D = VFPNeonDomain;
}
+class AHI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+ InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : VFPI<oops, iops, AddrMode5, 4, IndexModeNone,
+ VFPLdStFrm, itin, opc, asm, "", pattern> {
+ list<Predicate> Predicates = [HasFullFP16];
+
+ // Instruction operands.
+ bits<5> Sd;
+ bits<13> addr;
+
+ // Encode instruction operands.
+ let Inst{23} = addr{8}; // U (add = (U == '1'))
+ let Inst{22} = Sd{0};
+ let Inst{19-16} = addr{12-9}; // Rn
+ let Inst{15-12} = Sd{4-1};
+ let Inst{7-0} = addr{7-0}; // imm8
+
+ let Inst{27-24} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{11-8} = 0b1001; // Half precision
+
+ // Loads & stores operate on both NEON and VFP pipelines.
+ let D = VFPNeonDomain;
+}
+
// VFP Load / store multiple pseudo instructions.
class PseudoVFPLdStM<dag oops, dag iops, InstrItinClass itin, string cstr,
list<dag> pattern>
@@ -1817,6 +1848,114 @@ class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
let Inst{22} = Sd{0};
}
+// Half precision, unary, predicated
+class AHuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+ bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+ string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+ list<Predicate> Predicates = [HasFullFP16];
+
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{19-16} = opcod3;
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{7-6} = opcod4;
+ let Inst{4} = opcod5;
+}
+
+// Half precision, unary, non-predicated
+class AHuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+ bit opcod5, dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+ VFPUnaryFrm, itin, asm, "", pattern> {
+ list<Predicate> Predicates = [HasFullFP16];
+
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sm;
+
+ let Inst{31-28} = 0b1111;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{19-16} = opcod3;
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{7-6} = opcod4;
+ let Inst{4} = opcod5;
+}
+
+// Half precision, binary
+class AHbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+ list<Predicate> Predicates = [HasFullFP16];
+
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sn;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{6} = op6;
+ let Inst{4} = op4;
+}
+
+// Half precision, binary, not predicated
+class AHbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+ InstrItinClass itin, string asm, list<dag> pattern>
+ : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+ VFPBinaryFrm, itin, asm, "", pattern> {
+ list<Predicate> Predicates = [HasFullFP16];
+
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sn;
+ bits<5> Sm;
+
+ let Inst{31-28} = 0b1111;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{6} = opcod3;
+ let Inst{4} = 0;
+}
+
// VFP conversion instructions
class AVConv1I<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4,
dag oops, dag iops, InstrItinClass itin, string opc, string asm,
@@ -2321,22 +2460,25 @@ class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> {
}
// VFP/NEON Instruction aliases for type suffices.
-class VFPDataTypeInstAlias<string opc, string dt, string asm, dag Result> :
- InstAlias<!strconcat(opc, dt, "\t", asm), Result>, Requires<[HasVFP2]>;
+// Note: When EmitPriority == 1, the alias will be used for printing
+class VFPDataTypeInstAlias<string opc, string dt, string asm, dag Result, bit EmitPriority = 0> :
+ InstAlias<!strconcat(opc, dt, "\t", asm), Result, EmitPriority>, Requires<[HasVFP2]>;
-multiclass VFPDTAnyInstAlias<string opc, string asm, dag Result> {
- def : VFPDataTypeInstAlias<opc, ".8", asm, Result>;
- def : VFPDataTypeInstAlias<opc, ".16", asm, Result>;
- def : VFPDataTypeInstAlias<opc, ".32", asm, Result>;
- def : VFPDataTypeInstAlias<opc, ".64", asm, Result>;
+// Note: When EmitPriority == 1, the alias will be used for printing
+multiclass VFPDTAnyInstAlias<string opc, string asm, dag Result, bit EmitPriority = 0> {
+ def : VFPDataTypeInstAlias<opc, ".8", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".16", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".32", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".64", asm, Result, EmitPriority>;
}
-multiclass NEONDTAnyInstAlias<string opc, string asm, dag Result> {
+// Note: When EmitPriority == 1, the alias will be used for printing
+multiclass NEONDTAnyInstAlias<string opc, string asm, dag Result, bit EmitPriority = 0> {
let Predicates = [HasNEON] in {
- def : VFPDataTypeInstAlias<opc, ".8", asm, Result>;
- def : VFPDataTypeInstAlias<opc, ".16", asm, Result>;
- def : VFPDataTypeInstAlias<opc, ".32", asm, Result>;
- def : VFPDataTypeInstAlias<opc, ".64", asm, Result>;
+ def : VFPDataTypeInstAlias<opc, ".8", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".16", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".32", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".64", asm, Result, EmitPriority>;
}
}
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index cf973d68085f..98b1b4ca4272 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -90,29 +90,29 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
return 0;
}
-void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
- Reloc::Model RM) const {
+void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
MachineFunction &MF = *MI->getParent()->getParent();
const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();
+ const TargetMachine &TM = MF.getTarget();
if (!Subtarget.useMovt(MF)) {
- if (RM == Reloc::PIC_)
- expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12, RM);
+ if (TM.isPositionIndependent())
+ expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12);
else
- expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_abs, ARM::LDRi12, RM);
+ expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_abs, ARM::LDRi12);
return;
}
- if (RM != Reloc::PIC_) {
- expandLoadStackGuardBase(MI, ARM::MOVi32imm, ARM::LDRi12, RM);
+ if (!TM.isPositionIndependent()) {
+ expandLoadStackGuardBase(MI, ARM::MOVi32imm, ARM::LDRi12);
return;
}
const GlobalValue *GV =
cast<GlobalValue>((*MI->memoperands_begin())->getValue());
- if (!Subtarget.GVIsIndirectSymbol(GV, RM)) {
- expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12, RM);
+ if (!Subtarget.isGVIndirectSymbol(GV)) {
+ expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12);
return;
}
@@ -123,9 +123,9 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg)
.addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
- unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
+ auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
- MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4);
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
MIB.addMemOperand(MMO);
MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg);
MIB.addReg(Reg, RegState::Kill).addImm(0);
diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h
index 90f34ea08401..4b1b7097b18d 100644
--- a/lib/Target/ARM/ARMInstrInfo.h
+++ b/lib/Target/ARM/ARMInstrInfo.h
@@ -39,8 +39,7 @@ public:
const ARMRegisterInfo &getRegisterInfo() const override { return RI; }
private:
- void expandLoadStackGuard(MachineBasicBlock::iterator MI,
- Reloc::Model RM) const override;
+ void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
};
}
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index c446ba3109e4..060376b0a273 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -90,12 +90,6 @@ def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
SDTCisVT<1, i32>,
SDTCisVT<4, i32>]>;
-def SDT_ARM64bitmlal : SDTypeProfile<2,4, [ SDTCisVT<0, i32>, SDTCisVT<1, i32>,
- SDTCisVT<2, i32>, SDTCisVT<3, i32>,
- SDTCisVT<4, i32>, SDTCisVT<5, i32> ] >;
-def ARMUmlal : SDNode<"ARMISD::UMLAL", SDT_ARM64bitmlal>;
-def ARMSmlal : SDNode<"ARMISD::SMLAL", SDT_ARM64bitmlal>;
-
// Node definitions.
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>;
def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>;
@@ -128,6 +122,8 @@ def ARMintretflag : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall,
def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
[SDNPInGlue]>;
+def ARMssatnoshift : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>;
+
def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
@@ -201,6 +197,12 @@ def NoV6 : Predicate<"!Subtarget->hasV6Ops()">;
def HasV6M : Predicate<"Subtarget->hasV6MOps()">,
AssemblerPredicate<"HasV6MOps",
"armv6m or armv6t2">;
+def HasV8MBaseline : Predicate<"Subtarget->hasV8MBaselineOps()">,
+ AssemblerPredicate<"HasV8MBaselineOps",
+ "armv8m.base">;
+def HasV8MMainline : Predicate<"Subtarget->hasV8MMainlineOps()">,
+ AssemblerPredicate<"HasV8MMainlineOps",
+ "armv8m.main">;
def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">,
AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">;
@@ -235,6 +237,8 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
AssemblerPredicate<"FeatureCrypto", "crypto">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
AssemblerPredicate<"FeatureCRC", "crc">;
+def HasRAS : Predicate<"Subtarget->hasRAS()">,
+ AssemblerPredicate<"FeatureRAS", "ras">;
def HasFP16 : Predicate<"Subtarget->hasFP16()">,
AssemblerPredicate<"FeatureFP16","half-float conversions">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
@@ -251,6 +255,12 @@ def HasDSP : Predicate<"Subtarget->hasDSP()">,
def HasDB : Predicate<"Subtarget->hasDataBarrier()">,
AssemblerPredicate<"FeatureDB",
"data-barriers">;
+def HasV7Clrex : Predicate<"Subtarget->hasV7Clrex()">,
+ AssemblerPredicate<"FeatureV7Clrex",
+ "v7 clrex">;
+def HasAcquireRelease : Predicate<"Subtarget->hasAcquireRelease()">,
+ AssemblerPredicate<"FeatureAcquireRelease",
+ "acquire/release">;
def HasMP : Predicate<"Subtarget->hasMPExtension()">,
AssemblerPredicate<"FeatureMP",
"mp-extensions">;
@@ -260,6 +270,9 @@ def HasVirtualization: Predicate<"false">,
def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">,
AssemblerPredicate<"FeatureTrustZone",
"TrustZone">;
+def Has8MSecExt : Predicate<"Subtarget->has8MSecExt()">,
+ AssemblerPredicate<"Feature8MSecExt",
+ "ARMv8-M Security Extensions">;
def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
@@ -279,6 +292,8 @@ def IsARM : Predicate<"!Subtarget->isThumb()">,
def IsMachO : Predicate<"Subtarget->isTargetMachO()">;
def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
+def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
+def IsNotWindows : Predicate<"!Subtarget->isTargetWindows()">;
def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">,
AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">;
@@ -301,19 +316,16 @@ def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion =="
" Subtarget->hasVFP4()) || "
"Subtarget->isTargetDarwin()">;
-// VGETLNi32 is microcoded on Swift - prefer VMOV.
-def HasFastVGETLNi32 : Predicate<"!Subtarget->isSwift()">;
-def HasSlowVGETLNi32 : Predicate<"Subtarget->isSwift()">;
+def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
+def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
-// VDUP.32 is microcoded on Swift - prefer VMOV.
-def HasFastVDUP32 : Predicate<"!Subtarget->isSwift()">;
-def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">;
+def HasFastVDUP32 : Predicate<"!Subtarget->hasSlowVDUP32()">;
+def HasSlowVDUP32 : Predicate<"Subtarget->hasSlowVDUP32()">;
-// Cortex-A9 prefers VMOVSR to VMOVDRR even when using NEON for scalar FP, as
-// this allows more effective execution domain optimization. See
-// setExecutionDomain().
-def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">;
-def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">;
+def UseVMOVSR : Predicate<"Subtarget->preferVMOVSR() ||"
+ "!Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseVMOVSR : Predicate<"!Subtarget->preferVMOVSR() &&"
+ "Subtarget->useNEONForSinglePrecisionFP()">;
def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">;
def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">;
@@ -360,8 +372,6 @@ def lo16AllZero : PatLeaf<(i32 imm), [{
return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
}], hi16>;
-class BinOpWithFlagFrag<dag res> :
- PatFrag<(ops node:$LHS, node:$RHS, node:$FLAG), res>;
class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
@@ -408,34 +418,35 @@ def brtarget : Operand<OtherVT> {
let DecoderMethod = "DecodeT2BROperand";
}
-// FIXME: get rid of this one?
-def uncondbrtarget : Operand<OtherVT> {
- let EncoderMethod = "getUnconditionalBranchTargetOpValue";
- let OperandType = "OPERAND_PCREL";
+// Branches targeting ARM-mode must be divisible by 4 if they're a raw
+// immediate.
+def ARMBranchTarget : AsmOperandClass {
+ let Name = "ARMBranchTarget";
}
-// Branch target for ARM. Handles conditional/unconditional
-def br_target : Operand<OtherVT> {
- let EncoderMethod = "getARMBranchTargetOpValue";
- let OperandType = "OPERAND_PCREL";
+// Branches targeting Thumb-mode must be divisible by 2 if they're a raw
+// immediate.
+def ThumbBranchTarget : AsmOperandClass {
+ let Name = "ThumbBranchTarget";
}
-// Call target.
-// FIXME: rename bltarget to t2_bl_target?
-def bltarget : Operand<i32> {
- // Encoded the same as branch targets.
- let EncoderMethod = "getBranchTargetOpValue";
+def arm_br_target : Operand<OtherVT> {
+ let ParserMatchClass = ARMBranchTarget;
+ let EncoderMethod = "getARMBranchTargetOpValue";
let OperandType = "OPERAND_PCREL";
}
// Call target for ARM. Handles conditional/unconditional
// FIXME: rename bl_target to t2_bltarget?
-def bl_target : Operand<i32> {
+def arm_bl_target : Operand<i32> {
+ let ParserMatchClass = ARMBranchTarget;
let EncoderMethod = "getARMBLTargetOpValue";
let OperandType = "OPERAND_PCREL";
}
-def blx_target : Operand<i32> {
+// Target for BLX *from* ARM mode.
+def arm_blx_target : Operand<i32> {
+ let ParserMatchClass = ThumbBranchTarget;
let EncoderMethod = "getARMBLXTargetOpValue";
let OperandType = "OPERAND_PCREL";
}
@@ -981,6 +992,21 @@ def addrmode5_pre : AddrMode5 {
let PrintMethod = "printAddrMode5Operand<true>";
}
+// addrmode5fp16 := reg +/- imm8*2
+//
+def AddrMode5FP16AsmOperand : AsmOperandClass { let Name = "AddrMode5FP16"; }
+class AddrMode5FP16 : Operand<i32>,
+ ComplexPattern<i32, 2, "SelectAddrMode5FP16", []> {
+ let EncoderMethod = "getAddrMode5FP16OpValue";
+ let DecoderMethod = "DecodeAddrMode5FP16Operand";
+ let ParserMatchClass = AddrMode5FP16AsmOperand;
+ let MIOperandInfo = (ops GPR:$base, i32imm);
+}
+
+def addrmode5fp16 : AddrMode5FP16 {
+ let PrintMethod = "printAddrMode5FP16Operand<false>";
+}
+
// addrmode6 := reg with optional alignment
//
def AddrMode6AsmOperand : AsmOperandClass { let Name = "AlignedMemory"; }
@@ -1224,7 +1250,7 @@ include "ARMInstrFormats.td"
let TwoOperandAliasConstraint = "$Rn = $Rd" in
multiclass AsI1_bin_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, bit Commutable = 0> {
+ SDPatternOperator opnode, bit Commutable = 0> {
// The register-immediate version is re-materializable. This is useful
// in particular for taking the address of a local.
let isReMaterializable = 1 in {
@@ -1297,7 +1323,7 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc,
let TwoOperandAliasConstraint = "$Rn = $Rd" in
multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, bit Commutable = 0> {
+ SDNode opnode, bit Commutable = 0> {
// The register-immediate version is re-materializable. This is useful
// in particular for taking the address of a local.
let isReMaterializable = 1 in {
@@ -1369,7 +1395,7 @@ multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
let hasPostISelHook = 1, Defs = [CPSR] in {
multiclass AsI1_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
- InstrItinClass iis, PatFrag opnode,
+ InstrItinClass iis, SDNode opnode,
bit Commutable = 0> {
def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p),
4, iii,
@@ -1402,7 +1428,7 @@ multiclass AsI1_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
/// operands are reversed.
let hasPostISelHook = 1, Defs = [CPSR] in {
multiclass AsI1_rbin_s_is<InstrItinClass iii, InstrItinClass iir,
- InstrItinClass iis, PatFrag opnode,
+ InstrItinClass iis, SDNode opnode,
bit Commutable = 0> {
def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p),
4, iii,
@@ -1431,8 +1457,8 @@ multiclass AsI1_rbin_s_is<InstrItinClass iii, InstrItinClass iir,
let isCompare = 1, Defs = [CPSR] in {
multiclass AI1_cmp_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, bit Commutable = 0,
- string rrDecoderMethod = ""> {
+ SDPatternOperator opnode, bit Commutable = 0,
+ string rrDecoderMethod = ""> {
def ri : AI1<opcod, (outs), (ins GPR:$Rn, mod_imm:$imm), DPFrm, iii,
opc, "\t$Rn, $imm",
[(opnode GPR:$Rn, mod_imm:$imm)]>,
@@ -1561,7 +1587,7 @@ class AI_exta_rrot_np<bits<8> opcod, string opc>
/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube.
let TwoOperandAliasConstraint = "$Rn = $Rd" in
-multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
+multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, SDNode opnode,
bit Commutable = 0> {
let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm),
@@ -1632,7 +1658,7 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
/// AI1_rsc_irs - Define instructions and patterns for rsc
let TwoOperandAliasConstraint = "$Rn = $Rd" in
-multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode> {
+multiclass AI1_rsc_irs<bits<4> opcod, string opc, SDNode opnode> {
let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm),
DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
@@ -1880,6 +1906,7 @@ def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
bits<8> imm;
let Inst{27-8} = 0b00110010000011110000;
let Inst{7-0} = imm;
+ let DecoderMethod = "DecodeHINTInstruction";
}
def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6K]>;
@@ -1888,6 +1915,7 @@ def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>;
+def : InstAlias<"esb$p", (HINT 16, pred:$p)>, Requires<[IsARM, HasRAS]>;
def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
"\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
@@ -1915,7 +1943,7 @@ def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
let Inst{7-4} = 0b0111;
}
// default immediate for breakpoint mnemonic
-def : InstAlias<"bkpt", (BKPT 0)>, Requires<[IsARM]>;
+def : InstAlias<"bkpt", (BKPT 0), 0>, Requires<[IsARM]>;
def HLT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
"hlt", "\t$val", []>, Requires<[IsARM, HasV8]> {
@@ -2181,7 +2209,7 @@ let isCall = 1,
// at least be a pseudo instruction expanding to the predicated version
// at MC lowering time.
Defs = [LR], Uses = [SP] in {
- def BL : ABXI<0b1011, (outs), (ins bl_target:$func),
+ def BL : ABXI<0b1011, (outs), (ins arm_bl_target:$func),
IIC_Br, "bl\t$func",
[(ARMcall tglobaladdr:$func)]>,
Requires<[IsARM]>, Sched<[WriteBrL]> {
@@ -2191,7 +2219,7 @@ let isCall = 1,
let DecoderMethod = "DecodeBranchImmInstruction";
}
- def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func),
+ def BL_pred : ABI<0b1011, (outs), (ins arm_bl_target:$func),
IIC_Br, "bl", "\t$func",
[(ARMcall_pred tglobaladdr:$func)]>,
Requires<[IsARM]>, Sched<[WriteBrL]> {
@@ -2232,7 +2260,7 @@ let isCall = 1,
// mov lr, pc; b if callee is marked noreturn to avoid confusing the
// return stack predictor.
- def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins bl_target:$func),
+ def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins arm_bl_target:$func),
8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
Requires<[IsARM]>, Sched<[WriteBr]>;
}
@@ -2240,7 +2268,7 @@ let isCall = 1,
let isBranch = 1, isTerminator = 1 in {
// FIXME: should be able to write a pattern for ARMBrcond, but can't use
// a two-value operand where a dag node expects two operands. :(
- def Bcc : ABI<0b1010, (outs), (ins br_target:$target),
+ def Bcc : ABI<0b1010, (outs), (ins arm_br_target:$target),
IIC_Br, "b", "\t$target",
[/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>,
Sched<[WriteBr]> {
@@ -2255,8 +2283,9 @@ let isBranch = 1, isTerminator = 1 in {
// FIXME: We shouldn't need this pseudo at all. Just using Bcc directly
// should be sufficient.
// FIXME: Is B really a Barrier? That doesn't seem right.
- def B : ARMPseudoExpand<(outs), (ins br_target:$target), 4, IIC_Br,
- [(br bb:$target)], (Bcc br_target:$target, (ops 14, zero_reg))>,
+ def B : ARMPseudoExpand<(outs), (ins arm_br_target:$target), 4, IIC_Br,
+ [(br bb:$target)], (Bcc arm_br_target:$target,
+ (ops 14, zero_reg))>,
Sched<[WriteBr]>;
let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in {
@@ -2283,7 +2312,7 @@ let isBranch = 1, isTerminator = 1 in {
}
// BLX (immediate)
-def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary,
+def BLXi : AXI<(outs), (ins arm_blx_target:$target), BrMiscFrm, NoItinerary,
"blx\t$target", []>,
Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
let Inst{31-25} = 0b1111101;
@@ -2313,9 +2342,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>,
Sched<[WriteBr]>;
- def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst),
+ def TAILJMPd : ARMPseudoExpand<(outs), (ins arm_br_target:$dst),
4, IIC_Br, [],
- (Bcc br_target:$dst, (ops 14, zero_reg))>,
+ (Bcc arm_br_target:$dst, (ops 14, zero_reg))>,
Requires<[IsARM]>, Sched<[WriteBr]>;
def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst),
@@ -2467,14 +2496,12 @@ def ERET : ABI<0b0001, (outs), (ins), NoItinerary, "eret", "", []>,
// Load
-defm LDR : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si,
- UnOpFrag<(load node:$Src)>>;
+defm LDR : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si, load>;
defm LDRB : AI_ldr1nopc<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
- UnOpFrag<(zextloadi8 node:$Src)>>;
-defm STR : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si,
- BinOpFrag<(store node:$LHS, node:$RHS)>>;
+ zextloadi8>;
+defm STR : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si, store>;
defm STRB : AI_str1nopc<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
- BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+ truncstorei8>;
// Special LDR for loads from non-pc-relative constpools.
let canFoldAsLoad = 1, mayLoad = 1, hasSideEffects = 0,
@@ -2764,6 +2791,12 @@ def LDRBT_POST
: ARMAsmPseudo<"ldrbt${q} $Rt, $addr", (ins addr_offset_none:$addr, pred:$q),
(outs GPR:$Rt)>;
+// Pseudo instruction ldr Rt, =immediate
+def LDRConstPool
+ : ARMAsmPseudo<"ldr${q} $Rt, $immediate",
+ (ins const_pool_asm_imm:$immediate, pred:$q),
+ (outs GPR:$Rt)>;
+
// Store
// Stores with truncate
@@ -3299,8 +3332,8 @@ def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm),
}
def : InstAlias<"mov${p} $Rd, $imm",
- (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p)>,
- Requires<[IsARM]>;
+ (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p), 0>,
+ Requires<[IsARM, HasV6T2]>;
def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
(ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
@@ -3439,11 +3472,9 @@ def UBFX : I<(outs GPRnopc:$Rd),
//
defm ADD : AsI1_bin_irs<0b0100, "add",
- IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(add node:$LHS, node:$RHS)>, 1>;
+ IIC_iALUi, IIC_iALUr, IIC_iALUsr, add, 1>;
defm SUB : AsI1_bin_irs<0b0010, "sub",
- IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(sub node:$LHS, node:$RHS)>>;
+ IIC_iALUi, IIC_iALUr, IIC_iALUsr, sub>;
// ADD and SUB with 's' bit set.
//
@@ -3455,27 +3486,21 @@ defm SUB : AsI1_bin_irs<0b0010, "sub",
// FIXME: Eliminate ADDS/SUBS pseudo opcodes after adding tablegen
// support for an optional CPSR definition that corresponds to the DAG
// node's second value. We can then eliminate the implicit def of CPSR.
-defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>;
-defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMaddc, 1>;
+defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMsubc>;
-defm ADC : AI1_adde_sube_irs<0b0101, "adc",
- BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>;
-defm SBC : AI1_adde_sube_irs<0b0110, "sbc",
- BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
+defm ADC : AI1_adde_sube_irs<0b0101, "adc", ARMadde, 1>;
+defm SBC : AI1_adde_sube_irs<0b0110, "sbc", ARMsube>;
defm RSB : AsI1_rbin_irs<0b0011, "rsb",
IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(sub node:$LHS, node:$RHS)>>;
+ sub>;
// FIXME: Eliminate them if we can write def : Pat patterns which defines
// CPSR and the implicit def of CPSR is not needed.
-defm RSBS : AsI1_rbin_s_is<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+defm RSBS : AsI1_rbin_s_is<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMsubc>;
-defm RSC : AI1_rsc_irs<0b0111, "rsc",
- BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
+defm RSC : AI1_rsc_irs<0b0111, "rsc", ARMsube>;
// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
// The assume-no-carry-in form uses the negation of the input since add/sub
@@ -3685,20 +3710,19 @@ def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm1_32:$pos),
(SSAT imm1_32:$pos, GPRnopc:$a, 0)>;
def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos),
(USAT imm0_31:$pos, GPRnopc:$a, 0)>;
+def : ARMPat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
+ (SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
//===----------------------------------------------------------------------===//
// Bitwise Instructions.
//
defm AND : AsI1_bin_irs<0b0000, "and",
- IIC_iBITi, IIC_iBITr, IIC_iBITsr,
- BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
+ IIC_iBITi, IIC_iBITr, IIC_iBITsr, and, 1>;
defm ORR : AsI1_bin_irs<0b1100, "orr",
- IIC_iBITi, IIC_iBITr, IIC_iBITsr,
- BinOpFrag<(or node:$LHS, node:$RHS)>, 1>;
+ IIC_iBITi, IIC_iBITr, IIC_iBITsr, or, 1>;
defm EOR : AsI1_bin_irs<0b0001, "eor",
- IIC_iBITi, IIC_iBITr, IIC_iBITsr,
- BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
+ IIC_iBITi, IIC_iBITr, IIC_iBITsr, xor, 1>;
defm BIC : AsI1_bin_irs<0b1110, "bic",
IIC_iBITi, IIC_iBITr, IIC_iBITsr,
BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
@@ -3923,9 +3947,10 @@ def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ IIC_iMAC64,
"umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV6]> {
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]> {
bits<4> RdLo;
bits<4> RdHi;
bits<4> Rm;
@@ -3989,28 +4014,28 @@ def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
Requires<[IsARM, HasV6]>;
-multiclass AI_smul<string opc, PatFrag opnode> {
+multiclass AI_smul<string opc> {
def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16),
+ [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
(sext_inreg GPR:$Rm, i16)))]>,
Requires<[IsARM, HasV5TE]>;
def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16),
+ [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
(sra GPR:$Rm, (i32 16))))]>,
Requires<[IsARM, HasV5TE]>;
def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)),
+ [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
(sext_inreg GPR:$Rm, i16)))]>,
Requires<[IsARM, HasV5TE]>;
def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)),
+ [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
(sra GPR:$Rm, (i32 16))))]>,
Requires<[IsARM, HasV5TE]>;
@@ -4026,13 +4051,13 @@ multiclass AI_smul<string opc, PatFrag opnode> {
}
-multiclass AI_smla<string opc, PatFrag opnode> {
+multiclass AI_smla<string opc> {
let DecoderMethod = "DecodeSMLAInstruction" in {
def BB : AMulxyIa<0b0001000, 0b00, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd, (add GPR:$Ra,
- (opnode (sext_inreg GPRnopc:$Rn, i16),
+ (mul (sext_inreg GPRnopc:$Rn, i16),
(sext_inreg GPRnopc:$Rm, i16))))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>;
@@ -4040,7 +4065,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd,
- (add GPR:$Ra, (opnode (sext_inreg GPRnopc:$Rn, i16),
+ (add GPR:$Ra, (mul (sext_inreg GPRnopc:$Rn, i16),
(sra GPRnopc:$Rm, (i32 16)))))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>;
@@ -4048,7 +4073,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd,
- (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
+ (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
(sext_inreg GPRnopc:$Rm, i16))))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>;
@@ -4056,7 +4081,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd,
- (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
+ (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
(sra GPRnopc:$Rm, (i32 16)))))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>;
@@ -4074,8 +4099,8 @@ multiclass AI_smla<string opc, PatFrag opnode> {
}
}
-defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
-defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm SMUL : AI_smul<"smul">;
+defm SMLA : AI_smla<"smla">;
// Halfword multiply accumulate long: SMLAL<x><y>.
def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
@@ -4336,8 +4361,7 @@ def SETPAN : AInoP<(outs), (ins imm0_1:$imm), MiscFrm, NoItinerary, "setpan",
//
defm CMP : AI1_cmp_irs<0b1010, "cmp",
- IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr,
- BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+ IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr, ARMcmp>;
// ARMcmpZ can re-use the above instruction definitions.
def : ARMPat<(ARMcmpZ GPR:$src, mod_imm:$imm),
@@ -4745,7 +4769,7 @@ def : ARMPat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
class acquiring_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return isAtLeastAcquire(Ordering);
+ return isAcquireOrStronger(Ordering);
}]>;
def atomic_load_acquire_8 : acquiring_load<atomic_load_8>;
@@ -4755,7 +4779,7 @@ def atomic_load_acquire_32 : acquiring_load<atomic_load_32>;
class releasing_store<PatFrag base>
: PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return isAtLeastRelease(Ordering);
+ return isReleaseOrStronger(Ordering);
}]>;
def atomic_store_release_8 : releasing_store<atomic_store_8>;
@@ -4831,21 +4855,21 @@ def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
}
class ACI<dag oops, dag iops, string opc, string asm,
- IndexMode im = IndexModeNone>
+ list<dag> pattern, IndexMode im = IndexModeNone>
: I<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
- opc, asm, "", []> {
+ opc, asm, "", pattern> {
let Inst{27-25} = 0b110;
}
class ACInoP<dag oops, dag iops, string opc, string asm,
- IndexMode im = IndexModeNone>
+ list<dag> pattern, IndexMode im = IndexModeNone>
: InoP<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
- opc, asm, "", []> {
+ opc, asm, "", pattern> {
let Inst{31-28} = 0b1111;
let Inst{27-25} = 0b110;
}
-multiclass LdStCop<bit load, bit Dbit, string asm> {
+multiclass LdStCop<bit load, bit Dbit, string asm, list<dag> pattern> {
def _OFFSET : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
- asm, "\t$cop, $CRd, $addr"> {
+ asm, "\t$cop, $CRd, $addr", pattern> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
@@ -4861,7 +4885,7 @@ multiclass LdStCop<bit load, bit Dbit, string asm> {
let DecoderMethod = "DecodeCopMemInstruction";
}
def _PRE : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
- asm, "\t$cop, $CRd, $addr!", IndexModePre> {
+ asm, "\t$cop, $CRd, $addr!", [], IndexModePre> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
@@ -4878,7 +4902,7 @@ multiclass LdStCop<bit load, bit Dbit, string asm> {
}
def _POST: ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
postidx_imm8s4:$offset),
- asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> {
+ asm, "\t$cop, $CRd, $addr, $offset", [], IndexModePost> {
bits<9> offset;
bits<4> addr;
bits<4> cop;
@@ -4897,7 +4921,7 @@ multiclass LdStCop<bit load, bit Dbit, string asm> {
def _OPTION : ACI<(outs),
(ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
coproc_option_imm:$option),
- asm, "\t$cop, $CRd, $addr, $option"> {
+ asm, "\t$cop, $CRd, $addr, $option", []> {
bits<8> option;
bits<4> addr;
bits<4> cop;
@@ -4914,9 +4938,9 @@ multiclass LdStCop<bit load, bit Dbit, string asm> {
let DecoderMethod = "DecodeCopMemInstruction";
}
}
-multiclass LdSt2Cop<bit load, bit Dbit, string asm> {
+multiclass LdSt2Cop<bit load, bit Dbit, string asm, list<dag> pattern> {
def _OFFSET : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
- asm, "\t$cop, $CRd, $addr"> {
+ asm, "\t$cop, $CRd, $addr", pattern> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
@@ -4932,7 +4956,7 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm> {
let DecoderMethod = "DecodeCopMemInstruction";
}
def _PRE : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
- asm, "\t$cop, $CRd, $addr!", IndexModePre> {
+ asm, "\t$cop, $CRd, $addr!", [], IndexModePre> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
@@ -4949,7 +4973,7 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm> {
}
def _POST: ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
postidx_imm8s4:$offset),
- asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> {
+ asm, "\t$cop, $CRd, $addr, $offset", [], IndexModePost> {
bits<9> offset;
bits<4> addr;
bits<4> cop;
@@ -4968,7 +4992,7 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm> {
def _OPTION : ACInoP<(outs),
(ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
coproc_option_imm:$option),
- asm, "\t$cop, $CRd, $addr, $option"> {
+ asm, "\t$cop, $CRd, $addr, $option", []> {
bits<8> option;
bits<4> addr;
bits<4> cop;
@@ -4986,14 +5010,15 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm> {
}
}
-defm LDC : LdStCop <1, 0, "ldc">;
-defm LDCL : LdStCop <1, 1, "ldcl">;
-defm STC : LdStCop <0, 0, "stc">;
-defm STCL : LdStCop <0, 1, "stcl">;
-defm LDC2 : LdSt2Cop<1, 0, "ldc2">, Requires<[PreV8]>;
-defm LDC2L : LdSt2Cop<1, 1, "ldc2l">, Requires<[PreV8]>;
-defm STC2 : LdSt2Cop<0, 0, "stc2">, Requires<[PreV8]>;
-defm STC2L : LdSt2Cop<0, 1, "stc2l">, Requires<[PreV8]>;
+defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+
+defm STC : LdStCop <0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
//===----------------------------------------------------------------------===//
// Move between coprocessor and ARM core register.
@@ -5118,9 +5143,9 @@ def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */,
(outs GPRnopc:$Rt, GPRnopc:$Rt2),
(ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>;
-class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
- : ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
- GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), NoItinerary,
+class MovRRCopro2<string opc, bit direction, dag oops, dag iops,
+ list<dag> pattern = []>
+ : ABXI<0b1100, oops, iops, NoItinerary,
!strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>,
Requires<[PreV8]> {
let Inst{31-28} = 0b1111;
@@ -5139,13 +5164,18 @@ class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
let Inst{7-4} = opc1;
let Inst{3-0} = CRm;
- let DecoderMethod = "DecodeMRRC2";
+ let DecoderMethod = "DecoderForMRRC2AndMCRR2";
}
def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
+ (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt,
+ GPRnopc:$Rt2, c_imm:$CRm),
[(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt,
GPRnopc:$Rt2, imm:$CRm)]>;
-def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>;
+
+def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */,
+ (outs GPRnopc:$Rt, GPRnopc:$Rt2),
+ (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>;
//===----------------------------------------------------------------------===//
// Move between special register and ARM core register
@@ -5164,7 +5194,7 @@ def MRS : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary,
let Unpredictable{11-0} = 0b110100001111;
}
-def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p)>,
+def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p), 0>,
Requires<[IsARM]>;
// The MRSsys instruction is the MRS instruction from the ARM ARM,
@@ -5206,6 +5236,7 @@ def MRSbanked : ABI<0b0001, (outs GPRnopc:$Rd), (ins banked_reg:$banked),
// to distinguish between them. The mask operand contains the special register
// (R Bit) in bit 4 and bits 3-0 contains the mask with the fields to be
// accessed in the special register.
+let Defs = [CPSR] in
def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
"msr", "\t$mask, $Rn", []> {
bits<5> mask;
@@ -5220,6 +5251,7 @@ def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
let Inst{3-0} = Rn;
}
+let Defs = [CPSR] in
def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, mod_imm:$imm), NoItinerary,
"msr", "\t$mask, $imm", []> {
bits<5> mask;
@@ -5268,8 +5300,8 @@ let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in
def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK,
[SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
let usesCustomInserter = 1, Defs = [CPSR] in
- def WIN__DBZCHK : PseudoInst<(outs), (ins GPR:$divisor), NoItinerary,
- [(win__dbzchk GPR:$divisor)]>;
+ def WIN__DBZCHK : PseudoInst<(outs), (ins tGPR:$divisor), NoItinerary,
+ [(win__dbzchk tGPR:$divisor)]>;
//===----------------------------------------------------------------------===//
// TLS Instructions
@@ -5423,6 +5455,8 @@ def : Pat<(load (ARMWrapperPIC tglobaltlsaddr:$addr)),
def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>;
def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>,
Requires<[IsARM, UseMovt]>;
+def : ARMPat<(ARMWrapper texternalsym :$dst), (MOVi32imm texternalsym :$dst)>,
+ Requires<[IsARM, UseMovt]>;
def : ARMPat<(ARMWrapperJT tjumptable:$dst),
(LEApcrelJT tjumptable:$dst)>;
@@ -5568,9 +5602,9 @@ include "ARMInstrNEON.td"
//
// Memory barriers
-def : InstAlias<"dmb", (DMB 0xf)>, Requires<[IsARM, HasDB]>;
-def : InstAlias<"dsb", (DSB 0xf)>, Requires<[IsARM, HasDB]>;
-def : InstAlias<"isb", (ISB 0xf)>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"dmb", (DMB 0xf), 0>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"dsb", (DSB 0xf), 0>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"isb", (ISB 0xf), 0>, Requires<[IsARM, HasDB]>;
// System instructions
def : MnemonicAlias<"swi", "svc">;
@@ -5583,13 +5617,13 @@ def : MnemonicAlias<"stmfd", "stmdb">;
def : MnemonicAlias<"stmia", "stm">;
def : MnemonicAlias<"stmea", "stm">;
-// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the
-// shift amount is zero (i.e., unspecified).
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT with the
+// input operands swapped when the shift amount is zero (i.e., unspecified).
def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
- (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>,
+ (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p), 0>,
Requires<[IsARM, HasV6]>;
def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
- (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>,
+ (PKHBT GPRnopc:$Rd, GPRnopc:$Rm, GPRnopc:$Rn, 0, pred:$p), 0>,
Requires<[IsARM, HasV6]>;
// PUSH/POP aliases for STM/LDM
@@ -5747,23 +5781,23 @@ def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg)>,
// the instruction definitions need difference constraints pre-v6.
// Use these aliases for the assembly parsing on pre-v6.
def : InstAlias<"mul${s}${p} $Rd, $Rn, $Rm",
- (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
+ (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"mla${s}${p} $Rd, $Rn, $Rm, $Ra",
(MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra,
- pred:$p, cc_out:$s)>,
+ pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"smlal${s}${p} $RdLo, $RdHi, $Rn, $Rm",
- (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"umlal${s}${p} $RdLo, $RdHi, $Rn, $Rm",
- (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"smull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
- (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
- (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
// 'it' blocks in ARM mode just validate the predicates. The IT itself
@@ -5775,3 +5809,36 @@ let mayLoad = 1, mayStore =1, hasSideEffects = 1 in
def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn),
NoItinerary,
[(set GPR:$Rd, (int_arm_space imm:$size, GPR:$Rn))]>;
+
+//===----------------------------------
+// Atomic cmpxchg for -O0
+//===----------------------------------
+
+// The fast register allocator used during -O0 inserts spills to cover any VRegs
+// live across basic block boundaries. When this happens between an LDREX and an
+// STREX it can clear the exclusive monitor, causing all cmpxchg attempts to
+// fail.
+
+// Unfortunately, this means we have to have an alternative (expanded
+// post-regalloc) path for -O0 compilations. Fortunately this path can be
+// significantly more naive than the standard expansion: we conservatively
+// assume seq_cst, strong cmpxchg and omit clrex on failure.
+
+let Constraints = "@earlyclobber $Rd,@earlyclobber $status",
+ mayLoad = 1, mayStore = 1 in {
+def CMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+ (ins GPR:$addr, GPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
+
+def CMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+ (ins GPR:$addr, GPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
+
+def CMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+ (ins GPR:$addr, GPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
+
+def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$status),
+ (ins GPR:$addr, GPRPair:$desired, GPRPair:$new),
+ NoItinerary, []>, Sched<[]>;
+}
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 5b1f9a06442e..93a174f3678a 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -15,10 +15,6 @@
// Thumb specific DAG Nodes.
//
-def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
- SDNPVariadic]>;
-
def imm_sr_XFORM: SDNodeXForm<imm, [{
unsigned Imm = N->getZExtValue();
return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), SDLoc(N), MVT::i32);
@@ -70,6 +66,14 @@ def thumb_immshifted_shamt : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
}]>;
+def imm256_510 : ImmLeaf<i32, [{
+ return Imm >= 256 && Imm < 511;
+}]>;
+
+def thumb_imm256_510_addend : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() - 255, SDLoc(N), MVT::i32);
+}]>;
+
// Scaled 4 immediate.
def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; }
def t_imm0_1020s4 : Operand<i32> {
@@ -121,26 +125,38 @@ def t_adrlabel : Operand<i32> {
let ParserMatchClass = UnsignedOffset_b8s2;
}
-def t_bcctarget : Operand<i32> {
- let EncoderMethod = "getThumbBCCTargetOpValue";
- let DecoderMethod = "DecodeThumbBCCTargetOperand";
-}
-def t_cbtarget : Operand<i32> {
- let EncoderMethod = "getThumbCBTargetOpValue";
- let DecoderMethod = "DecodeThumbCmpBROperand";
+def thumb_br_target : Operand<OtherVT> {
+ let ParserMatchClass = ThumbBranchTarget;
+ let EncoderMethod = "getThumbBranchTargetOpValue";
+ let OperandType = "OPERAND_PCREL";
}
-def t_bltarget : Operand<i32> {
+def thumb_bl_target : Operand<i32> {
+ let ParserMatchClass = ThumbBranchTarget;
let EncoderMethod = "getThumbBLTargetOpValue";
let DecoderMethod = "DecodeThumbBLTargetOperand";
}
-def t_blxtarget : Operand<i32> {
+// Target for BLX *from* thumb mode.
+def thumb_blx_target : Operand<i32> {
+ let ParserMatchClass = ARMBranchTarget;
let EncoderMethod = "getThumbBLXTargetOpValue";
let DecoderMethod = "DecodeThumbBLXOffset";
}
+def thumb_bcc_target : Operand<OtherVT> {
+ let ParserMatchClass = ThumbBranchTarget;
+ let EncoderMethod = "getThumbBCCTargetOpValue";
+ let DecoderMethod = "DecodeThumbBCCTargetOperand";
+}
+
+def thumb_cb_target : Operand<OtherVT> {
+ let ParserMatchClass = ThumbBranchTarget;
+ let EncoderMethod = "getThumbCBTargetOpValue";
+ let DecoderMethod = "DecodeThumbCmpBROperand";
+}
+
// t_addrmode_pc := <label> => pc + imm8 * 4
//
def t_addrmode_pc : MemOperand {
@@ -278,16 +294,17 @@ def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm",
let Inst{7-4} = imm;
}
-class tHintAlias<string Asm, dag Result> : tInstAlias<Asm, Result> {
+// Note: When EmitPriority == 1, the alias will be used for printing
+class tHintAlias<string Asm, dag Result, bit EmitPriority = 0> : tInstAlias<Asm, Result, EmitPriority> {
let Predicates = [IsThumb, HasV6M];
}
-def : tHintAlias<"nop$p", (tHINT 0, pred:$p)>; // A8.6.110
-def : tHintAlias<"yield$p", (tHINT 1, pred:$p)>; // A8.6.410
-def : tHintAlias<"wfe$p", (tHINT 2, pred:$p)>; // A8.6.408
-def : tHintAlias<"wfi$p", (tHINT 3, pred:$p)>; // A8.6.409
-def : tHintAlias<"sev$p", (tHINT 4, pred:$p)>; // A8.6.157
-def : tInstAlias<"sevl$p", (tHINT 5, pred:$p)> {
+def : tHintAlias<"nop$p", (tHINT 0, pred:$p), 1>; // A8.6.110
+def : tHintAlias<"yield$p", (tHINT 1, pred:$p), 1>; // A8.6.410
+def : tHintAlias<"wfe$p", (tHINT 2, pred:$p), 1>; // A8.6.408
+def : tHintAlias<"wfi$p", (tHINT 3, pred:$p), 1>; // A8.6.409
+def : tHintAlias<"sev$p", (tHINT 4, pred:$p), 1>; // A8.6.157
+def : tInstAlias<"sevl$p", (tHINT 5, pred:$p), 1> {
let Predicates = [IsThumb2, HasV8];
}
@@ -302,7 +319,7 @@ def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val",
let Inst{7-0} = val;
}
// default immediate for breakpoint mnemonic
-def : InstAlias<"bkpt", (tBKPT 0)>, Requires<[IsThumb]>;
+def : InstAlias<"bkpt", (tBKPT 0), 0>, Requires<[IsThumb]>;
def tHLT : T1I<(outs), (ins imm0_63:$val), NoItinerary, "hlt\t$val",
[]>, T1Encoding<0b101110>, Requires<[IsThumb, HasV8]> {
@@ -439,6 +456,14 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
let Inst{2-0} = 0b000;
let Unpredictable{2-0} = 0b111;
}
+ def tBXNS : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bxns${p}\t$Rm", []>,
+ Requires<[IsThumb, Has8MSecExt]>,
+ T1Special<{1,1,0,?}>, Sched<[WriteBr]> {
+ bits<4> Rm;
+ let Inst{6-3} = Rm;
+ let Inst{2-0} = 0b100;
+ let Unpredictable{1-0} = 0b11;
+ }
}
let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
@@ -458,9 +483,9 @@ let isCall = 1,
Defs = [LR], Uses = [SP] in {
// Also used for Thumb2
def tBL : TIx2<0b11110, 0b11, 1,
- (outs), (ins pred:$p, t_bltarget:$func), IIC_Br,
+ (outs), (ins pred:$p, thumb_bl_target:$func), IIC_Br,
"bl${p}\t$func",
- [(ARMtcall tglobaladdr:$func)]>,
+ [(ARMcall tglobaladdr:$func)]>,
Requires<[IsThumb]>, Sched<[WriteBrL]> {
bits<24> func;
let Inst{26} = func{23};
@@ -472,9 +497,8 @@ let isCall = 1,
// ARMv5T and above, also used for Thumb2
def tBLXi : TIx2<0b11110, 0b11, 0,
- (outs), (ins pred:$p, t_blxtarget:$func), IIC_Br,
- "blx${p}\t$func",
- [(ARMcall tglobaladdr:$func)]>,
+ (outs), (ins pred:$p, thumb_blx_target:$func), IIC_Br,
+ "blx${p}\t$func", []>,
Requires<[IsThumb, HasV5T, IsNotMClass]>, Sched<[WriteBrL]> {
bits<24> func;
let Inst{26} = func{23};
@@ -488,7 +512,7 @@ let isCall = 1,
// Also used for Thumb2
def tBLXr : TI<(outs), (ins pred:$p, GPR:$func), IIC_Br,
"blx${p}\t$func",
- [(ARMtcall GPR:$func)]>,
+ [(ARMcall GPR:$func)]>,
Requires<[IsThumb, HasV5T]>,
T1Special<{1,1,1,?}>, Sched<[WriteBrL]> { // A6.2.3 & A8.6.24;
bits<4> func;
@@ -496,6 +520,17 @@ let isCall = 1,
let Inst{2-0} = 0b000;
}
+ // ARMv8-M Security Extensions
+ def tBLXNSr : TI<(outs), (ins pred:$p, GPRnopc:$func), IIC_Br,
+ "blxns${p}\t$func", []>,
+ Requires<[IsThumb, Has8MSecExt]>,
+ T1Special<{1,1,1,?}>, Sched<[WriteBrL]> {
+ bits<4> func;
+ let Inst{6-3} = func;
+ let Inst{2-0} = 0b100;
+ let Unpredictable{1-0} = 0b11;
+ }
+
// ARMv4T
def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func),
4, IIC_Br,
@@ -517,8 +552,9 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
// Just a pseudo for a tBL instruction. Needed to let regalloc know about
// the clobber of LR.
let Defs = [LR] in
- def tBfar : tPseudoExpand<(outs), (ins t_bltarget:$target, pred:$p),
- 4, IIC_Br, [], (tBL pred:$p, t_bltarget:$target)>,
+ def tBfar : tPseudoExpand<(outs), (ins thumb_bl_target:$target, pred:$p),
+ 4, IIC_Br, [],
+ (tBL pred:$p, thumb_bl_target:$target)>,
Sched<[WriteBrTbl]>;
def tBR_JTr : tPseudoInst<(outs),
@@ -534,7 +570,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
// FIXME: should be able to write a pattern for ARMBrcond, but can't use
// a two-value operand where a dag node expects two operands. :(
let isBranch = 1, isTerminator = 1 in
- def tBcc : T1I<(outs), (ins t_bcctarget:$target, pred:$p), IIC_Br,
+ def tBcc : T1I<(outs), (ins thumb_bcc_target:$target, pred:$p), IIC_Br,
"b${p}\t$target",
[/*(ARMbrcond bb:$target, imm:$cc)*/]>,
T1BranchCond<{1,1,0,1}>, Sched<[WriteBr]> {
@@ -663,19 +699,19 @@ multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rr,
t_addrmode_is4, AddrModeT1_4,
IIC_iLoad_r, IIC_iLoad_i, "ldr",
- UnOpFrag<(load node:$Src)>>;
+ load>;
// A8.6.64 & A8.6.61
defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rr,
t_addrmode_is1, AddrModeT1_1,
IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb",
- UnOpFrag<(zextloadi8 node:$Src)>>;
+ zextloadi8>;
// A8.6.76 & A8.6.73
defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rr,
t_addrmode_is2, AddrModeT1_2,
IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh",
- UnOpFrag<(zextloadi16 node:$Src)>>;
+ zextloadi16>;
let AddedComplexity = 10 in
def tLDRSB : // A8.6.80
@@ -706,19 +742,19 @@ def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rr,
t_addrmode_is4, AddrModeT1_4,
IIC_iStore_r, IIC_iStore_i, "str",
- BinOpFrag<(store node:$LHS, node:$RHS)>>;
+ store>;
// A8.6.197 & A8.6.195
defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rr,
t_addrmode_is1, AddrModeT1_1,
IIC_iStore_bh_r, IIC_iStore_bh_i, "strb",
- BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+ truncstorei8>;
// A8.6.207 & A8.6.205
defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr,
t_addrmode_is2, AddrModeT1_2,
IIC_iStore_bh_r, IIC_iStore_bh_i, "strh",
- BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+ truncstorei16>;
//===----------------------------------------------------------------------===//
@@ -770,7 +806,7 @@ def tSTMIA_UPD : Thumb1I<(outs GPR:$wb),
} // hasSideEffects
def : InstAlias<"ldm${p} $Rn!, $regs",
- (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)>,
+ (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs), 0>,
Requires<[IsThumb, IsThumb1Only]>;
let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
@@ -1310,7 +1346,14 @@ def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch),
AddrModeNone, 0, IndexModeNone,
Pseudo, NoItinerary, "", "",
[(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
- Requires<[IsThumb]>;
+ Requires<[IsThumb,IsNotWindows]>;
+
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
+ Defs = [ R11, LR, SP ] in
+def tInt_WIN_eh_sjlj_longjmp
+ : XI<(outs), (ins GPR:$src, GPR:$scratch), AddrModeNone, 0, IndexModeNone,
+ Pseudo, NoItinerary, "", "", [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+ Requires<[IsThumb,IsWindows]>;
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
@@ -1380,16 +1423,9 @@ def : T1Pat<(ARMWrapperJT tjumptable:$dst),
(tLEApcrelJT tjumptable:$dst)>;
// Direct calls
-def : T1Pat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>,
+def : T1Pat<(ARMcall texternalsym:$func), (tBL texternalsym:$func)>,
Requires<[IsThumb]>;
-def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>,
- Requires<[IsThumb, HasV5T, IsNotMClass]>;
-
-// Indirect calls to ARM routines
-def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>,
- Requires<[IsThumb, HasV5T]>;
-
// zextload i1 -> zextload i8
def : T1Pat<(zextloadi1 t_addrmode_is1:$addr),
(tLDRBi t_addrmode_is1:$addr)>;
@@ -1415,6 +1451,24 @@ def : T1Pat<(extloadi8 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>;
def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>;
def : T1Pat<(extloadi16 t_addrmode_rr:$addr), (tLDRHr t_addrmode_rr:$addr)>;
+// post-inc loads and stores
+
+// post-inc LDR -> LDM r0!, {r1}. The way operands are laid out in LDMs is
+// different to how ISel expects them for a post-inc load, so use a pseudo
+// and expand it just after ISel.
+let usesCustomInserter = 1,
+ Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in
+ def tLDR_postidx: tPseudoInst<(outs rGPR:$Rt, rGPR:$Rn_wb),
+ (ins rGPR:$Rn, pred:$p),
+ 4, IIC_iStore_ru,
+ []>;
+
+// post-inc STR -> STM r0!, {r1}. The layout of this (because it doesn't def
+// multiple registers) is the same in ISel as MachineInstr, so there's no need
+// for a pseudo.
+def : T1Pat<(post_store rGPR:$Rt, rGPR:$Rn, 4),
+ (tSTMIA_UPD rGPR:$Rn, rGPR:$Rt)>;
+
// If it's impossible to use [r,r] address mode for sextload, select to
// ldr{b|h} + sxt{b|h} instead.
def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
@@ -1474,6 +1528,10 @@ def : T1Pat<(i32 thumb_immshifted:$src),
def : T1Pat<(i32 imm0_255_comp:$src),
(tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>;
+def : T1Pat<(i32 imm256_510:$src),
+ (tADDi8 (tMOVi8 255),
+ (thumb_imm256_510_addend imm:$src))>;
+
// Pseudo instruction that combines ldr from constpool and add pc. This should
// be expanded into two instructions late to allow if-conversion and
// scheduling.
@@ -1502,7 +1560,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
// In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
// encoding is available on ARMv6K, but we don't differentiate that finely.
-def : InstAlias<"nop", (tMOVr R8, R8, 14, 0)>,Requires<[IsThumb, IsThumb1Only]>;
+def : InstAlias<"nop", (tMOVr R8, R8, 14, 0), 0>, Requires<[IsThumb, IsThumb1Only]>;
// For round-trip assembly/disassembly, we have to handle a CPS instruction
@@ -1524,3 +1582,8 @@ def : tInstAlias<"lsr${s}${p} $Rdm, $imm",
(tLSRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
def : tInstAlias<"asr${s}${p} $Rdm, $imm",
(tASRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
+
+// Pseudo instruction ldr Rt, =immediate
+def tLDRConstPool
+ : tAsmPseudo<"ldr${p} $Rt, $immediate",
+ (ins tGPR:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index f42f4569b2f8..55e5308be40e 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -270,7 +270,7 @@ def t2addrmode_so_reg : MemOperand,
let EncoderMethod = "getT2AddrModeSORegOpValue";
let DecoderMethod = "DecodeT2AddrModeSOReg";
let ParserMatchClass = t2addrmode_so_reg_asmoperand;
- let MIOperandInfo = (ops GPR:$base, rGPR:$offsreg, i32imm:$offsimm);
+ let MIOperandInfo = (ops GPRnopc:$base, rGPR:$offsreg, i32imm:$offsimm);
}
// Addresses for the TBB/TBH instructions.
@@ -576,8 +576,8 @@ class T2MlaLong<bits<3> opc22_20, bits<4> opc7_4,
/// changed to modify CPSR.
multiclass T2I_bin_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, bit Commutable = 0,
- string wide = ""> {
+ SDPatternOperator opnode, bit Commutable = 0,
+ string wide = ""> {
// shifted imm
def ri : T2sTwoRegImm<
(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), iii,
@@ -632,7 +632,7 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
// the ".w" suffix to indicate that they are wide.
multiclass T2I_bin_w_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, bit Commutable = 0> :
+ SDPatternOperator opnode, bit Commutable = 0> :
T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, Commutable, ".w"> {
// Assembler aliases w/ the ".w" suffix.
def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rd, $Rn, $imm"),
@@ -661,7 +661,7 @@ multiclass T2I_bin_w_irs<bits<4> opcod, string opc,
/// T2I_rbin_is - Same as T2I_bin_irs except the order of operands are
/// reversed. The 'rr' form is only defined for the disassembler; for codegen
/// it is equivalent to the T2I_bin_irs counterpart.
-multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
+multiclass T2I_rbin_irs<bits<4> opcod, string opc, SDNode opnode> {
// shifted imm
def ri : T2sTwoRegImm<
(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
@@ -705,7 +705,7 @@ multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
let hasPostISelHook = 1, Defs = [CPSR] in {
multiclass T2I_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
- InstrItinClass iis, PatFrag opnode,
+ InstrItinClass iis, SDNode opnode,
bit Commutable = 0> {
// shifted imm
def ri : t2PseudoInst<(outs rGPR:$Rd),
@@ -735,7 +735,7 @@ multiclass T2I_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
/// T2I_rbin_s_is - Same as T2I_bin_s_irs, except selection DAG
/// operands are reversed.
let hasPostISelHook = 1, Defs = [CPSR] in {
-multiclass T2I_rbin_s_is<PatFrag opnode> {
+multiclass T2I_rbin_s_is<SDNode opnode> {
// shifted imm
def ri : t2PseudoInst<(outs rGPR:$Rd),
(ins rGPR:$Rn, t2_so_imm:$imm, pred:$p),
@@ -755,7 +755,7 @@ multiclass T2I_rbin_s_is<PatFrag opnode> {
/// T2I_bin_ii12rs - Defines a set of (op reg, {so_imm|imm0_4095|r|so_reg})
/// patterns for a binary operation that produces a value.
-multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
+multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, SDNode opnode,
bit Commutable = 0> {
// shifted imm
// The register-immediate version is re-materializable. This is useful
@@ -824,7 +824,7 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
/// for a binary operation that produces a value and use the carry
/// bit. It's not predicable.
let Defs = [CPSR], Uses = [CPSR] in {
-multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
+multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, SDNode opnode,
bit Commutable = 0> {
// shifted imm
def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
@@ -864,7 +864,7 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
/// T2I_sh_ir - Defines a set of (op reg, {so_imm|r}) patterns for a shift /
// rotate operation that produces a value.
-multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode> {
+multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, SDNode opnode> {
// 5-bit imm
def ri : T2sTwoRegShiftImm<
(outs rGPR:$Rd), (ins rGPR:$Rm, ty:$imm), IIC_iMOVsi,
@@ -919,7 +919,7 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode> {
/// an explicit result, only implicitly set CPSR.
multiclass T2I_cmp_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode> {
+ SDPatternOperator opnode> {
let isCompare = 1, Defs = [CPSR] in {
// shifted imm
def ri : T2OneRegCmpImm<
@@ -1260,20 +1260,19 @@ def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd),
// Load
let canFoldAsLoad = 1, isReMaterializable = 1 in
-defm t2LDR : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, GPR,
- UnOpFrag<(load node:$Src)>>;
+defm t2LDR : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, GPR, load>;
// Loads with zero extension
defm t2LDRH : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
- GPRnopc, UnOpFrag<(zextloadi16 node:$Src)>>;
+ GPRnopc, zextloadi16>;
defm t2LDRB : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
- GPRnopc, UnOpFrag<(zextloadi8 node:$Src)>>;
+ GPRnopc, zextloadi8>;
// Loads with sign extension
defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
- GPRnopc, UnOpFrag<(sextloadi16 node:$Src)>>;
+ GPRnopc, sextloadi16>;
defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
- GPRnopc, UnOpFrag<(sextloadi8 node:$Src)>>;
+ GPRnopc, sextloadi8>;
let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
// Load doubleword
@@ -1414,7 +1413,7 @@ def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>;
class T2Ildacq<bits<4> bits23_20, bits<2> bit54, dag oops, dag iops,
string opc, string asm, list<dag> pattern>
: Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary,
- opc, asm, "", pattern>, Requires<[IsThumb, HasV8]> {
+ opc, asm, "", pattern>, Requires<[IsThumb, HasAcquireRelease]> {
bits<4> Rt;
bits<4> addr;
@@ -1438,12 +1437,11 @@ def t2LDAH : T2Ildacq<0b1101, 0b01, (outs rGPR:$Rt),
(ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>;
// Store
-defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR,
- BinOpFrag<(store node:$LHS, node:$RHS)>>;
+defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR, store>;
defm t2STRB:T2I_st<0b00,"strb", IIC_iStore_bh_i, IIC_iStore_bh_si,
- rGPR, BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+ rGPR, truncstorei8>;
defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
- rGPR, BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+ rGPR, truncstorei16>;
// Store doubleword
let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in
@@ -1586,7 +1584,7 @@ def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
class T2Istrrel<bits<2> bit54, dag oops, dag iops,
string opc, string asm, list<dag> pattern>
: Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary, opc,
- asm, "", pattern>, Requires<[IsThumb, HasV8]> {
+ asm, "", pattern>, Requires<[IsThumb, HasAcquireRelease]> {
bits<4> Rt;
bits<4> addr;
@@ -1906,7 +1904,8 @@ def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
"movw", "\t$Rd, $imm",
- [(set rGPR:$Rd, imm0_65535:$imm)]>, Sched<[WriteALU]> {
+ [(set rGPR:$Rd, imm0_65535:$imm)]>, Sched<[WriteALU]>,
+ Requires<[IsThumb, HasV8MBaseline]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 1;
let Inst{24-21} = 0b0010;
@@ -1924,8 +1923,9 @@ def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
let DecoderMethod = "DecodeT2MOVTWInstruction";
}
-def : t2InstAlias<"mov${p} $Rd, $imm",
- (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p)>;
+def : InstAlias<"mov${p} $Rd, $imm",
+ (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p), 0>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
(ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
@@ -1936,7 +1936,8 @@ def t2MOVTi16 : T2I<(outs rGPR:$Rd),
"movt", "\t$Rd, $imm",
[(set rGPR:$Rd,
(or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]>,
- Sched<[WriteALU]> {
+ Sched<[WriteALU]>,
+ Requires<[IsThumb, HasV8MBaseline]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 1;
let Inst{24-21} = 0b0110;
@@ -1956,7 +1957,7 @@ def t2MOVTi16 : T2I<(outs rGPR:$Rd),
def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
(ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
- Sched<[WriteALU]>;
+ Sched<[WriteALU]>, Requires<[IsThumb, HasV8MBaseline]>;
} // Constraints
def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
@@ -1997,7 +1998,7 @@ def t2UXTB : T2I_ext_rrot<0b101, "uxtb",
def t2UXTH : T2I_ext_rrot<0b001, "uxth",
UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
- UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+ UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
// The transformation should probably be done as a combiner action
@@ -2029,10 +2030,8 @@ def : Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot), 0xFFFF)),
// Arithmetic Instructions.
//
-defm t2ADD : T2I_bin_ii12rs<0b000, "add",
- BinOpFrag<(add node:$LHS, node:$RHS)>, 1>;
-defm t2SUB : T2I_bin_ii12rs<0b101, "sub",
- BinOpFrag<(sub node:$LHS, node:$RHS)>>;
+defm t2ADD : T2I_bin_ii12rs<0b000, "add", add, 1>;
+defm t2SUB : T2I_bin_ii12rs<0b101, "sub", sub>;
// ADD and SUB with 's' bit set. No 12-bit immediate (T4) variants.
//
@@ -2044,25 +2043,20 @@ defm t2SUB : T2I_bin_ii12rs<0b101, "sub",
// FIXME: Eliminate t2ADDS/t2SUBS pseudo opcodes after adding tablegen
// support for an optional CPSR definition that corresponds to the DAG
// node's second value. We can then eliminate the implicit def of CPSR.
-defm t2ADDS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi,
- BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>;
-defm t2SUBS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi,
- BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+defm t2ADDS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMaddc, 1>;
+defm t2SUBS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMsubc>;
let hasPostISelHook = 1 in {
-defm t2ADC : T2I_adde_sube_irs<0b1010, "adc",
- BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>;
-defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc",
- BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
+defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", ARMadde, 1>;
+defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", ARMsube>;
}
// RSB
-defm t2RSB : T2I_rbin_irs <0b1110, "rsb",
- BinOpFrag<(sub node:$LHS, node:$RHS)>>;
+defm t2RSB : T2I_rbin_irs <0b1110, "rsb", sub>;
// FIXME: Eliminate them if we can write def : Pat patterns which defines
// CPSR and the implicit def of CPSR is not needed.
-defm t2RSBS : T2I_rbin_s_is <BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+defm t2RSBS : T2I_rbin_s_is <ARMsubc>;
// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
// The assume-no-carry-in form uses the negation of the input since add/sub
@@ -2293,19 +2287,17 @@ def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos), (t2SSAT imm1_32:$pos, GPR:$a, 0)>;
def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos), (t2USAT imm0_31:$pos, GPR:$a, 0)>;
+def : T2Pat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
+ (t2SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
//===----------------------------------------------------------------------===//
// Shift and rotate Instructions.
//
-defm t2LSL : T2I_sh_ir<0b00, "lsl", imm0_31,
- BinOpFrag<(shl node:$LHS, node:$RHS)>>;
-defm t2LSR : T2I_sh_ir<0b01, "lsr", imm_sr,
- BinOpFrag<(srl node:$LHS, node:$RHS)>>;
-defm t2ASR : T2I_sh_ir<0b10, "asr", imm_sr,
- BinOpFrag<(sra node:$LHS, node:$RHS)>>;
-defm t2ROR : T2I_sh_ir<0b11, "ror", imm0_31,
- BinOpFrag<(rotr node:$LHS, node:$RHS)>>;
+defm t2LSL : T2I_sh_ir<0b00, "lsl", imm0_31, shl>;
+defm t2LSR : T2I_sh_ir<0b01, "lsr", imm_sr, srl>;
+defm t2ASR : T2I_sh_ir<0b10, "asr", imm_sr, sra>;
+defm t2ROR : T2I_sh_ir<0b11, "ror", imm0_31, rotr>;
// (rotr x, (and y, 0x...1f)) ==> (ROR x, y)
def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
@@ -2362,14 +2354,11 @@ def t2MOVsra_flag : T2TwoRegShiftImm<
//
defm t2AND : T2I_bin_w_irs<0b0000, "and",
- IIC_iBITi, IIC_iBITr, IIC_iBITsi,
- BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
+ IIC_iBITi, IIC_iBITr, IIC_iBITsi, and, 1>;
defm t2ORR : T2I_bin_w_irs<0b0010, "orr",
- IIC_iBITi, IIC_iBITr, IIC_iBITsi,
- BinOpFrag<(or node:$LHS, node:$RHS)>, 1>;
+ IIC_iBITi, IIC_iBITr, IIC_iBITsi, or, 1>;
defm t2EOR : T2I_bin_w_irs<0b0100, "eor",
- IIC_iBITi, IIC_iBITr, IIC_iBITsi,
- BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
+ IIC_iBITi, IIC_iBITr, IIC_iBITsi, xor, 1>;
defm t2BIC : T2I_bin_w_irs<0b0001, "bic",
IIC_iBITi, IIC_iBITr, IIC_iBITsi,
@@ -2516,7 +2505,7 @@ multiclass T2I_un_irs<bits<4> opcod, string opc,
let AddedComplexity = 1 in
defm t2MVN : T2I_un_irs <0b0011, "mvn",
IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi,
- UnOpFrag<(not node:$Src)>, 1, 1, 1>;
+ not, 1, 1, 1>;
let AddedComplexity = 1 in
def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm),
@@ -2606,8 +2595,9 @@ def t2UMLAL : T2MlaLong<0b110, 0b0000,
def t2UMAAL : T2MulLong<0b110, 0b0110,
(outs rGPR:$RdLo, rGPR:$RdHi),
- (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
+ (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
"umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">,
Requires<[IsThumb2, HasDSP]>;
} // hasSideEffects
@@ -2677,7 +2667,7 @@ def t2SMMLSR:T2FourReg<
let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1)
}
-multiclass T2I_smul<string opc, PatFrag opnode> {
+multiclass T2I_smul<string opc, SDNode opnode> {
def BB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
!strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16),
@@ -2756,7 +2746,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> {
}
-multiclass T2I_smla<string opc, PatFrag opnode> {
+multiclass T2I_smla<string opc, SDNode opnode> {
def BB : T2FourReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
!strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
@@ -2835,8 +2825,8 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
}
}
-defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
-defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm t2SMUL : T2I_smul<"smul", mul>;
+defm t2SMLA : T2I_smla<"smla", mul>;
// Halfword multiple accumulate long: SMLAL<x><y>
def t2SMLALBB : T2FourReg_mac<1, 0b100, 0b1000, (outs rGPR:$Ra,rGPR:$Rd),
@@ -2923,7 +2913,7 @@ def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
"sdiv", "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>,
- Requires<[HasDivide, IsThumb2]> {
+ Requires<[HasDivide, IsThumb, HasV8MBaseline]> {
let Inst{31-27} = 0b11111;
let Inst{26-21} = 0b011100;
let Inst{20} = 0b1;
@@ -2934,7 +2924,7 @@ def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
"udiv", "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>,
- Requires<[HasDivide, IsThumb2]> {
+ Requires<[HasDivide, IsThumb, HasV8MBaseline]> {
let Inst{31-27} = 0b11111;
let Inst{26-21} = 0b011101;
let Inst{20} = 0b1;
@@ -3080,8 +3070,7 @@ def t2CRC32CW : T2I_crc32<1, 0b10, "cw", int_arm_crc32cw>;
// Comparison Instructions...
//
defm t2CMP : T2I_cmp_irs<0b1101, "cmp",
- IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi,
- BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+ IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi, ARMcmp>;
def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_imm:$imm),
(t2CMPri GPRnopc:$lhs, t2_so_imm:$imm)>;
@@ -3288,15 +3277,18 @@ let mayLoad = 1 in {
def t2LDREXB : T2I_ldrex<0b0100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldrexb", "\t$Rt, $addr", "",
- [(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>;
+ [(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def t2LDREXH : T2I_ldrex<0b0101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldrexh", "\t$Rt, $addr", "",
- [(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>;
+ [(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr),
AddrModeNone, 4, NoItinerary,
"ldrex", "\t$Rt, $addr", "",
- [(set rGPR:$Rt, (ldrex_4 t2addrmode_imm0_1020s4:$addr))]> {
+ [(set rGPR:$Rt, (ldrex_4 t2addrmode_imm0_1020s4:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]> {
bits<4> Rt;
bits<12> addr;
let Inst{31-27} = 0b11101;
@@ -3320,17 +3312,17 @@ def t2LDAEXB : T2I_ldrex<0b1100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldaexb", "\t$Rt, $addr", "",
[(set rGPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8]>;
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
def t2LDAEXH : T2I_ldrex<0b1101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldaexh", "\t$Rt, $addr", "",
[(set rGPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8]>;
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
def t2LDAEX : Thumb2I<(outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldaex", "\t$Rt, $addr", "",
[(set rGPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8]> {
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]> {
bits<4> Rt;
bits<4> addr;
let Inst{31-27} = 0b11101;
@@ -3345,7 +3337,8 @@ def t2LDAEXD : T2I_ldrex<0b1111, (outs rGPR:$Rt, rGPR:$Rt2),
(ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldaexd", "\t$Rt, $Rt2, $addr", "",
- [], {?, ?, ?, ?}>, Requires<[IsThumb, HasV8]> {
+ [], {?, ?, ?, ?}>, Requires<[IsThumb,
+ HasAcquireRelease, HasV7Clrex, IsNotMClass]> {
bits<4> Rt2;
let Inst{11-8} = Rt2;
@@ -3359,20 +3352,23 @@ def t2STREXB : T2I_strex<0b0100, (outs rGPR:$Rd),
AddrModeNone, 4, NoItinerary,
"strexb", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
- (strex_1 rGPR:$Rt, addr_offset_none:$addr))]>;
+ (strex_1 rGPR:$Rt, addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def t2STREXH : T2I_strex<0b0101, (outs rGPR:$Rd),
(ins rGPR:$Rt, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"strexh", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
- (strex_2 rGPR:$Rt, addr_offset_none:$addr))]>;
+ (strex_2 rGPR:$Rt, addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
t2addrmode_imm0_1020s4:$addr),
AddrModeNone, 4, NoItinerary,
"strex", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
- (strex_4 rGPR:$Rt, t2addrmode_imm0_1020s4:$addr))]> {
+ (strex_4 rGPR:$Rt, t2addrmode_imm0_1020s4:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]> {
bits<4> Rd;
bits<4> Rt;
bits<12> addr;
@@ -3399,7 +3395,8 @@ def t2STLEXB : T2I_strex<0b1100, (outs rGPR:$Rd),
"stlexb", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
(stlex_1 rGPR:$Rt, addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8]>;
+ Requires<[IsThumb, HasAcquireRelease,
+ HasV7Clrex]>;
def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd),
(ins rGPR:$Rt, addr_offset_none:$addr),
@@ -3407,7 +3404,8 @@ def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd),
"stlexh", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
(stlex_2 rGPR:$Rt, addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8]>;
+ Requires<[IsThumb, HasAcquireRelease,
+ HasV7Clrex]>;
def t2STLEX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
addr_offset_none:$addr),
@@ -3415,7 +3413,7 @@ def t2STLEX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
"stlex", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
(stlex_4 rGPR:$Rt, addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8]> {
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]> {
bits<4> Rd;
bits<4> Rt;
bits<4> addr;
@@ -3431,14 +3429,15 @@ def t2STLEXD : T2I_strex<0b1111, (outs rGPR:$Rd),
(ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"stlexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
- {?, ?, ?, ?}>, Requires<[IsThumb, HasV8]> {
+ {?, ?, ?, ?}>, Requires<[IsThumb, HasAcquireRelease,
+ HasV7Clrex, IsNotMClass]> {
bits<4> Rt2;
let Inst{11-8} = Rt2;
}
}
def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", [(int_arm_clrex)]>,
- Requires<[IsThumb2, HasV7]> {
+ Requires<[IsThumb, HasV7Clrex]> {
let Inst{31-16} = 0xf3bf;
let Inst{15-14} = 0b10;
let Inst{13} = 0;
@@ -3449,22 +3448,30 @@ def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", [(int_arm_clrex)]>,
}
def : T2Pat<(and (ldrex_1 addr_offset_none:$addr), 0xff),
- (t2LDREXB addr_offset_none:$addr)>;
+ (t2LDREXB addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def : T2Pat<(and (ldrex_2 addr_offset_none:$addr), 0xffff),
- (t2LDREXH addr_offset_none:$addr)>;
+ (t2LDREXH addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def : T2Pat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
- (t2STREXB GPR:$Rt, addr_offset_none:$addr)>;
+ (t2STREXB GPR:$Rt, addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def : T2Pat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
- (t2STREXH GPR:$Rt, addr_offset_none:$addr)>;
+ (t2STREXH GPR:$Rt, addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def : T2Pat<(and (ldaex_1 addr_offset_none:$addr), 0xff),
- (t2LDAEXB addr_offset_none:$addr)>;
+ (t2LDAEXB addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
def : T2Pat<(and (ldaex_2 addr_offset_none:$addr), 0xffff),
- (t2LDAEXH addr_offset_none:$addr)>;
+ (t2LDAEXH addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
def : T2Pat<(stlex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
- (t2STLEXB GPR:$Rt, addr_offset_none:$addr)>;
+ (t2STLEXB GPR:$Rt, addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
def : T2Pat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
- (t2STLEXH GPR:$Rt, addr_offset_none:$addr)>;
+ (t2STLEXH GPR:$Rt, addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
//===----------------------------------------------------------------------===//
// SJLJ Exception handling intrinsics
@@ -3517,9 +3524,10 @@ def t2LDMIA_RET: t2PseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
let isPredicable = 1 in
-def t2B : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br,
+def t2B : T2I<(outs), (ins thumb_br_target:$target), IIC_Br,
"b", ".w\t$target",
- [(br bb:$target)]>, Sched<[WriteBr]> {
+ [(br bb:$target)]>, Sched<[WriteBr]>,
+ Requires<[IsThumb, HasV8MBaseline]> {
let Inst{31-27} = 0b11110;
let Inst{15-14} = 0b10;
let Inst{12} = 1;
@@ -3609,9 +3617,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
// IOS version.
let Uses = [SP] in
def tTAILJMPd: tPseudoExpand<(outs),
- (ins uncondbrtarget:$dst, pred:$p),
+ (ins thumb_br_target:$dst, pred:$p),
4, IIC_Br, [],
- (t2B uncondbrtarget:$dst, pred:$p)>,
+ (t2B thumb_br_target:$dst, pred:$p)>,
Requires<[IsThumb2, IsMachO]>, Sched<[WriteBr]>;
}
@@ -3647,10 +3655,10 @@ def t2BXJ : T2I<(outs), (ins GPRnopc:$func), NoItinerary, "bxj", "\t$func", []>,
// Compare and branch on zero / non-zero
let isBranch = 1, isTerminator = 1 in {
- def tCBZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
+ def tCBZ : T1I<(outs), (ins tGPR:$Rn, thumb_cb_target:$target), IIC_Br,
"cbz\t$Rn, $target", []>,
T1Misc<{0,0,?,1,?,?,?}>,
- Requires<[IsThumb2]>, Sched<[WriteBr]> {
+ Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteBr]> {
// A8.6.27
bits<6> target;
bits<3> Rn;
@@ -3659,10 +3667,10 @@ let isBranch = 1, isTerminator = 1 in {
let Inst{2-0} = Rn;
}
- def tCBNZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
+ def tCBNZ : T1I<(outs), (ins tGPR:$Rn, thumb_cb_target:$target), IIC_Br,
"cbnz\t$Rn, $target", []>,
T1Misc<{1,0,?,1,?,?,?}>,
- Requires<[IsThumb2]>, Sched<[WriteBr]> {
+ Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteBr]> {
// A8.6.27
bits<6> target;
bits<3> Rn;
@@ -3715,15 +3723,21 @@ def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm",
let Inst{7-0} = imm;
}
-def : t2InstAlias<"hint$p $imm", (t2HINT imm0_239:$imm, pred:$p)>;
-def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p)>;
-def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p)>;
-def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p)>;
-def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p)>;
-def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p)>;
-def : t2InstAlias<"sevl$p.w", (t2HINT 5, pred:$p)> {
+def : t2InstAlias<"hint$p $imm", (t2HINT imm0_239:$imm, pred:$p), 0>;
+def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p), 1>;
+def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p), 1>;
+def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p), 1>;
+def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p), 1>;
+def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p), 1>;
+def : t2InstAlias<"sevl$p.w", (t2HINT 5, pred:$p), 1> {
let Predicates = [IsThumb2, HasV8];
}
+def : t2InstAlias<"esb$p.w", (t2HINT 16, pred:$p), 1> {
+ let Predicates = [IsThumb2, HasRAS];
+}
+def : t2InstAlias<"esb$p", (t2HINT 16, pred:$p), 0> {
+ let Predicates = [IsThumb2, HasRAS];
+}
def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt",
[(int_arm_dbg imm0_15:$opt)]> {
@@ -3848,7 +3862,7 @@ def : t2InstAlias<"hvc\t$imm16", (t2HVC imm0_65535:$imm16)>;
// ERET - Return from exception in Hypervisor mode.
// B9.3.3, B9.3.20: ERET is an alias for "SUBS PC, LR, #0" in an implementation that
// includes virtualization extensions.
-def t2ERET : InstAlias<"eret${p}", (t2SUBS_PC_LR 0, pred:$p)>,
+def t2ERET : InstAlias<"eret${p}", (t2SUBS_PC_LR 0, pred:$p), 1>,
Requires<[IsThumb2, HasVirtualization]>;
//===----------------------------------------------------------------------===//
@@ -3871,7 +3885,7 @@ let isReMaterializable = 1 in {
def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr),
IIC_iMOVix2addpc,
[(set rGPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
- Requires<[IsThumb2, UseMovt]>;
+ Requires<[IsThumb, HasV8MBaseline, UseMovt]>;
}
@@ -3883,12 +3897,13 @@ def : T2Pat<(ARMWrapper tglobaltlsaddr:$dst),
Requires<[IsThumb2, UseMovt]>;
// ConstantPool, GlobalAddress, and JumpTable
-def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>;
-def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>,
- Requires<[IsThumb2, UseMovt]>;
+def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>;
+def : T2Pat<(ARMWrapper texternalsym :$dst), (t2MOVi32imm texternalsym :$dst)>,
+ Requires<[IsThumb, HasV8MBaseline, UseMovt]>;
+def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>,
+ Requires<[IsThumb, HasV8MBaseline, UseMovt]>;
-def : T2Pat<(ARMWrapperJT tjumptable:$dst),
- (t2LEApcrelJT tjumptable:$dst)>;
+def : T2Pat<(ARMWrapperJT tjumptable:$dst), (t2LEApcrelJT tjumptable:$dst)>;
// Pseudo instruction that combines ldr from constpool and add pc. This should
// be expanded into two instructions late to allow if-conversion and
@@ -3910,16 +3925,16 @@ def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
//===----------------------------------------------------------------------===//
// Coprocessor load/store -- for disassembly only
//
-class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm>
- : T2I<oops, iops, NoItinerary, opc, asm, []> {
+class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, NoItinerary, opc, asm, pattern> {
let Inst{31-28} = op31_28;
let Inst{27-25} = 0b110;
}
-multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
+multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm, list<dag> pattern> {
def _OFFSET : T2CI<op31_28,
(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
- asm, "\t$cop, $CRd, $addr"> {
+ asm, "\t$cop, $CRd, $addr", pattern> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
@@ -3936,7 +3951,7 @@ multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
}
def _PRE : T2CI<op31_28,
(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
- asm, "\t$cop, $CRd, $addr!"> {
+ asm, "\t$cop, $CRd, $addr!", []> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
@@ -3954,7 +3969,7 @@ multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
def _POST: T2CI<op31_28,
(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
postidx_imm8s4:$offset),
- asm, "\t$cop, $CRd, $addr, $offset"> {
+ asm, "\t$cop, $CRd, $addr, $offset", []> {
bits<9> offset;
bits<4> addr;
bits<4> cop;
@@ -3973,7 +3988,7 @@ multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
def _OPTION : T2CI<op31_28, (outs),
(ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
coproc_option_imm:$option),
- asm, "\t$cop, $CRd, $addr, $option"> {
+ asm, "\t$cop, $CRd, $addr, $option", []> {
bits<8> option;
bits<4> addr;
bits<4> cop;
@@ -3991,14 +4006,15 @@ multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
}
}
-defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc">;
-defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl">;
-defm t2STC : t2LdStCop<0b1110, 0, 0, "stc">;
-defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl">;
-defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2">, Requires<[PreV8,IsThumb2]>;
-defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">, Requires<[PreV8,IsThumb2]>;
-defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2">, Requires<[PreV8,IsThumb2]>;
-defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">, Requires<[PreV8,IsThumb2]>;
+defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+
+defm t2STC : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
//===----------------------------------------------------------------------===//
@@ -4070,6 +4086,7 @@ def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$SYSm), NoItinerary,
// same and the assembly parser has no way to distinguish between them. The mask
// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains
// the mask with the fields to be accessed in the special register.
+let Defs = [CPSR] in
def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn),
NoItinerary, "msr", "\t$mask, $Rn", []>,
Requires<[IsThumb2,IsNotMClass]> {
@@ -4105,6 +4122,7 @@ def t2MSRbanked : T2I<(outs), (ins banked_reg:$banked, rGPR:$Rn),
// M class MSR.
//
// Move from ARM core register to Special Register
+let Defs = [CPSR] in
def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn),
NoItinerary, "msr", "\t$SYSm, $Rn", []>,
Requires<[IsThumb,IsMClass]> {
@@ -4314,6 +4332,37 @@ def t2SETPAN : T1I<(outs), (ins imm0_1:$imm), NoItinerary, "setpan\t$imm", []>,
}
//===----------------------------------------------------------------------===//
+// ARMv8-M Security Extensions instructions
+//
+
+let hasSideEffects = 1 in
+def t2SG : T2I<(outs), (ins), NoItinerary, "sg", "", []>,
+ Requires<[Has8MSecExt]> {
+ let Inst = 0xe97fe97f;
+}
+
+class T2TT<bits<2> at, string asm, list<dag> pattern>
+ : T2I<(outs rGPR:$Rt), (ins GPRnopc:$Rn), NoItinerary, asm, "\t$Rt, $Rn",
+ pattern> {
+ bits<4> Rn;
+ bits<4> Rt;
+
+ let Inst{31-20} = 0b111010000100;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b1111;
+ let Inst{11-8} = Rt;
+ let Inst{7-6} = at;
+ let Inst{5-0} = 0b000000;
+
+ let Unpredictable{5-0} = 0b111111;
+}
+
+def t2TT : T2TT<0b00, "tt", []>, Requires<[IsThumb,Has8MSecExt]>;
+def t2TTT : T2TT<0b01, "ttt", []>, Requires<[IsThumb,Has8MSecExt]>;
+def t2TTA : T2TT<0b10, "tta", []>, Requires<[IsThumb,Has8MSecExt]>;
+def t2TTAT : T2TT<0b11, "ttat", []>, Requires<[IsThumb,Has8MSecExt]>;
+
+//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//
@@ -4488,9 +4537,9 @@ def : t2InstAlias<"tst${p} $Rn, $Rm",
(t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
// Memory barriers
-def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p)>, Requires<[HasDB]>;
-def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p)>, Requires<[HasDB]>;
-def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p)>, Requires<[HasDB]>;
+def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
// Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
// width specifier.
@@ -4535,13 +4584,13 @@ def : t2InstAlias<"mvn${s}${p} $Rd, $Rm",
def : t2InstAlias<"mvn${s}${p} $Rd, $ShiftedRm",
(t2MVNs rGPR:$Rd, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>;
-// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the
-// shift amount is zero (i.e., unspecified).
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT with the
+// input operands swapped when the shift amount is zero (i.e., unspecified).
def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
- (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
- (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2PKHBT rGPR:$Rd, rGPR:$Rm, rGPR:$Rn, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
// PUSH/POP aliases for STM/LDM
@@ -4620,16 +4669,16 @@ def : t2InstAlias<"strh${p} $Rt, $addr",
// Extend instruction optional rotate operand.
def : InstAlias<"sxtab${p} $Rd, $Rn, $Rm",
- (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : InstAlias<"sxtah${p} $Rd, $Rn, $Rm",
- (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : InstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
- (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : InstAlias<"sxtb16${p} $Rd, $Rm",
- (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>,
+ (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : t2InstAlias<"sxtb${p} $Rd, $Rm",
@@ -4642,16 +4691,16 @@ def : t2InstAlias<"sxth${p}.w $Rd, $Rm",
(t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
def : InstAlias<"uxtab${p} $Rd, $Rn, $Rm",
- (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : InstAlias<"uxtah${p} $Rd, $Rn, $Rm",
- (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : InstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
- (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : InstAlias<"uxtb16${p} $Rd, $Rm",
- (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>,
+ (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : t2InstAlias<"uxtb${p} $Rd, $Rm",
@@ -4667,7 +4716,7 @@ def : t2InstAlias<"uxth${p}.w $Rd, $Rm",
def : t2InstAlias<"uxtb${p} $Rd, $Rm$rot",
(t2UXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
def : InstAlias<"uxtb16${p} $Rd, $Rm$rot",
- (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>,
+ (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : t2InstAlias<"uxth${p} $Rd, $Rm$rot",
(t2UXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
@@ -4675,7 +4724,7 @@ def : t2InstAlias<"uxth${p} $Rd, $Rm$rot",
def : t2InstAlias<"sxtb${p} $Rd, $Rm$rot",
(t2SXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
def : InstAlias<"sxtb16${p} $Rd, $Rm$rot",
- (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>,
+ (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : t2InstAlias<"sxth${p} $Rd, $Rm$rot",
(t2SXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
@@ -4764,9 +4813,14 @@ def : t2InstAlias<"ldrsh${p}.w $Rt, $addr",
def : t2InstAlias<"add${p} $Rd, pc, $imm",
(t2ADR rGPR:$Rd, imm0_4095:$imm, pred:$p)>;
+// Pseudo instruction ldr Rt, =immediate
+def t2LDRConstPool
+ : t2AsmPseudo<"ldr${p} $Rt, $immediate",
+ (ins GPRnopc:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
+
// PLD/PLDW/PLI with alternate literal form.
def : t2InstAlias<"pld${p} $addr",
(t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>;
def : InstAlias<"pli${p} $addr",
- (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p)>,
+ (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p), 0>,
Requires<[IsThumb2,HasV7]>;
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 63e7940bb14e..e29d265ae3d1 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -30,6 +30,18 @@ def FPImmOperand : AsmOperandClass {
let ParserMethod = "parseFPImm";
}
+def vfp_f16imm : Operand<f16>,
+ PatLeaf<(f16 fpimm), [{
+ return ARM_AM::getFP16Imm(N->getValueAPF()) != -1;
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = ARM_AM::getFP16Imm(InVal);
+ return CurDAG->getTargetConstant(enc, MVT::i32);
+ }]>> {
+ let PrintMethod = "printFPImmOperand";
+ let ParserMatchClass = FPImmOperand;
+}
+
def vfp_f32imm : Operand<f32>,
PatLeaf<(f32 fpimm), [{
return ARM_AM::getFP32Imm(N->getValueAPF()) != -1;
@@ -98,6 +110,11 @@ def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
let D = VFPNeonDomain;
}
+def VLDRH : AHI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5fp16:$addr),
+ IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
+ []>,
+ Requires<[HasFullFP16]>;
+
} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr),
@@ -112,6 +129,11 @@ def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr),
let D = VFPNeonDomain;
}
+def VSTRH : AHI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5fp16:$addr),
+ IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
+ []>,
+ Requires<[HasFullFP16]>;
+
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
@@ -200,6 +222,37 @@ defm VSTM : vfp_ldst_mult<"vstm", 0, IIC_fpStore_m, IIC_fpStore_mu>;
def : MnemonicAlias<"vldm", "vldmia">;
def : MnemonicAlias<"vstm", "vstmia">;
+
+//===----------------------------------------------------------------------===//
+// Lazy load / store multiple Instructions
+//
+let mayLoad = 1 in
+def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
+ IIC_fpLoad_m, "vlldm${p}\t$Rn", "", []>,
+ Requires<[HasV8MMainline, Has8MSecExt]> {
+ let Inst{24-23} = 0b00;
+ let Inst{22} = 0;
+ let Inst{21} = 1;
+ let Inst{20} = 1;
+ let Inst{15-12} = 0;
+ let Inst{7-0} = 0;
+ let mayLoad = 1;
+}
+
+let mayStore = 1 in
+def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
+ IIC_fpStore_m, "vlstm${p}\t$Rn", "", []>,
+ Requires<[HasV8MMainline, Has8MSecExt]> {
+ let Inst{24-23} = 0b00;
+ let Inst{22} = 0;
+ let Inst{21} = 1;
+ let Inst{20} = 0;
+ let Inst{15-12} = 0;
+ let Inst{7-0} = 0;
+ let mayStore = 1;
+}
+
+
// FLDM/FSTM - Load / Store multiple single / double precision registers for
// pre-ARMv6 cores.
// These instructions are deprecated!
@@ -221,13 +274,13 @@ def : VFP2MnemonicAlias<"fstmdbd", "vstmdb">;
def : VFP2MnemonicAlias<"fstmead", "vstmia">;
def : VFP2MnemonicAlias<"fstmfdd", "vstmdb">;
-def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r)>,
+def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r), 0>,
Requires<[HasVFP2]>;
-def : InstAlias<"vpush${p} $r", (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>,
+def : InstAlias<"vpush${p} $r", (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r), 0>,
Requires<[HasVFP2]>;
-def : InstAlias<"vpop${p} $r", (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>,
+def : InstAlias<"vpop${p} $r", (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r), 0>,
Requires<[HasVFP2]>;
-def : InstAlias<"vpop${p} $r", (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r)>,
+def : InstAlias<"vpop${p} $r", (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r), 0>,
Requires<[HasVFP2]>;
defm : VFPDTAnyInstAlias<"vpush${p}", "$r",
(VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>;
@@ -295,6 +348,12 @@ def VADDS : ASbIn<0b11100, 0b11, 0, 0,
let D = VFPNeonA8Domain;
}
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VADDH : AHbI<0b11100, 0b11, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VSUBD : ADbI<0b11100, 0b11, 1, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
@@ -311,6 +370,12 @@ def VSUBS : ASbIn<0b11100, 0b11, 1, 0,
let D = VFPNeonA8Domain;
}
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VSUBH : AHbI<0b11100, 0b11, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VDIVD : ADbI<0b11101, 0b00, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
@@ -323,6 +388,12 @@ def VDIVS : ASbI<0b11101, 0b00, 0, 0,
IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm",
[(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>;
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VDIVH : AHbI<0b11101, 0b00, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VMULD : ADbI<0b11100, 0b10, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
@@ -339,6 +410,12 @@ def VMULS : ASbIn<0b11100, 0b10, 0, 0,
let D = VFPNeonA8Domain;
}
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VMULH : AHbI<0b11100, 0b10, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
def VNMULD : ADbI<0b11100, 0b10, 1, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm",
@@ -353,9 +430,20 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
let D = VFPNeonA8Domain;
}
+def VNMULH : AHbI<0b11100, 0b10, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
multiclass vsel_inst<string op, bits<2> opc, int CC> {
let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
Uses = [CPSR], AddedComplexity = 4 in {
+ def H : AHbInp<0b11100, opc, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ NoItinerary, !strconcat("vsel", op, ".f16\t$Sd, $Sn, $Sm"),
+ []>,
+ Requires<[HasFullFP16]>;
+
def S : ASbInp<0b11100, opc, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
NoItinerary, !strconcat("vsel", op, ".f32\t$Sd, $Sn, $Sm"),
@@ -378,6 +466,12 @@ defm VSELVS : vsel_inst<"vs", 0b01, 6>;
multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
+ def H : AHbInp<0b11101, 0b00, opc,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ NoItinerary, !strconcat(op, ".f16\t$Sd, $Sn, $Sm"),
+ []>,
+ Requires<[HasFullFP16]>;
+
def S : ASbInp<0b11101, 0b00, opc,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
NoItinerary, !strconcat(op, ".f32\t$Sd, $Sn, $Sm"),
@@ -418,6 +512,12 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
let D = VFPNeonA8Domain;
}
+def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
+ (outs), (ins SPR:$Sd, SPR:$Sm),
+ IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
+ []>;
+
+
// FIXME: Verify encoding after integrated assembler is working.
def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
@@ -432,6 +532,11 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
}
+
+def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0,
+ (outs), (ins SPR:$Sd, SPR:$Sm),
+ IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm",
+ []>;
} // Defs = [FPSCR_NZCV]
//===----------------------------------------------------------------------===//
@@ -452,6 +557,11 @@ def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,
let D = VFPNeonA8Domain;
}
+def VABSH : AHuI<0b11101, 0b11, 0b0000, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA16, "vabs", ".f16\t$Sd, $Sm",
+ []>;
+
let Defs = [FPSCR_NZCV] in {
def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins DPR:$Dd),
@@ -473,6 +583,14 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
let D = VFPNeonA8Domain;
}
+def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
+ (outs), (ins SPR:$Sd),
+ IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0",
+ []> {
+ let Inst{3-0} = 0b0000;
+ let Inst{5} = 0;
+}
+
// FIXME: Verify encoding after integrated assembler is working.
def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
(outs), (ins DPR:$Dd),
@@ -493,6 +611,14 @@ def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
}
+
+def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
+ (outs), (ins SPR:$Sd),
+ IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0",
+ []> {
+ let Inst{3-0} = 0b0000;
+ let Inst{5} = 0;
+}
} // Defs = [FPSCR_NZCV]
def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
@@ -627,6 +753,22 @@ def : Pat<(f64 (f16_to_fp GPR:$a)),
multiclass vcvt_inst<string opc, bits<2> rm,
SDPatternOperator node = null_frag> {
let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+ def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vcvt", opc, ".s32.f16\t$Sd, $Sm"),
+ []>,
+ Requires<[HasFullFP16]> {
+ let Inst{17-16} = rm;
+ }
+
+ def UH : AHuInp<0b11101, 0b11, 0b1100, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vcvt", opc, ".u32.f16\t$Sd, $Sm"),
+ []>,
+ Requires<[HasFullFP16]> {
+ let Inst{17-16} = rm;
+ }
+
def SS : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".s32.f32\t$Sd, $Sm"),
@@ -715,7 +857,21 @@ def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
let D = VFPNeonA8Domain;
}
+def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm",
+ []>;
+
multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
+ def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vrint", opc), ".f16\t$Sd, $Sm",
+ []>,
+ Requires<[HasFullFP16]> {
+ let Inst{7} = op2;
+ let Inst{16} = op;
+ }
+
def S : ASuI<0b11101, 0b11, 0b0110, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
NoItinerary, !strconcat("vrint", opc), ".f32\t$Sd, $Sm",
@@ -733,11 +889,14 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
let Inst{16} = op;
}
+ def : InstAlias<!strconcat("vrint", opc, "$p.f16.f16\t$Sd, $Sm"),
+ (!cast<Instruction>(NAME#"H") SPR:$Sd, SPR:$Sm, pred:$p), 0>,
+ Requires<[HasFullFP16]>;
def : InstAlias<!strconcat("vrint", opc, "$p.f32.f32\t$Sd, $Sm"),
- (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm, pred:$p)>,
+ (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm, pred:$p), 0>,
Requires<[HasFPARMv8]>;
def : InstAlias<!strconcat("vrint", opc, "$p.f64.f64\t$Dd, $Dm"),
- (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm, pred:$p)>,
+ (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm, pred:$p), 0>,
Requires<[HasFPARMv8,HasDPVFP]>;
}
@@ -748,6 +907,13 @@ defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint>;
multiclass vrint_inst_anpm<string opc, bits<2> rm,
SDPatternOperator node = null_frag> {
let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+ def H : AHuInp<0b11101, 0b11, 0b1000, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vrint", opc, ".f16\t$Sd, $Sm"),
+ []>,
+ Requires<[HasFullFP16]> {
+ let Inst{17-16} = rm;
+ }
def S : ASuInp<0b11101, 0b11, 0b1000, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
NoItinerary, !strconcat("vrint", opc, ".f32\t$Sd, $Sm"),
@@ -765,10 +931,10 @@ multiclass vrint_inst_anpm<string opc, bits<2> rm,
}
def : InstAlias<!strconcat("vrint", opc, ".f32.f32\t$Sd, $Sm"),
- (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm)>,
+ (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm), 0>,
Requires<[HasFPARMv8]>;
def : InstAlias<!strconcat("vrint", opc, ".f64.f64\t$Dd, $Dm"),
- (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>,
+ (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm), 0>,
Requires<[HasFPARMv8,HasDPVFP]>;
}
@@ -787,6 +953,11 @@ def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm",
[(set SPR:$Sd, (fsqrt SPR:$Sm))]>;
+def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm",
+ []>;
+
let hasSideEffects = 0 in {
def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
(outs DPR:$Dd), (ins DPR:$Dm),
@@ -795,6 +966,18 @@ def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
def VMOVS : ASuI<0b11101, 0b11, 0b0000, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>;
+
+let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+def VMOVH : ASuInp<0b11101, 0b11, 0b0000, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA16, "vmovx.f16\t$Sd, $Sm", []>,
+ Requires<[HasFullFP16]>;
+
+def VINSH : ASuInp<0b11101, 0b11, 0b0000, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA16, "vins.f16\t$Sd, $Sm", []>,
+ Requires<[HasFullFP16]>;
+} // PostEncoderMethod
} // hasSideEffects
//===----------------------------------------------------------------------===//
@@ -966,6 +1149,44 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010,
let DecoderMethod = "DecodeVMOVSRR";
}
+// Move H->R, clearing top 16 bits
+def VMOVRH : AVConv2I<0b11100001, 0b1001,
+ (outs GPR:$Rt), (ins SPR:$Sn),
+ IIC_fpMOVSI, "vmov", ".f16\t$Rt, $Sn",
+ []>,
+ Requires<[HasFullFP16]> {
+ // Instruction operands.
+ bits<4> Rt;
+ bits<5> Sn;
+
+ // Encode instruction operands.
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Rt;
+
+ let Inst{6-5} = 0b00;
+ let Inst{3-0} = 0b0000;
+}
+
+// Move R->H, clearing top 16 bits
+def VMOVHR : AVConv4I<0b11100000, 0b1001,
+ (outs SPR:$Sn), (ins GPR:$Rt),
+ IIC_fpMOVIS, "vmov", ".f16\t$Sn, $Rt",
+ []>,
+ Requires<[HasFullFP16]> {
+ // Instruction operands.
+ bits<5> Sn;
+ bits<4> Rt;
+
+ // Encode instruction operands.
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Rt;
+
+ let Inst{6-5} = 0b00;
+ let Inst{3-0} = 0b0000;
+}
+
// FMRDH: SPR -> GPR
// FMRDL: SPR -> GPR
// FMRRS: SPR -> GPR
@@ -1011,6 +1232,25 @@ class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{22} = Sd{0};
}
+class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+ bits<4> opcod4, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm,
+ list<dag> pattern>
+ : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+ pattern> {
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Predicates = [HasFullFP16];
+}
+
def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm",
@@ -1043,6 +1283,13 @@ def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)),
def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VSITOS (VLDRS addrmode5:$a))>;
+def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTIH, "vcvt", ".f16.s32\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 1; // s32
+}
+
def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm",
@@ -1075,6 +1322,13 @@ def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)),
def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VUITOS (VLDRS addrmode5:$a))>;
+def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTIH, "vcvt", ".f16.u32\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 0; // u32
+}
+
// FP -> Int:
class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -1113,6 +1367,25 @@ class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{22} = Sd{0};
}
+class AVConv1IsH_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+ bits<4> opcod4, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm,
+ list<dag> pattern>
+ : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+ pattern> {
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Predicates = [HasFullFP16];
+}
+
// Always set Z bit in the instruction, i.e. "round towards zero" variants.
def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
@@ -1147,6 +1420,13 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))),
addrmode5:$ptr),
(VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>;
+def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTHI, "vcvt", ".s32.f16\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 1; // Z bit
+}
+
def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm",
@@ -1180,6 +1460,13 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))),
addrmode5:$ptr),
(VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>;
+def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTHI, "vcvt", ".u32.f16\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 1; // Z bit
+}
+
// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
let Uses = [FPSCR] in {
// FIXME: Verify encoding after integrated assembler is working.
@@ -1197,6 +1484,13 @@ def VTOSIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
let Inst{7} = 0; // Z bit
}
+def VTOSIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTHI, "vcvtr", ".s32.f16\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 0; // Z bit
+}
+
def VTOUIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvtr", ".u32.f64\t$Sd, $Dm",
@@ -1210,6 +1504,13 @@ def VTOUIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
[(set SPR:$Sd, (int_arm_vcvtru SPR:$Sm))]> {
let Inst{7} = 0; // Z bit
}
+
+def VTOUIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTHI, "vcvtr", ".u32.f16\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 0; // Z bit
+}
}
// Convert between floating-point and fixed-point
@@ -1249,6 +1550,26 @@ class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
let Predicates = [HasVFP2, HasDPVFP];
}
+def VTOSHH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1001, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTHI, "vcvt", ".s16.f16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VTOUHH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1001, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTHI, "vcvt", ".u16.f16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VTOSLH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1001, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTHI, "vcvt", ".s32.f16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VTOULH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1001, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTHI, "vcvt", ".u32.f16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", []> {
@@ -1299,6 +1620,26 @@ def VTOULD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 1,
// Fixed-Point to FP:
+def VSHTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1001, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTIH, "vcvt", ".f16.s16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VUHTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1001, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTIH, "vcvt", ".f16.u16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VSLTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1001, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTIH, "vcvt", ".f16.s32\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VULTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1001, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTIH, "vcvt", ".f16.u32\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
def VSHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", []> {
@@ -1373,6 +1714,13 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
let D = VFPNeonA8Domain;
}
+def VMLAH : AHbI<0b11100, 0b00, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1400,6 +1748,13 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
let D = VFPNeonA8Domain;
}
+def VMLSH : AHbI<0b11100, 0b00, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1427,6 +1782,13 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
let D = VFPNeonA8Domain;
}
+def VNMLAH : AHbI<0b11100, 0b01, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
(VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1453,6 +1815,13 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
let D = VFPNeonA8Domain;
}
+def VNMLSH : AHbI<0b11100, 0b01, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
(VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1482,6 +1851,13 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
// VFP pipelines.
}
+def VFMAH : AHbI<0b11101, 0b10, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFusedMAC]>;
+
def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VFMAD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
@@ -1517,6 +1893,13 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
// VFP pipelines.
}
+def VFMSH : AHbI<0b11101, 0b10, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFusedMAC]>;
+
def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VFMSD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
@@ -1559,6 +1942,13 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
// VFP pipelines.
}
+def VFNMAH : AHbI<0b11101, 0b01, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFusedMAC]>;
+
def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
(VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
@@ -1600,6 +1990,13 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
// VFP pipelines.
}
+def VFNMSH : AHbI<0b11101, 0b01, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFusedMAC]>;
+
def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
(VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
@@ -1780,6 +2177,23 @@ def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
let Inst{7-4} = 0b0000;
let Inst{3-0} = imm{3-0};
}
+
+def FCONSTH : VFPAI<(outs SPR:$Sd), (ins vfp_f16imm:$imm),
+ VFPMiscFrm, IIC_fpUNA16,
+ "vmov", ".f16\t$Sd, $imm",
+ []>, Requires<[HasFullFP16]> {
+ bits<5> Sd;
+ bits<8> imm;
+
+ let Inst{27-23} = 0b11101;
+ let Inst{22} = Sd{0};
+ let Inst{21-20} = 0b11;
+ let Inst{19-16} = imm{7-4};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{7-4} = 0b0000;
+ let Inst{3-0} = imm{3-0};
+}
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 6e7e47b8706a..62d57f3f4986 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -60,9 +60,14 @@ STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's");
-namespace llvm {
-void initializeARMLoadStoreOptPass(PassRegistry &);
-}
+/// This switch disables formation of double/multi instructions that could
+/// potentially lead to (new) alignment traps even with CCR.UNALIGN_TRP
+/// disabled. This can be used to create libraries that are robust even when
+/// users provoke undefined behaviour by supplying misaligned pointers.
+/// \see mayCombineMisaligned()
+static cl::opt<bool>
+AssumeMisalignedLoadStores("arm-assume-misaligned-load-store", cl::Hidden,
+ cl::init(false), cl::desc("Be more conservative in ARM load/store opt"));
#define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass"
@@ -71,9 +76,7 @@ namespace {
/// form ldm / stm instructions.
struct ARMLoadStoreOpt : public MachineFunctionPass {
static char ID;
- ARMLoadStoreOpt() : MachineFunctionPass(ID) {
- initializeARMLoadStoreOptPass(*PassRegistry::getPassRegistry());
- }
+ ARMLoadStoreOpt() : MachineFunctionPass(ID) {}
const MachineFunction *MF;
const TargetInstrInfo *TII;
@@ -90,6 +93,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return ARM_LOAD_STORE_OPT_NAME;
}
@@ -101,8 +109,8 @@ namespace {
MachineInstr *MI;
int Offset; ///< Load/Store offset.
unsigned Position; ///< Position as counted from end of basic block.
- MemOpQueueEntry(MachineInstr *MI, int Offset, unsigned Position)
- : MI(MI), Offset(Offset), Position(Position) {}
+ MemOpQueueEntry(MachineInstr &MI, int Offset, unsigned Position)
+ : MI(&MI), Offset(Offset), Position(Position) {}
};
typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
@@ -131,17 +139,19 @@ namespace {
MachineBasicBlock::const_iterator Before);
unsigned findFreeReg(const TargetRegisterClass &RegClass);
void UpdateBaseRegUses(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- DebugLoc DL, unsigned Base, unsigned WordOffset,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ unsigned Base, unsigned WordOffset,
ARMCC::CondCodes Pred, unsigned PredReg);
- MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
- bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
- DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs);
- MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
- bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
- DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const;
+ MachineInstr *CreateLoadStoreMulti(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+ ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+ ArrayRef<std::pair<unsigned, bool>> Regs);
+ MachineInstr *CreateLoadStoreDouble(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+ ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+ ArrayRef<std::pair<unsigned, bool>> Regs) const;
void FormCandidates(const MemOpQueue &MemOps);
MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
@@ -156,10 +166,11 @@ namespace {
char ARMLoadStoreOpt::ID = 0;
}
-INITIALIZE_PASS(ARMLoadStoreOpt, "arm-load-store-opt", ARM_LOAD_STORE_OPT_NAME, false, false)
+INITIALIZE_PASS(ARMLoadStoreOpt, "arm-ldst-opt", ARM_LOAD_STORE_OPT_NAME, false,
+ false)
-static bool definesCPSR(const MachineInstr *MI) {
- for (const auto &MO : MI->operands()) {
+static bool definesCPSR(const MachineInstr &MI) {
+ for (const auto &MO : MI.operands()) {
if (!MO.isReg())
continue;
if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
@@ -171,11 +182,11 @@ static bool definesCPSR(const MachineInstr *MI) {
return false;
}
-static int getMemoryOpOffset(const MachineInstr *MI) {
- unsigned Opcode = MI->getOpcode();
+static int getMemoryOpOffset(const MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
- unsigned NumOperands = MI->getDesc().getNumOperands();
- unsigned OffField = MI->getOperand(NumOperands-3).getImm();
+ unsigned NumOperands = MI.getDesc().getNumOperands();
+ unsigned OffField = MI.getOperand(NumOperands - 3).getImm();
if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
@@ -436,12 +447,12 @@ static unsigned getLSMultipleTransferSize(const MachineInstr *MI) {
/// Update future uses of the base register with the offset introduced
/// due to writeback. This function only works on Thumb1.
-void
-ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- DebugLoc DL, unsigned Base,
- unsigned WordOffset,
- ARMCC::CondCodes Pred, unsigned PredReg) {
+void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned Base,
+ unsigned WordOffset,
+ ARMCC::CondCodes Pred,
+ unsigned PredReg) {
assert(isThumb1 && "Can only update base register uses for Thumb1!");
// Start updating any instructions with immediate offsets. Insert a SUB before
// the first non-updateable instruction (if any).
@@ -475,7 +486,7 @@ ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
InsertSub = true;
} else if ((Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) &&
- !definesCPSR(MBBI)) {
+ !definesCPSR(*MBBI)) {
// SUBS/ADDS using this register, with a dead def of the CPSR.
// Merge it with the update; if the merged offset is too large,
// insert a new sub instead.
@@ -499,7 +510,7 @@ ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
InsertSub = true;
}
- } else if (definesCPSR(MBBI) || MBBI->isCall() || MBBI->isBranch()) {
+ } else if (definesCPSR(*MBBI) || MBBI->isCall() || MBBI->isBranch()) {
// Since SUBS sets the condition flags, we can't place the base reset
// after an instruction that has a live CPSR def.
// The base register might also contain an argument for a function call.
@@ -552,7 +563,7 @@ void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
// Initialize if we never queried in this block.
if (!LiveRegsValid) {
LiveRegs.init(TRI);
- LiveRegs.addLiveOuts(&MBB, true);
+ LiveRegs.addLiveOuts(MBB);
LiveRegPos = MBB.end();
LiveRegsValid = true;
}
@@ -574,10 +585,11 @@ static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
/// Create and insert a LDM or STM with Base as base register and registers in
/// Regs as the register operands that would be loaded / stored. It returns
/// true if the transformation is done.
-MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
- bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
- DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) {
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+ ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+ ArrayRef<std::pair<unsigned, bool>> Regs) {
unsigned NumRegs = Regs.size();
assert(NumRegs > 1);
@@ -770,10 +782,11 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
return MIB.getInstr();
}
-MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
- bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
- DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const {
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+ ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+ ArrayRef<std::pair<unsigned, bool>> Regs) const {
bool IsLoad = isi32Load(Opcode);
assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;
@@ -836,11 +849,11 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
iterator InsertBefore = std::next(iterator(LatestMI));
MachineBasicBlock &MBB = *LatestMI->getParent();
- unsigned Offset = getMemoryOpOffset(First);
+ unsigned Offset = getMemoryOpOffset(*First);
unsigned Base = getLoadStoreBaseOp(*First).getReg();
bool BaseKill = LatestMI->killsRegister(Base);
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg);
DebugLoc DL = First->getDebugLoc();
MachineInstr *Merged = nullptr;
if (Cand.CanMergeToLSDouble)
@@ -916,6 +929,24 @@ static bool isValidLSDoubleOffset(int Offset) {
return (Value % 4) == 0 && Value < 1024;
}
+/// Return true for loads/stores that can be combined to a double/multi
+/// operation without increasing the requirements for alignment.
+static bool mayCombineMisaligned(const TargetSubtargetInfo &STI,
+ const MachineInstr &MI) {
+ // vldr/vstr trap on misaligned pointers anyway, forming vldm makes no
+ // difference.
+ unsigned Opcode = MI.getOpcode();
+ if (!isi32Load(Opcode) && !isi32Store(Opcode))
+ return true;
+
+ // Stack pointer alignment is out of the programmers control so we can trust
+ // SP-relative loads/stores.
+ if (getLoadStoreBaseOp(MI).getReg() == ARM::SP &&
+ STI.getFrameLowering()->getTransientStackAlignment() >= 4)
+ return true;
+ return false;
+}
+
/// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
const MachineInstr *FirstMI = MemOps[0].MI;
@@ -946,7 +977,7 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
bool CanMergeToLSMulti = true;
// On swift vldm/vstm starting with an odd register number as that needs
// more uops than single vldrs.
- if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1)
+ if (STI->hasSlowOddRegister() && !isNotVFP && (PRegNum % 2) == 1)
CanMergeToLSMulti = false;
// LDRD/STRD do not allow SP/PC. LDM/STM do not support it or have it
@@ -954,6 +985,10 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
if (PReg == ARM::SP || PReg == ARM::PC)
CanMergeToLSMulti = CanMergeToLSDouble = false;
+ // Should we be conservative?
+ if (AssumeMisalignedLoadStores && !mayCombineMisaligned(*STI, *MI))
+ CanMergeToLSMulti = CanMergeToLSDouble = false;
+
// Merge following instructions where possible.
for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
int NewOffset = MemOps[I].Offset;
@@ -1102,11 +1137,11 @@ static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
unsigned MIPredReg;
if (MI.getOperand(0).getReg() != Reg ||
MI.getOperand(1).getReg() != Reg ||
- getInstrPredicate(&MI, MIPredReg) != Pred ||
+ getInstrPredicate(MI, MIPredReg) != Pred ||
MIPredReg != PredReg)
return 0;
- if (CheckCPSRDef && definesCPSR(&MI))
+ if (CheckCPSRDef && definesCPSR(MI))
return 0;
return MI.getOperand(2).getImm() * Scale;
}
@@ -1169,7 +1204,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
unsigned Base = BaseOP.getReg();
bool BaseKill = BaseOP.isKill();
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
unsigned Opcode = MI->getOpcode();
DebugLoc DL = MI->getDebugLoc();
@@ -1193,10 +1228,30 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
} else {
MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
- ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes))
- return false;
+ ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) {
+
+ // We couldn't find an inc/dec to merge. But if the base is dead, we
+ // can still change to a writeback form as that will save us 2 bytes
+ // of code size. It can create WAW hazards though, so only do it if
+ // we're minimizing code size.
+ if (!MBB.getParent()->getFunction()->optForMinSize() || !BaseKill)
+ return false;
+
+ bool HighRegsUsed = false;
+ for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
+ if (MI->getOperand(i).getReg() >= ARM::R8) {
+ HighRegsUsed = true;
+ break;
+ }
+
+ if (!HighRegsUsed)
+ MergeInstr = MBB.end();
+ else
+ return false;
+ }
}
- MBB.erase(MergeInstr);
+ if (MergeInstr != MBB.end())
+ MBB.erase(MergeInstr);
unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
@@ -1291,7 +1346,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
return false;
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
int Bytes = getLSMultipleTransferSize(MI);
MachineBasicBlock &MBB = *MI->getParent();
MachineBasicBlock::iterator MBBI(MI);
@@ -1388,7 +1443,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
return false;
unsigned PredReg;
- ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
MachineBasicBlock::iterator MBBI(MI);
MachineBasicBlock &MBB = *MI.getParent();
int Offset;
@@ -1487,14 +1542,13 @@ static bool isMemoryOp(const MachineInstr &MI) {
}
static void InsertLDR_STR(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- int Offset, bool isDef,
- DebugLoc DL, unsigned NewOpc,
+ MachineBasicBlock::iterator &MBBI, int Offset,
+ bool isDef, const DebugLoc &DL, unsigned NewOpc,
unsigned Reg, bool RegDeadKill, bool RegUndef,
unsigned BaseReg, bool BaseKill, bool BaseUndef,
- bool OffKill, bool OffUndef,
- ARMCC::CondCodes Pred, unsigned PredReg,
- const TargetInstrInfo *TII, bool isT2) {
+ bool OffKill, bool OffUndef, ARMCC::CondCodes Pred,
+ unsigned PredReg, const TargetInstrInfo *TII,
+ bool isT2) {
if (isDef) {
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
TII->get(NewOpc))
@@ -1547,9 +1601,9 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
bool BaseUndef = BaseOp.isUndef();
bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
- int OffImm = getMemoryOpOffset(MI);
+ int OffImm = getMemoryOpOffset(*MI);
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
if (OddRegNum > EvenRegNum && OffImm == 0) {
// Ascending register numbers and no offset. It's safe to change it to a
@@ -1655,14 +1709,14 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
unsigned Reg = MO.getReg();
unsigned Base = getLoadStoreBaseOp(*MBBI).getReg();
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
- int Offset = getMemoryOpOffset(MBBI);
+ ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg);
+ int Offset = getMemoryOpOffset(*MBBI);
if (CurrBase == 0) {
// Start of a new chain.
CurrBase = Base;
CurrOpc = Opcode;
CurrPred = Pred;
- MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
+ MemOps.push_back(MemOpQueueEntry(*MBBI, Offset, Position));
continue;
}
// Note: No need to match PredReg in the next if.
@@ -1690,7 +1744,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
if (!Overlap) {
// Check offset and sort memory operation into the current chain.
if (Offset > MemOps.back().Offset) {
- MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
+ MemOps.push_back(MemOpQueueEntry(*MBBI, Offset, Position));
continue;
} else {
MemOpQueue::iterator MI, ME;
@@ -1706,7 +1760,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
}
}
if (MI != MemOps.end()) {
- MemOps.insert(MI, MemOpQueueEntry(MBBI, Offset, Position));
+ MemOps.insert(MI, MemOpQueueEntry(*MBBI, Offset, Position));
continue;
}
}
@@ -1723,7 +1777,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
MBBI->getOpcode() == ARM::t2STRDi8) {
// ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions
// remember them because we may still be able to merge add/sub into them.
- MergeBaseCandidates.push_back(MBBI);
+ MergeBaseCandidates.push_back(&*MBBI);
}
@@ -1805,20 +1859,20 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
// Ignore any DBG_VALUE instructions.
while (PrevI->isDebugValue() && PrevI != MBB.begin())
--PrevI;
- MachineInstr *PrevMI = PrevI;
- unsigned Opcode = PrevMI->getOpcode();
+ MachineInstr &PrevMI = *PrevI;
+ unsigned Opcode = PrevMI.getOpcode();
if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
- MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
+ MachineOperand &MO = PrevMI.getOperand(PrevMI.getNumOperands() - 1);
if (MO.getReg() != ARM::LR)
return false;
unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
- PrevMI->setDesc(TII->get(NewOpc));
+ PrevMI.setDesc(TII->get(NewOpc));
MO.setReg(ARM::PC);
- PrevMI->copyImplicitOps(*MBB.getParent(), &*MBBI);
+ PrevMI.copyImplicitOps(*MBB.getParent(), *MBBI);
MBB.erase(MBBI);
return true;
}
@@ -1840,8 +1894,8 @@ bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) {
for (auto Use : Prev->uses())
if (Use.isKill()) {
AddDefaultPred(BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX))
- .addReg(Use.getReg(), RegState::Kill))
- .copyImplicitOps(&*MBBI);
+ .addReg(Use.getReg(), RegState::Kill))
+ .copyImplicitOps(*MBBI);
MBB.erase(MBBI);
MBB.erase(Prev);
return true;
@@ -1851,6 +1905,9 @@ bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) {
}
bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
MF = &Fn;
STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
TL = STI->getTargetLowering();
@@ -1877,10 +1934,6 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
return Modified;
}
-namespace llvm {
-void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
-}
-
#define ARM_PREALLOC_LOAD_STORE_OPT_NAME \
"ARM pre- register allocation load / store optimization pass"
@@ -1889,9 +1942,7 @@ namespace {
/// locations close to make it more likely they will be combined later.
struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
static char ID;
- ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {
- initializeARMPreAllocLoadStoreOptPass(*PassRegistry::getPassRegistry());
- }
+ ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
const DataLayout *TD;
const TargetInstrInfo *TII;
@@ -1922,10 +1973,13 @@ namespace {
char ARMPreAllocLoadStoreOpt::ID = 0;
}
-INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-load-store-opt",
+INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+ if (AssumeMisalignedLoadStores || skipFunction(*Fn.getFunction()))
+ return false;
+
TD = &Fn.getDataLayout();
STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
TII = STI->getInstrInfo();
@@ -2034,7 +2088,7 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
return false;
// Then make sure the immediate offset fits.
- int OffImm = getMemoryOpOffset(Op0);
+ int OffImm = getMemoryOpOffset(*Op0);
if (isT2) {
int Limit = (1 << 8) * Scale;
if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
@@ -2056,7 +2110,7 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
if (FirstReg == SecondReg)
return false;
BaseReg = Op0->getOperand(1).getReg();
- Pred = getInstrPredicate(Op0, PredReg);
+ Pred = getInstrPredicate(*Op0, PredReg);
dl = Op0->getDebugLoc();
return true;
}
@@ -2070,11 +2124,11 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
// Sort by offset (in reverse order).
std::sort(Ops.begin(), Ops.end(),
[](const MachineInstr *LHS, const MachineInstr *RHS) {
- int LOffset = getMemoryOpOffset(LHS);
- int ROffset = getMemoryOpOffset(RHS);
- assert(LHS == RHS || LOffset != ROffset);
- return LOffset > ROffset;
- });
+ int LOffset = getMemoryOpOffset(*LHS);
+ int ROffset = getMemoryOpOffset(*RHS);
+ assert(LHS == RHS || LOffset != ROffset);
+ return LOffset > ROffset;
+ });
// The loads / stores of the same base are in order. Scan them from first to
// last and check for the following:
@@ -2106,7 +2160,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
if (LastOpcode && LSMOpcode != LastOpcode)
break;
- int Offset = getMemoryOpOffset(Op);
+ int Offset = getMemoryOpOffset(*Op);
unsigned Bytes = getLSMultipleTransferSize(Op);
if (LastBytes) {
if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
@@ -2141,8 +2195,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
} else {
// This is the new location for the loads / stores.
MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
- while (InsertPos != MBB->end()
- && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
+ while (InsertPos != MBB->end() &&
+ (MemOps.count(&*InsertPos) || InsertPos->isDebugValue()))
++InsertPos;
// If we are moving a pair of loads / stores, see if it makes sense
@@ -2237,25 +2291,25 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
MachineBasicBlock::iterator E = MBB->end();
while (MBBI != E) {
for (; MBBI != E; ++MBBI) {
- MachineInstr *MI = MBBI;
- if (MI->isCall() || MI->isTerminator()) {
+ MachineInstr &MI = *MBBI;
+ if (MI.isCall() || MI.isTerminator()) {
// Stop at barriers.
++MBBI;
break;
}
- if (!MI->isDebugValue())
- MI2LocMap[MI] = ++Loc;
+ if (!MI.isDebugValue())
+ MI2LocMap[&MI] = ++Loc;
- if (!isMemoryOp(*MI))
+ if (!isMemoryOp(MI))
continue;
unsigned PredReg = 0;
if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
continue;
- int Opc = MI->getOpcode();
+ int Opc = MI.getOpcode();
bool isLd = isLoadSingle(Opc);
- unsigned Base = MI->getOperand(1).getReg();
+ unsigned Base = MI.getOperand(1).getReg();
int Offset = getMemoryOpOffset(MI);
bool StopHere = false;
@@ -2264,15 +2318,15 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
Base2LdsMap.find(Base);
if (BI != Base2LdsMap.end()) {
for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
- if (Offset == getMemoryOpOffset(BI->second[i])) {
+ if (Offset == getMemoryOpOffset(*BI->second[i])) {
StopHere = true;
break;
}
}
if (!StopHere)
- BI->second.push_back(MI);
+ BI->second.push_back(&MI);
} else {
- Base2LdsMap[Base].push_back(MI);
+ Base2LdsMap[Base].push_back(&MI);
LdBases.push_back(Base);
}
} else {
@@ -2280,15 +2334,15 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
Base2StsMap.find(Base);
if (BI != Base2StsMap.end()) {
for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
- if (Offset == getMemoryOpOffset(BI->second[i])) {
+ if (Offset == getMemoryOpOffset(*BI->second[i])) {
StopHere = true;
break;
}
}
if (!StopHere)
- BI->second.push_back(MI);
+ BI->second.push_back(&MI);
} else {
- Base2StsMap[Base].push_back(MI);
+ Base2StsMap[Base].push_back(&MI);
StBases.push_back(Base);
}
}
@@ -2335,4 +2389,3 @@ FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
return new ARMPreAllocLoadStoreOpt();
return new ARMLoadStoreOpt();
}
-
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index a2aca2d1a69e..7429acdb09ad 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -26,33 +26,22 @@ using namespace llvm;
MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
const MCSymbol *Symbol) {
- const MCExpr *Expr;
- unsigned Option = MO.getTargetFlags() & ARMII::MO_OPTION_MASK;
- switch (Option) {
- default: {
- Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None,
- OutContext);
- switch (Option) {
- default: llvm_unreachable("Unknown target flag on symbol operand");
- case ARMII::MO_NO_FLAG:
- break;
- case ARMII::MO_LO16:
- Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None,
- OutContext);
- Expr = ARMMCExpr::createLower16(Expr, OutContext);
- break;
- case ARMII::MO_HI16:
- Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None,
- OutContext);
- Expr = ARMMCExpr::createUpper16(Expr, OutContext);
- break;
- }
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+ switch (MO.getTargetFlags() & ARMII::MO_OPTION_MASK) {
+ default:
+ llvm_unreachable("Unknown target flag on symbol operand");
+ case ARMII::MO_NO_FLAG:
break;
- }
-
- case ARMII::MO_PLT:
- Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_PLT,
- OutContext);
+ case ARMII::MO_LO16:
+ Expr =
+ MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+ Expr = ARMMCExpr::createLower16(Expr, OutContext);
+ break;
+ case ARMII::MO_HI16:
+ Expr =
+ MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+ Expr = ARMMCExpr::createUpper16(Expr, OutContext);
break;
}
@@ -89,7 +78,7 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
break;
}
case MachineOperand::MO_ExternalSymbol:
- MCOp = GetSymbolRef(MO,
+ MCOp = GetSymbolRef(MO,
GetExternalSymbolSymbol(MO.getSymbolName()));
break;
case MachineOperand::MO_JumpTableIndex:
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/lib/Target/ARM/ARMMachineFunctionInfo.cpp
index 71ad7a4a732a..b6dee9ff8385 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.cpp
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -21,4 +21,4 @@ ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF)
FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false),
- IsSplitCSR(false) {}
+ ArgumentStackSize(0), IsSplitCSR(false) {}
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 68f9aec8cae5..f71497240ff3 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -15,7 +15,6 @@
#define LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
#include "ARMSubtarget.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
index 30baf4263c11..73dcb9641b61 100644
--- a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
+++ b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
@@ -27,6 +27,11 @@ public:
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "optimise barriers pass";
}
@@ -46,6 +51,9 @@ static bool CanMovePastDMB(const MachineInstr *MI) {
}
bool ARMOptimizeBarriersPass::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
// Vector to store the DMBs we will remove after the first iteration
std::vector<MachineInstr *> ToRemove;
// DMBType is the Imm value of the first operand. It determines whether it's a
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 528c4ec73781..47a99313025c 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -94,7 +94,7 @@ def : PredicateProlog<[{
(void)TII;
}]>;
-def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(MI)}]>;
+def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(*MI)}]>;
//===----------------------------------------------------------------------===//
// Instruction Itinerary classes used for ARM
@@ -186,38 +186,50 @@ def IIC_iStore_mu : InstrItinClass;
def IIC_Preload : InstrItinClass;
def IIC_Br : InstrItinClass;
def IIC_fpSTAT : InstrItinClass;
+def IIC_fpUNA16 : InstrItinClass;
def IIC_fpUNA32 : InstrItinClass;
def IIC_fpUNA64 : InstrItinClass;
+def IIC_fpCMP16 : InstrItinClass;
def IIC_fpCMP32 : InstrItinClass;
def IIC_fpCMP64 : InstrItinClass;
def IIC_fpCVTSD : InstrItinClass;
def IIC_fpCVTDS : InstrItinClass;
def IIC_fpCVTSH : InstrItinClass;
def IIC_fpCVTHS : InstrItinClass;
+def IIC_fpCVTIH : InstrItinClass;
def IIC_fpCVTIS : InstrItinClass;
def IIC_fpCVTID : InstrItinClass;
+def IIC_fpCVTHI : InstrItinClass;
def IIC_fpCVTSI : InstrItinClass;
def IIC_fpCVTDI : InstrItinClass;
def IIC_fpMOVIS : InstrItinClass;
def IIC_fpMOVID : InstrItinClass;
def IIC_fpMOVSI : InstrItinClass;
def IIC_fpMOVDI : InstrItinClass;
+def IIC_fpALU16 : InstrItinClass;
def IIC_fpALU32 : InstrItinClass;
def IIC_fpALU64 : InstrItinClass;
+def IIC_fpMUL16 : InstrItinClass;
def IIC_fpMUL32 : InstrItinClass;
def IIC_fpMUL64 : InstrItinClass;
+def IIC_fpMAC16 : InstrItinClass;
def IIC_fpMAC32 : InstrItinClass;
def IIC_fpMAC64 : InstrItinClass;
+def IIC_fpFMAC16 : InstrItinClass;
def IIC_fpFMAC32 : InstrItinClass;
def IIC_fpFMAC64 : InstrItinClass;
+def IIC_fpDIV16 : InstrItinClass;
def IIC_fpDIV32 : InstrItinClass;
def IIC_fpDIV64 : InstrItinClass;
+def IIC_fpSQRT16 : InstrItinClass;
def IIC_fpSQRT32 : InstrItinClass;
def IIC_fpSQRT64 : InstrItinClass;
+def IIC_fpLoad16 : InstrItinClass;
def IIC_fpLoad32 : InstrItinClass;
def IIC_fpLoad64 : InstrItinClass;
def IIC_fpLoad_m : InstrItinClass;
def IIC_fpLoad_mu : InstrItinClass;
+def IIC_fpStore16 : InstrItinClass;
def IIC_fpStore32 : InstrItinClass;
def IIC_fpStore64 : InstrItinClass;
def IIC_fpStore_m : InstrItinClass;
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td
index 2c6382542ab9..ba380cba100f 100644
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -1065,11 +1065,11 @@ def CortexA8Itineraries : ProcessorItineraries<
// Cortex-A8 machine model for scheduling and other instruction cost heuristics.
def CortexA8Model : SchedMachineModel {
let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
- let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
let LoadLatency = 2; // Optimistic load latency assuming bypass.
// This is overriden by OperandCycles if the
// Itineraries are queried instead.
let MispredictPenalty = 13; // Based on estimate of pipeline depth.
+ let CompleteModel = 0;
let Itineraries = CortexA8Itineraries;
}
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 9a1d22275646..519e595bd184 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -2025,12 +2025,12 @@ def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>;
// Define a predicate to select the LDM based on number of memory addresses.
def A9LMAdr#NumAddr#Pred :
- SchedPredicate<"(TII->getNumLDMAddresses(MI)+1)/2 == "#NumAddr>;
+ SchedPredicate<"(TII->getNumLDMAddresses(*MI)+1)/2 == "#NumAddr>;
} // foreach NumAddr
// Fall-back for unknown LDMs.
-def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(MI) == 0">;
+def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(*MI) == 0">;
// LDM/VLDM/VLDn address generation latency & resources.
// Dynamically select the A9WriteAdrN sequence using a predicate.
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index 3ad7730228e5..ea2bf4b578f0 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -374,7 +374,7 @@ let SchedModel = SwiftModel in {
}
// Predicate.
foreach NumAddr = 1-16 in {
- def SwiftLMAddr#NumAddr#Pred : SchedPredicate<"TII->getNumLDMAddresses(MI) == "#NumAddr>;
+ def SwiftLMAddr#NumAddr#Pred : SchedPredicate<"TII->getNumLDMAddresses(*MI) == "#NumAddr>;
}
def SwiftWriteLDMAddrNoWB : SchedWriteRes<[SwiftUnitP01]> { let Latency = 0; }
def SwiftWriteLDMAddrWB : SchedWriteRes<[SwiftUnitP01, SwiftUnitP01]>;
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 6fded9c8ab73..3b99762f7157 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -21,12 +21,9 @@ using namespace llvm;
// Emit, if possible, a specialized version of the given Libcall. Typically this
// means selecting the appropriately aligned version, but we also convert memset
// of 0 into memclr.
-SDValue ARMSelectionDAGInfo::
-EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- RTLIB::Libcall LC) const {
+SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
const ARMSubtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
@@ -121,21 +118,17 @@ EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl,
TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
TLI->getPointerTy(DAG.getDataLayout())),
- std::move(Args), 0)
+ std::move(Args))
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
return CallResult.second;
}
-SDValue
-ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- bool isVolatile, bool AlwaysInline,
- MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const {
+SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
const ARMSubtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
// Do repeated 4-byte loads and stores. To be improved.
@@ -176,6 +169,12 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
// emit.
unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
+ // Code size optimisation: do not inline memcpy if expansion results in
+  // more instructions than the library call.
+ if (NumMEMCPYs > 1 && DAG.getMachineFunction().getFunction()->optForMinSize()) {
+ return SDValue();
+ }
+
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
for (unsigned I = 0; I != NumMEMCPYs; ++I) {
@@ -213,8 +212,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
Loads[i] = DAG.getLoad(VT, dl, Chain,
DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
DAG.getConstant(SrcOff, dl, MVT::i32)),
- SrcPtrInfo.getWithOffset(SrcOff),
- false, false, false, 0);
+ SrcPtrInfo.getWithOffset(SrcOff));
TFOps[i] = Loads[i].getValue(1);
++i;
SrcOff += VTSize;
@@ -237,7 +235,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
DAG.getConstant(DstOff, dl, MVT::i32)),
- DstPtrInfo.getWithOffset(DstOff), false, false, 0);
+ DstPtrInfo.getWithOffset(DstOff));
++i;
DstOff += VTSize;
BytesLeft -= VTSize;
@@ -246,26 +244,18 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
makeArrayRef(TFOps, i));
}
-
-SDValue ARMSelectionDAGInfo::
-EmitTargetCodeForMemmove(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- bool isVolatile,
- MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const {
+SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
RTLIB::MEMMOVE);
}
-
-SDValue ARMSelectionDAGInfo::
-EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain, SDValue Dst,
- SDValue Src, SDValue Size,
- unsigned Align, bool isVolatile,
- MachinePointerInfo DstPtrInfo) const {
+SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const {
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
RTLIB::MEMSET);
}
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h
index 289879ee1d7e..2ddb42c95397 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the ARM subclass for TargetSelectionDAGInfo.
+// This file defines the ARM subclass for SelectionDAGTargetInfo.
//
//===----------------------------------------------------------------------===//
@@ -15,7 +15,8 @@
#define LLVM_LIB_TARGET_ARM_ARMSELECTIONDAGINFO_H
#include "MCTargetDesc/ARMAddressingModes.h"
-#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
namespace llvm {
@@ -35,35 +36,30 @@ namespace ARM_AM {
}
} // end namespace ARM_AM
-class ARMSelectionDAGInfo : public TargetSelectionDAGInfo {
+class ARMSelectionDAGInfo : public SelectionDAGTargetInfo {
public:
-
- SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- bool isVolatile, bool AlwaysInline,
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
- SDValue EmitTargetCodeForMemmove(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
- MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const override;
+ SDValue
+ EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
+ SDValue Dst, SDValue Src, SDValue Size,
+ unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
// Adjust parameters for memset, see RTABI section 4.3.4
- SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Op1, SDValue Op2,
- SDValue Op3, unsigned Align,
- bool isVolatile,
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Op1, SDValue Op2,
+ SDValue Op3, unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const override;
- SDValue EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
+ SDValue EmitSpecializedLibcall(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
RTLIB::Libcall LC) const;
};
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index bb6ae28065bd..1d7eef9ddcfd 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -88,10 +88,9 @@ ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU,
ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const ARMBaseTargetMachine &TM, bool IsLittle)
- : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
- ARMProcClass(None), ARMArch(ARMv4t), stackAlignment(4), CPUString(CPU),
- IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM),
- FrameLowering(initializeFrameLowering(CPU, FS)),
+ : ARMGenSubtargetInfo(TT, CPU, FS), UseMulOps(UseFusedMulOps),
+ CPUString(CPU), IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options),
+ TM(TM), FrameLowering(initializeFrameLowering(CPU, FS)),
// At this point initializeSubtargetDependencies has been called so
// we can query directly.
InstrInfo(isThumb1Only()
@@ -102,63 +101,10 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
TLInfo(TM, *this) {}
void ARMSubtarget::initializeEnvironment() {
- HasV4TOps = false;
- HasV5TOps = false;
- HasV5TEOps = false;
- HasV6Ops = false;
- HasV6MOps = false;
- HasV6KOps = false;
- HasV6T2Ops = false;
- HasV7Ops = false;
- HasV8Ops = false;
- HasV8_1aOps = false;
- HasV8_2aOps = false;
- HasVFPv2 = false;
- HasVFPv3 = false;
- HasVFPv4 = false;
- HasFPARMv8 = false;
- HasNEON = false;
- UseNEONForSinglePrecisionFP = false;
- UseMulOps = UseFusedMulOps;
- SlowFPVMLx = false;
- HasVMLxForwarding = false;
- SlowFPBrcc = false;
- InThumbMode = false;
- UseSoftFloat = false;
- HasThumb2 = false;
- NoARM = false;
- ReserveR9 = false;
- NoMovt = false;
- SupportsTailCall = false;
- HasFP16 = false;
- HasFullFP16 = false;
- HasD16 = false;
- HasHardwareDivide = false;
- HasHardwareDivideInARM = false;
- HasT2ExtractPack = false;
- HasDataBarrier = false;
- Pref32BitThumb = false;
- AvoidCPSRPartialUpdate = false;
- AvoidMOVsShifterOperand = false;
- HasRAS = false;
- HasMPExtension = false;
- HasVirtualization = false;
- FPOnlySP = false;
- HasPerfMon = false;
- HasTrustZone = false;
- HasCrypto = false;
- HasCRC = false;
- HasZeroCycleZeroing = false;
- StrictAlign = false;
- HasDSP = false;
- UseNaClTrap = false;
- GenLongCalls = false;
- UnsafeFPMath = false;
-
// MCAsmInfo isn't always present (e.g. in opt) so we can't initialize this
// directly from it, but we can try to make sure they're consistent when both
// available.
- UseSjLjEH = isTargetDarwin() && !isTargetWatchOS();
+ UseSjLjEH = isTargetDarwin() && !isTargetWatchABI();
assert((!TM.getMCAsmInfo() ||
(TM.getMCAsmInfo()->getExceptionHandlingType() ==
ExceptionHandling::SjLj) == UseSjLjEH) &&
@@ -230,7 +176,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// registers are the 4 used for parameters. We don't currently do this
// case.
- SupportsTailCall = !isThumb1Only();
+ SupportsTailCall = !isThumb() || hasV8MBaselineOps();
if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0))
SupportsTailCall = false;
@@ -252,6 +198,53 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if ((Bits[ARM::ProcA5] || Bits[ARM::ProcA8]) && // Where this matters
(Options.UnsafeFPMath || isTargetDarwin()))
UseNEONForSinglePrecisionFP = true;
+
+ // FIXME: Teach TableGen to deal with these instead of doing it manually here.
+ switch (ARMProcFamily) {
+ case Others:
+ case CortexA5:
+ break;
+ case CortexA7:
+ LdStMultipleTiming = DoubleIssue;
+ break;
+ case CortexA8:
+ LdStMultipleTiming = DoubleIssue;
+ break;
+ case CortexA9:
+ LdStMultipleTiming = DoubleIssueCheckUnalignedAccess;
+ PreISelOperandLatencyAdjustment = 1;
+ break;
+ case CortexA12:
+ break;
+ case CortexA15:
+ MaxInterleaveFactor = 2;
+ PreISelOperandLatencyAdjustment = 1;
+ PartialUpdateClearance = 12;
+ break;
+ case CortexA17:
+ case CortexA32:
+ case CortexA35:
+ case CortexA53:
+ case CortexA57:
+ case CortexA72:
+ case CortexA73:
+ case CortexR4:
+ case CortexR4F:
+ case CortexR5:
+ case CortexR7:
+ case CortexM3:
+ case ExynosM1:
+ break;
+ case Krait:
+ PreISelOperandLatencyAdjustment = 1;
+ break;
+ case Swift:
+ MaxInterleaveFactor = 2;
+ LdStMultipleTiming = SingleIssuePlusExtras;
+ PreISelOperandLatencyAdjustment = 1;
+ PartialUpdateClearance = 12;
+ break;
+ }
}
bool ARMSubtarget::isAPCS_ABI() const {
@@ -268,40 +261,16 @@ bool ARMSubtarget::isAAPCS16_ABI() const {
return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
}
+bool ARMSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
+ if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+ return true;
-/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol.
-bool
-ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,
- Reloc::Model RelocM) const {
- if (RelocM == Reloc::Static)
- return false;
-
- bool isDef = GV->isStrongDefinitionForLinker();
-
- if (!isTargetMachO()) {
- // Extra load is needed for all externally visible.
- if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
- return false;
+ // 32 bit macho has no relocation for a-b if a is undefined, even if b is in
+ // the section that is being relocated. This means we have to use o load even
+ // for GVs that are known to be local to the dso.
+ if (isTargetDarwin() && TM.isPositionIndependent() &&
+ (GV->isDeclarationForLinker() || GV->hasCommonLinkage()))
return true;
- } else {
- // If this is a strong reference to a definition, it is definitely not
- // through a stub.
- if (isDef)
- return false;
-
- // Unless we have a symbol with hidden visibility, we have to go through a
- // normal $non_lazy_ptr stub because this symbol might be resolved late.
- if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
- return true;
-
- if (RelocM == Reloc::PIC_) {
- // If symbol visibility is hidden, we have a stub for common symbol
- // references and external declarations.
- if (GV->isDeclarationForLinker() || GV->hasCommonLinkage())
- // Hidden $non_lazy_ptr reference.
- return true;
- }
- }
return false;
}
@@ -332,21 +301,21 @@ bool ARMSubtarget::enablePostRAScheduler() const {
}
bool ARMSubtarget::enableAtomicExpand() const {
- return hasAnyDataBarrier() && !isThumb1Only();
+ return hasAnyDataBarrier() && (!isThumb() || hasV8MBaselineOps());
}
bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const {
// For general targets, the prologue can grow when VFPs are allocated with
// stride 4 (more vpush instructions). But WatchOS uses a compact unwind
// format which it's more important to get right.
- return isTargetWatchOS() || (isSwift() && !MF.getFunction()->optForMinSize());
+ return isTargetWatchABI() || (isSwift() && !MF.getFunction()->optForMinSize());
}
bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
// NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit
// immediates as it is inherently position independent, and may be out of
// range otherwise.
- return !NoMovt && hasV6T2Ops() &&
+ return !NoMovt && hasV8MBaselineOps() &&
(isTargetWindows() || !MF.getFunction()->optForMinSize());
}
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 4d54e5751473..910de0e1e72d 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -43,8 +43,9 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
protected:
enum ARMProcFamilyEnum {
Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
- CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA35, CortexA53,
- CortexA57, CortexA72, Krait, Swift, ExynosM1
+ CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexM3,
+ CortexA32, CortexA35, CortexA53, CortexA57, CortexA72, CortexA73,
+ Krait, Swift, ExynosM1
};
enum ARMProcClassEnum {
None, AClass, RClass, MClass
@@ -52,188 +53,275 @@ protected:
enum ARMArchEnum {
ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te,
ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r,
- ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a
+ ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a, ARMv8mMainline, ARMv8mBaseline
};
+public:
+ /// What kind of timing do load multiple/store multiple instructions have.
+ enum ARMLdStMultipleTiming {
+ /// Can load/store 2 registers/cycle.
+ DoubleIssue,
+ /// Can load/store 2 registers/cycle, but needs an extra cycle if the access
+ /// is not 64-bit aligned.
+ DoubleIssueCheckUnalignedAccess,
+ /// Can load/store 1 register/cycle.
+ SingleIssue,
+ /// Can load/store 1 register/cycle, but needs an extra cycle for address
+ /// computation and potentially also for register writeback.
+ SingleIssuePlusExtras,
+ };
+
+protected:
/// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
- ARMProcFamilyEnum ARMProcFamily;
+ ARMProcFamilyEnum ARMProcFamily = Others;
/// ARMProcClass - ARM processor class: None, AClass, RClass or MClass.
- ARMProcClassEnum ARMProcClass;
+ ARMProcClassEnum ARMProcClass = None;
/// ARMArch - ARM architecture
- ARMArchEnum ARMArch;
+ ARMArchEnum ARMArch = ARMv4t;
/// HasV4TOps, HasV5TOps, HasV5TEOps,
/// HasV6Ops, HasV6MOps, HasV6KOps, HasV6T2Ops, HasV7Ops, HasV8Ops -
/// Specify whether target support specific ARM ISA variants.
- bool HasV4TOps;
- bool HasV5TOps;
- bool HasV5TEOps;
- bool HasV6Ops;
- bool HasV6MOps;
- bool HasV6KOps;
- bool HasV6T2Ops;
- bool HasV7Ops;
- bool HasV8Ops;
- bool HasV8_1aOps;
- bool HasV8_2aOps;
+ bool HasV4TOps = false;
+ bool HasV5TOps = false;
+ bool HasV5TEOps = false;
+ bool HasV6Ops = false;
+ bool HasV6MOps = false;
+ bool HasV6KOps = false;
+ bool HasV6T2Ops = false;
+ bool HasV7Ops = false;
+ bool HasV8Ops = false;
+ bool HasV8_1aOps = false;
+ bool HasV8_2aOps = false;
+ bool HasV8MBaselineOps = false;
+ bool HasV8MMainlineOps = false;
/// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what
/// floating point ISAs are supported.
- bool HasVFPv2;
- bool HasVFPv3;
- bool HasVFPv4;
- bool HasFPARMv8;
- bool HasNEON;
+ bool HasVFPv2 = false;
+ bool HasVFPv3 = false;
+ bool HasVFPv4 = false;
+ bool HasFPARMv8 = false;
+ bool HasNEON = false;
/// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been
/// specified. Use the method useNEONForSinglePrecisionFP() to
/// determine if NEON should actually be used.
- bool UseNEONForSinglePrecisionFP;
+ bool UseNEONForSinglePrecisionFP = false;
/// UseMulOps - True if non-microcoded fused integer multiply-add and
/// multiply-subtract instructions should be used.
- bool UseMulOps;
+ bool UseMulOps = false;
/// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates
/// whether the FP VML[AS] instructions are slow (if so, don't use them).
- bool SlowFPVMLx;
+ bool SlowFPVMLx = false;
/// HasVMLxForwarding - If true, NEON has special multiplier accumulator
/// forwarding to allow mul + mla being issued back to back.
- bool HasVMLxForwarding;
+ bool HasVMLxForwarding = false;
/// SlowFPBrcc - True if floating point compare + branch is slow.
- bool SlowFPBrcc;
+ bool SlowFPBrcc = false;
/// InThumbMode - True if compiling for Thumb, false for ARM.
- bool InThumbMode;
+ bool InThumbMode = false;
/// UseSoftFloat - True if we're using software floating point features.
- bool UseSoftFloat;
+ bool UseSoftFloat = false;
/// HasThumb2 - True if Thumb2 instructions are supported.
- bool HasThumb2;
+ bool HasThumb2 = false;
/// NoARM - True if subtarget does not support ARM mode execution.
- bool NoARM;
+ bool NoARM = false;
/// ReserveR9 - True if R9 is not available as a general purpose register.
- bool ReserveR9;
+ bool ReserveR9 = false;
/// NoMovt - True if MOVT / MOVW pairs are not used for materialization of
/// 32-bit imms (including global addresses).
- bool NoMovt;
+ bool NoMovt = false;
/// SupportsTailCall - True if the OS supports tail call. The dynamic linker
/// must be able to synthesize call stubs for interworking between ARM and
/// Thumb.
- bool SupportsTailCall;
+ bool SupportsTailCall = false;
/// HasFP16 - True if subtarget supports half-precision FP conversions
- bool HasFP16;
+ bool HasFP16 = false;
/// HasFullFP16 - True if subtarget supports half-precision FP operations
- bool HasFullFP16;
+ bool HasFullFP16 = false;
/// HasD16 - True if subtarget is limited to 16 double precision
/// FP registers for VFPv3.
- bool HasD16;
+ bool HasD16 = false;
/// HasHardwareDivide - True if subtarget supports [su]div
- bool HasHardwareDivide;
+ bool HasHardwareDivide = false;
/// HasHardwareDivideInARM - True if subtarget supports [su]div in ARM mode
- bool HasHardwareDivideInARM;
+ bool HasHardwareDivideInARM = false;
/// HasT2ExtractPack - True if subtarget supports thumb2 extract/pack
/// instructions.
- bool HasT2ExtractPack;
+ bool HasT2ExtractPack = false;
/// HasDataBarrier - True if the subtarget supports DMB / DSB data barrier
/// instructions.
- bool HasDataBarrier;
+ bool HasDataBarrier = false;
+
+ /// HasV7Clrex - True if the subtarget supports CLREX instructions
+ bool HasV7Clrex = false;
+
+ /// HasAcquireRelease - True if the subtarget supports v8 atomics (LDA/LDAEX etc)
+ /// instructions
+ bool HasAcquireRelease = false;
/// Pref32BitThumb - If true, codegen would prefer 32-bit Thumb instructions
/// over 16-bit ones.
- bool Pref32BitThumb;
+ bool Pref32BitThumb = false;
/// AvoidCPSRPartialUpdate - If true, codegen would avoid using instructions
/// that partially update CPSR and add false dependency on the previous
/// CPSR setting instruction.
- bool AvoidCPSRPartialUpdate;
+ bool AvoidCPSRPartialUpdate = false;
/// AvoidMOVsShifterOperand - If true, codegen should avoid using flag setting
/// movs with shifter operand (i.e. asr, lsl, lsr).
- bool AvoidMOVsShifterOperand;
+ bool AvoidMOVsShifterOperand = false;
- /// HasRAS - Some processors perform return stack prediction. CodeGen should
+ /// HasRetAddrStack - Some processors perform return stack prediction. CodeGen should
/// avoid issue "normal" call instructions to callees which do not return.
- bool HasRAS;
+ bool HasRetAddrStack = false;
/// HasMPExtension - True if the subtarget supports Multiprocessing
/// extension (ARMv7 only).
- bool HasMPExtension;
+ bool HasMPExtension = false;
/// HasVirtualization - True if the subtarget supports the Virtualization
/// extension.
- bool HasVirtualization;
+ bool HasVirtualization = false;
/// FPOnlySP - If true, the floating point unit only supports single
/// precision.
- bool FPOnlySP;
+ bool FPOnlySP = false;
/// If true, the processor supports the Performance Monitor Extensions. These
/// include a generic cycle-counter as well as more fine-grained (often
/// implementation-specific) events.
- bool HasPerfMon;
+ bool HasPerfMon = false;
/// HasTrustZone - if true, processor supports TrustZone security extensions
- bool HasTrustZone;
+ bool HasTrustZone = false;
+
+ /// Has8MSecExt - if true, processor supports ARMv8-M Security Extensions
+ bool Has8MSecExt = false;
/// HasCrypto - if true, processor supports Cryptography extensions
- bool HasCrypto;
+ bool HasCrypto = false;
/// HasCRC - if true, processor supports CRC instructions
- bool HasCRC;
+ bool HasCRC = false;
+
+ /// HasRAS - if true, the processor supports RAS extensions
+ bool HasRAS = false;
/// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are
/// particularly effective at zeroing a VFP register.
- bool HasZeroCycleZeroing;
+ bool HasZeroCycleZeroing = false;
+
+ /// If true, if conversion may decide to leave some instructions unpredicated.
+ bool IsProfitableToUnpredicate = false;
+
+ /// If true, VMOV will be favored over VGETLNi32.
+ bool HasSlowVGETLNi32 = false;
+
+ /// If true, VMOV will be favored over VDUP.
+ bool HasSlowVDUP32 = false;
+
+ /// If true, VMOVSR will be favored over VMOVDRR.
+ bool PreferVMOVSR = false;
+
+ /// If true, ISHST barriers will be used for Release semantics.
+ bool PreferISHST = false;
+
+ /// If true, a VLDM/VSTM starting with an odd register number is considered to
+ /// take more microops than single VLDRS/VSTRS.
+ bool SlowOddRegister = false;
+
+ /// If true, loading into a D subregister will be penalized.
+ bool SlowLoadDSubregister = false;
+
+ /// If true, the AGU and NEON/FPU units are multiplexed.
+ bool HasMuxedUnits = false;
+
+ /// If true, VMOVS will never be widened to VMOVD
+ bool DontWidenVMOVS = false;
+
+ /// If true, run the MLx expansion pass.
+ bool ExpandMLx = false;
+
+ /// If true, VFP/NEON VMLA/VMLS have special RAW hazards.
+ bool HasVMLxHazards = false;
+
+ /// If true, VMOVRS, VMOVSR and VMOVS will be converted from VFP to NEON.
+ bool UseNEONForFPMovs = false;
+
+ /// If true, VLDn instructions take an extra cycle for unaligned accesses.
+ bool CheckVLDnAlign = false;
+
+ /// If true, VFP instructions are not pipelined.
+ bool NonpipelinedVFP = false;
/// StrictAlign - If true, the subtarget disallows unaligned memory
/// accesses for some types. For details, see
/// ARMTargetLowering::allowsMisalignedMemoryAccesses().
- bool StrictAlign;
+ bool StrictAlign = false;
/// RestrictIT - If true, the subtarget disallows generation of deprecated IT
/// blocks to conform to ARMv8 rule.
- bool RestrictIT;
+ bool RestrictIT = false;
/// HasDSP - If true, the subtarget supports the DSP (saturating arith
/// and such) instructions.
- bool HasDSP;
+ bool HasDSP = false;
/// NaCl TRAP instruction is generated instead of the regular TRAP.
- bool UseNaClTrap;
+ bool UseNaClTrap = false;
/// Generate calls via indirect call instructions.
- bool GenLongCalls;
+ bool GenLongCalls = false;
/// Target machine allowed unsafe FP math (such as use of NEON fp)
- bool UnsafeFPMath;
+ bool UnsafeFPMath = false;
/// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS).
- bool UseSjLjEH;
+ bool UseSjLjEH = false;
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
- unsigned stackAlignment;
+ unsigned stackAlignment = 4;
/// CPUString - String name of used CPU.
std::string CPUString;
+ unsigned MaxInterleaveFactor = 1;
+
+ /// Clearance before partial register updates (in number of instructions)
+ unsigned PartialUpdateClearance = 0;
+
+ /// What kind of timing do load multiple/store multiple have (double issue,
+ /// single issue etc).
+ ARMLdStMultipleTiming LdStMultipleTiming = SingleIssue;
+
+ /// The adjustment that we need to apply to get the operand latency from the
+ /// operand cycle returned by the itinerary data for pre-ISel operands.
+ int PreISelOperandLatencyAdjustment = 2;
+
/// IsLittle - The target is Little Endian
bool IsLittle;
@@ -313,17 +401,23 @@ public:
bool hasV8Ops() const { return HasV8Ops; }
bool hasV8_1aOps() const { return HasV8_1aOps; }
bool hasV8_2aOps() const { return HasV8_2aOps; }
+ bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
+ bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
+ /// @{
+ /// These functions are obsolete, please consider adding subtarget features
+ /// or properties instead of calling them.
bool isCortexA5() const { return ARMProcFamily == CortexA5; }
bool isCortexA7() const { return ARMProcFamily == CortexA7; }
bool isCortexA8() const { return ARMProcFamily == CortexA8; }
bool isCortexA9() const { return ARMProcFamily == CortexA9; }
bool isCortexA15() const { return ARMProcFamily == CortexA15; }
bool isSwift() const { return ARMProcFamily == Swift; }
- bool isCortexM3() const { return CPUString == "cortex-m3"; }
+ bool isCortexM3() const { return ARMProcFamily == CortexM3; }
bool isLikeA9() const { return isCortexA9() || isCortexA15() || isKrait(); }
bool isCortexR5() const { return ARMProcFamily == CortexR5; }
bool isKrait() const { return ARMProcFamily == Krait; }
+ /// @}
bool hasARMOps() const { return !NoARM; }
@@ -334,6 +428,7 @@ public:
bool hasNEON() const { return HasNEON; }
bool hasCrypto() const { return HasCrypto; }
bool hasCRC() const { return HasCRC; }
+ bool hasRAS() const { return HasRAS; }
bool hasVirtualization() const { return HasVirtualization; }
bool useNEONForSinglePrecisionFP() const {
return hasNEON() && UseNEONForSinglePrecisionFP;
@@ -343,6 +438,8 @@ public:
bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
bool hasT2ExtractPack() const { return HasT2ExtractPack; }
bool hasDataBarrier() const { return HasDataBarrier; }
+ bool hasV7Clrex() const { return HasV7Clrex; }
+ bool hasAcquireRelease() const { return HasAcquireRelease; }
bool hasAnyDataBarrier() const {
return HasDataBarrier || (hasV6Ops() && !isThumb());
}
@@ -353,11 +450,26 @@ public:
bool isFPOnlySP() const { return FPOnlySP; }
bool hasPerfMon() const { return HasPerfMon; }
bool hasTrustZone() const { return HasTrustZone; }
+ bool has8MSecExt() const { return Has8MSecExt; }
bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
+ bool isProfitableToUnpredicate() const { return IsProfitableToUnpredicate; }
+ bool hasSlowVGETLNi32() const { return HasSlowVGETLNi32; }
+ bool hasSlowVDUP32() const { return HasSlowVDUP32; }
+ bool preferVMOVSR() const { return PreferVMOVSR; }
+ bool preferISHSTBarriers() const { return PreferISHST; }
+ bool expandMLx() const { return ExpandMLx; }
+ bool hasVMLxHazards() const { return HasVMLxHazards; }
+ bool hasSlowOddRegister() const { return SlowOddRegister; }
+ bool hasSlowLoadDSubregister() const { return SlowLoadDSubregister; }
+ bool hasMuxedUnits() const { return HasMuxedUnits; }
+ bool dontWidenVMOVS() const { return DontWidenVMOVS; }
+ bool useNEONForFPMovs() const { return UseNEONForFPMovs; }
+ bool checkVLDnAccessAlignment() const { return CheckVLDnAlign; }
+ bool nonpipelinedVFP() const { return NonpipelinedVFP; }
bool prefers32BitThumb() const { return Pref32BitThumb; }
bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
- bool hasRAS() const { return HasRAS; }
+ bool hasRetAddrStack() const { return HasRetAddrStack; }
bool hasMPExtension() const { return HasMPExtension; }
bool hasDSP() const { return HasDSP; }
bool useNaClTrap() const { return UseNaClTrap; }
@@ -373,6 +485,7 @@ public:
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
bool isTargetIOS() const { return TargetTriple.isiOS(); }
bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); }
+ bool isTargetWatchABI() const { return TargetTriple.isWatchABI(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); }
@@ -399,14 +512,21 @@ public:
TargetTriple.getEnvironment() == Triple::GNUEABIHF) &&
!isTargetDarwin() && !isTargetWindows();
}
+ bool isTargetMuslAEABI() const {
+ return (TargetTriple.getEnvironment() == Triple::MuslEABI ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF) &&
+ !isTargetDarwin() && !isTargetWindows();
+ }
// ARM Targets that support EHABI exception handling standard
// Darwin uses SjLj. Other targets might need more checks.
bool isTargetEHABICompatible() const {
return (TargetTriple.getEnvironment() == Triple::EABI ||
TargetTriple.getEnvironment() == Triple::GNUEABI ||
+ TargetTriple.getEnvironment() == Triple::MuslEABI ||
TargetTriple.getEnvironment() == Triple::EABIHF ||
TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
isTargetAndroid()) &&
!isTargetDarwin() && !isTargetWindows();
}
@@ -414,6 +534,7 @@ public:
bool isTargetHardFloat() const {
// FIXME: this is invalid for WindowsCE
return TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
TargetTriple.getEnvironment() == Triple::EABIHF ||
isTargetWindows() || isAAPCS16_ABI();
}
@@ -436,6 +557,13 @@ public:
return isTargetMachO() ? (ReserveR9 || !HasV6Ops) : ReserveR9;
}
+ /// Returns true if the frame setup is split into two separate pushes (first
+ /// r0-r7,lr then r8-r11), principally so that the frame pointer is adjacent
+ /// to lr.
+ bool splitFramePushPop() const {
+ return isTargetMachO();
+ }
+
bool useStride4VFPs(const MachineFunction &MF) const;
bool useMovt(const MachineFunction &MF) const;
@@ -476,9 +604,20 @@ public:
/// function for this subtarget.
unsigned getStackAlignment() const { return stackAlignment; }
- /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect
- /// symbol.
- bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
+ unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+
+ unsigned getPartialUpdateClearance() const { return PartialUpdateClearance; }
+
+ ARMLdStMultipleTiming getLdStMultipleTiming() const {
+ return LdStMultipleTiming;
+ }
+
+ int getPreISelOperandLatencyAdjustment() const {
+ return PreISelOperandLatencyAdjustment;
+ }
+
+ /// True if the GV will be accessed via an indirect symbol.
+ bool isGVIndirectSymbol(const GlobalValue *GV) const;
/// True if fast-isel is used.
bool useFastISel() const;
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index fca1901dc57c..dc730a675bef 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -16,6 +16,7 @@
#include "ARMTargetObjectFile.h"
#include "ARMTargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -53,6 +54,10 @@ extern "C" void LLVMInitializeARMTarget() {
RegisterTargetMachine<ARMBETargetMachine> Y(TheARMBETarget);
RegisterTargetMachine<ThumbLETargetMachine> A(TheThumbLETarget);
RegisterTargetMachine<ThumbBETargetMachine> B(TheThumbBETarget);
+
+ PassRegistry &Registry = *PassRegistry::getPassRegistry();
+ initializeARMLoadStoreOptPass(Registry);
+ initializeARMPreAllocLoadStoreOptPass(Registry);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -85,7 +90,7 @@ computeTargetABI(const Triple &TT, StringRef CPU,
(TT.getOS() == llvm::Triple::UnknownOS && TT.isOSBinFormatMachO()) ||
CPU.startswith("cortex-m")) {
TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
- } else if (TT.isWatchOS()) {
+ } else if (TT.isWatchABI()) {
TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS16;
} else {
TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
@@ -99,6 +104,8 @@ computeTargetABI(const Triple &TT, StringRef CPU,
case llvm::Triple::Android:
case llvm::Triple::GNUEABI:
case llvm::Triple::GNUEABIHF:
+ case llvm::Triple::MuslEABI:
+ case llvm::Triple::MuslEABIHF:
case llvm::Triple::EABIHF:
case llvm::Triple::EABI:
TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
@@ -171,15 +178,30 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
return Ret;
}
-/// TargetMachine ctor - Create an ARM architecture model.
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+ Optional<Reloc::Model> RM) {
+ if (!RM.hasValue())
+ // Default relocation model on Darwin is PIC.
+ return TT.isOSBinFormatMachO() ? Reloc::PIC_ : Reloc::Static;
+
+ // DynamicNoPIC is only used on darwin.
+ if (*RM == Reloc::DynamicNoPIC && !TT.isOSDarwin())
+ return Reloc::Static;
+
+ return *RM;
+}
+
+/// Create an ARM architecture model.
///
ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL, bool isLittle)
: LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT,
- CPU, FS, Options, RM, CM, OL),
+ CPU, FS, Options, getEffectiveRelocModel(TT, RM), CM,
+ OL),
TargetABI(computeTargetABI(TT, CPU, Options)),
TLOF(createTLOF(getTargetTriple())),
Subtarget(TT, CPU, FS, *this, isLittle), isLittle(isLittle) {
@@ -192,7 +214,8 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
// Default to triple-appropriate EABI
if (Options.EABIVersion == EABI::Default ||
Options.EABIVersion == EABI::Unknown) {
- if (Subtarget.isTargetGNUAEABI())
+ // musl is compatible with glibc with regard to EABI version
+ if (Subtarget.isTargetGNUAEABI() || Subtarget.isTargetMuslAEABI())
this->Options.EABIVersion = EABI::GNU;
else
this->Options.EABIVersion = EABI::EABI5;
@@ -219,7 +242,6 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
// it as a key for the subtarget since that can be the only difference
// between two functions.
bool SoftFloat =
- F.hasFnAttribute("use-soft-float") &&
F.getFnAttribute("use-soft-float").getValueAsString() == "true";
// If the soft float attribute is set on the function turn on the soft float
// subtarget feature.
@@ -248,8 +270,9 @@ void ARMTargetMachine::anchor() {}
ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL, bool isLittle)
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL,
+ bool isLittle)
: ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
initAsmInfo();
if (!Subtarget.hasARMOps())
@@ -262,7 +285,8 @@ void ARMLETargetMachine::anchor() {}
ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
@@ -271,7 +295,8 @@ void ARMBETargetMachine::anchor() {}
ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
@@ -280,7 +305,8 @@ void ThumbTargetMachine::anchor() {}
ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL, bool isLittle)
: ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
initAsmInfo();
@@ -291,7 +317,8 @@ void ThumbLETargetMachine::anchor() {}
ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
@@ -300,7 +327,8 @@ void ThumbBETargetMachine::anchor() {}
ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index 8ad1f3dc2c34..c6b70b953162 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -39,7 +39,7 @@ protected:
public:
ARMBaseTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL, bool isLittle);
~ARMBaseTargetMachine() override;
@@ -58,39 +58,40 @@ public:
}
};
-/// ARMTargetMachine - ARM target machine.
+/// ARM target machine.
///
class ARMTargetMachine : public ARMBaseTargetMachine {
virtual void anchor();
public:
ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options, Reloc::Model RM,
- CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle);
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool isLittle);
};
-/// ARMLETargetMachine - ARM little endian target machine.
+/// ARM little endian target machine.
///
class ARMLETargetMachine : public ARMTargetMachine {
void anchor() override;
public:
ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
-/// ARMBETargetMachine - ARM big endian target machine.
+/// ARM big endian target machine.
///
class ARMBETargetMachine : public ARMTargetMachine {
void anchor() override;
public:
ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
-/// ThumbTargetMachine - Thumb target machine.
+/// Thumb target machine.
/// Due to the way architectures are handled, this represents both
/// Thumb-1 and Thumb-2.
///
@@ -99,29 +100,29 @@ class ThumbTargetMachine : public ARMBaseTargetMachine {
public:
ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL,
- bool isLittle);
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool isLittle);
};
-/// ThumbLETargetMachine - Thumb little endian target machine.
+/// Thumb little endian target machine.
///
class ThumbLETargetMachine : public ThumbTargetMachine {
void anchor() override;
public:
ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
-/// ThumbBETargetMachine - Thumb big endian target machine.
+/// Thumb big endian target machine.
///
class ThumbBETargetMachine : public ThumbTargetMachine {
void anchor() override;
public:
ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h
index 98e8763c4705..b1db201cb30d 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/lib/Target/ARM/ARMTargetObjectFile.h
@@ -21,10 +21,10 @@ class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
protected:
const MCSection *AttributesSection;
public:
- ARMElfTargetObjectFile() :
- TargetLoweringObjectFileELF(),
- AttributesSection(nullptr)
- {}
+ ARMElfTargetObjectFile()
+ : TargetLoweringObjectFileELF(), AttributesSection(nullptr) {
+ PLTRelativeVariantKind = MCSymbolRefExpr::VK_ARM_PREL31;
+ }
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index c1520119ef21..13c5dc61acd9 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -18,12 +18,12 @@ using namespace llvm;
int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
- unsigned Bits = Ty->getPrimitiveSizeInBits();
- if (Bits == 0 || Bits > 32)
- return 4;
+ unsigned Bits = Ty->getPrimitiveSizeInBits();
+ if (Bits == 0 || Imm.getActiveBits() >= 64)
+ return 4;
- int32_t SImmVal = Imm.getSExtValue();
- uint32_t ZImmVal = Imm.getZExtValue();
+ int64_t SImmVal = Imm.getSExtValue();
+ uint64_t ZImmVal = Imm.getZExtValue();
if (!ST->isThumb()) {
if ((SImmVal >= 0 && SImmVal < 65536) ||
(ARM_AM::getSOImmVal(ZImmVal) != -1) ||
@@ -47,6 +47,32 @@ int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
return 3;
}
+
+// Constants smaller than 256 fit in the immediate field of
+// Thumb1 instructions so we return a zero cost and 1 otherwise.
+int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty) {
+ if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
+ return 0;
+
+ return 1;
+}
+
+int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty) {
+ // Division by a constant can be turned into multiplication, but only if we
+ // know it's constant. So it's not so much that the immediate is cheap (it's
+ // not), but that the alternative is worse.
+ // FIXME: this is probably unneeded with GlobalISel.
+ if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
+ Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
+ Idx == 1)
+ return 0;
+
+ return getIntImmCost(Imm, Ty);
+}
+
+
int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -244,10 +270,8 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
unsigned Index) {
// Penalize inserting into an D-subregister. We end up with a three times
// lower estimated throughput on swift.
- if (ST->isSwift() &&
- Opcode == Instruction::InsertElement &&
- ValTy->isVectorTy() &&
- ValTy->getScalarSizeInBits() <= 32)
+ if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
+ ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
return 3;
if ((Opcode == Instruction::InsertElement ||
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 7d8d2381c983..a0ca9e648002 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -54,12 +54,24 @@ public:
bool enableInterleavedAccessVectorization() { return true; }
+ /// Floating-point computation using ARMv8 AArch32 Advanced
+ /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
+ /// is IEEE-754 compliant, but it's not covered in this target.
+ bool isFPVectorizationPotentiallyUnsafe() {
+ return !ST->isTargetDarwin();
+ }
+
/// \name Scalar TTI Implementations
/// @{
+ int getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty);
+
using BaseT::getIntImmCost;
int getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+
/// @}
/// \name Vector TTI Implementations
@@ -88,10 +100,7 @@ public:
}
unsigned getMaxInterleaveFactor(unsigned VF) {
- // These are out of order CPUs:
- if (ST->isCortexA15() || ST->isSwift())
- return 2;
- return 1;
+ return ST->getMaxInterleaveFactor();
}
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index c69a741244cf..7d49302f9a96 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -20,7 +20,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -31,20 +31,20 @@
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCAsmParserUtils.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/ARMBuildAttributes.h"
#include "llvm/Support/ARMEHABI.h"
-#include "llvm/Support/TargetParser.h"
#include "llvm/Support/COFF.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -257,9 +257,15 @@ class ARMAsmParser : public MCTargetAsmParser {
bool hasThumb() const {
return getSTI().getFeatureBits()[ARM::HasV4TOps];
}
+ bool hasThumb2() const {
+ return getSTI().getFeatureBits()[ARM::FeatureThumb2];
+ }
bool hasV6Ops() const {
return getSTI().getFeatureBits()[ARM::HasV6Ops];
}
+ bool hasV6T2Ops() const {
+ return getSTI().getFeatureBits()[ARM::HasV6T2Ops];
+ }
bool hasV6MOps() const {
return getSTI().getFeatureBits()[ARM::HasV6MOps];
}
@@ -269,6 +275,15 @@ class ARMAsmParser : public MCTargetAsmParser {
bool hasV8Ops() const {
return getSTI().getFeatureBits()[ARM::HasV8Ops];
}
+ bool hasV8MBaseline() const {
+ return getSTI().getFeatureBits()[ARM::HasV8MBaselineOps];
+ }
+ bool hasV8MMainline() const {
+ return getSTI().getFeatureBits()[ARM::HasV8MMainlineOps];
+ }
+ bool has8MSecExt() const {
+ return getSTI().getFeatureBits()[ARM::Feature8MSecExt];
+ }
bool hasARM() const {
return !getSTI().getFeatureBits()[ARM::FeatureNoARM];
}
@@ -281,12 +296,16 @@ class ARMAsmParser : public MCTargetAsmParser {
bool hasV8_1aOps() const {
return getSTI().getFeatureBits()[ARM::HasV8_1aOps];
}
+ bool hasRAS() const {
+ return getSTI().getFeatureBits()[ARM::FeatureRAS];
+ }
void SwitchMode() {
MCSubtargetInfo &STI = copySTI();
uint64_t FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
setAvailableFeatures(FB);
}
+ void FixModeAfterArchChange(bool WasThumb, SMLoc Loc);
bool isMClass() const {
return getSTI().getFeatureBits()[ARM::FeatureMClass];
}
@@ -417,8 +436,9 @@ class ARMOperand : public MCParsedAsmOperand {
k_ShifterImmediate,
k_RotateImmediate,
k_ModifiedImmediate,
+ k_ConstantPoolImmediate,
k_BitfieldDescriptor,
- k_Token
+ k_Token,
} Kind;
SMLoc StartLoc, EndLoc, AlignmentLoc;
@@ -611,6 +631,11 @@ public:
return Imm.Val;
}
+ const MCExpr *getConstantPoolImm() const {
+ assert(isConstantPoolImm() && "Invalid access!");
+ return Imm.Val;
+ }
+
unsigned getVectorIndex() const {
assert(Kind == k_VectorIndex && "Invalid access!");
return VectorIndex.Val;
@@ -648,7 +673,27 @@ public:
bool isCCOut() const { return Kind == k_CCOut; }
bool isITMask() const { return Kind == k_ITCondMask; }
bool isITCondCode() const { return Kind == k_CondCode; }
- bool isImm() const override { return Kind == k_Immediate; }
+ bool isImm() const override {
+ return Kind == k_Immediate;
+ }
+
+ bool isARMBranchTarget() const {
+ if (!isImm()) return false;
+
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()))
+ return CE->getValue() % 4 == 0;
+ return true;
+ }
+
+
+ bool isThumbBranchTarget() const {
+ if (!isImm()) return false;
+
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()))
+ return CE->getValue() % 2 == 0;
+ return true;
+ }
+
// checks whether this operand is an unsigned offset which fits is a field
// of specified width and scaled by a specific number of bits
template<unsigned width, unsigned scale>
@@ -1036,6 +1081,7 @@ public:
return ARM_AM::getSOImmVal(Value) == -1 &&
ARM_AM::getSOImmVal(-Value) != -1;
}
+ bool isConstantPoolImm() const { return Kind == k_ConstantPoolImmediate; }
bool isBitfield() const { return Kind == k_BitfieldDescriptor; }
bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; }
bool isPostIdxReg() const {
@@ -1183,6 +1229,20 @@ public:
return (Val >= -1020 && Val <= 1020 && ((Val & 3) == 0)) ||
Val == INT32_MIN;
}
+ bool isAddrMode5FP16() const {
+ // If we have an immediate that's not a constant, treat it as a label
+ // reference needing a fixup. If it is a constant, it's something else
+ // and we reject it.
+ if (isImm() && !isa<MCConstantExpr>(getImm()))
+ return true;
+ if (!isMem() || Memory.Alignment != 0) return false;
+ // Check for register offset.
+ if (Memory.OffsetRegNum) return false;
+ // Immediate offset in range [-510, 510] and a multiple of 2.
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return (Val >= -510 && Val <= 510 && ((Val & 1) == 0)) || Val == INT32_MIN;
+ }
bool isMemTBB() const {
if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
@@ -1203,7 +1263,7 @@ public:
}
bool isT2MemRegOffset() const {
if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
- Memory.Alignment != 0)
+ Memory.Alignment != 0 || Memory.BaseRegNum == ARM::PC)
return false;
// Only lsl #{0, 1, 2, 3} allowed.
if (Memory.ShiftType == ARM_AM::no_shift)
@@ -1319,6 +1379,7 @@ public:
// If we have an immediate that's not a constant, treat it as a label
// reference needing a fixup. If it is a constant, it's something else
// and we reject it.
+
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
@@ -1329,6 +1390,11 @@ public:
int64_t Val = Memory.OffsetImm->getValue();
return (Val > -4096 && Val < 4096) || (Val == INT32_MIN);
}
+ bool isConstPoolAsmImm() const {
+ // Delay processing of the constant-pool immediate; this will turn into
+ // a constant. Match no other operand.
+ return (isConstantPoolImm());
+ }
bool isPostIdxImm8() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -1665,7 +1731,7 @@ public:
if (!CE) return false;
uint64_t Value = CE->getValue();
// i64 value with each byte being either 0 or 0xff.
- for (unsigned i = 0; i < 8; ++i)
+ for (unsigned i = 0; i < 8; ++i, Value >>= 8)
if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff) return false;
return true;
}
@@ -1680,6 +1746,16 @@ public:
Inst.addOperand(MCOperand::createExpr(Expr));
}
+ void addARMBranchTargetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addThumbBranchTargetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
void addCondCodeOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createImm(unsigned(getCondCode())));
@@ -1941,6 +2017,7 @@ public:
}
const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
+
assert(SR && "Unknown value type!");
Inst.addOperand(MCOperand::createExpr(SR));
return;
@@ -2145,6 +2222,28 @@ public:
Inst.addOperand(MCOperand::createImm(Val));
}
+ void addAddrMode5FP16Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ // If we have an immediate that's not a constant, treat it as a label
+ // reference needing a fixup. If it is a constant, it's something else
+ // and we reject it.
+ if (isImm()) {
+ Inst.addOperand(MCOperand::createExpr(getImm()));
+ Inst.addOperand(MCOperand::createImm(0));
+ return;
+ }
+
+ // The lower bit is always zero and as such is not encoded.
+ int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 2 : 0;
+ ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+ // Special case for #-0
+ if (Val == INT32_MIN) Val = 0;
+ if (Val < 0) Val = -Val;
+ Val = ARM_AM::getAM5FP16Opc(AddSub, Val);
+ Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::createImm(Val));
+ }
+
void addMemImm8s4OffsetOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
// If we have an immediate that's not a constant, treat it as a label
@@ -2214,6 +2313,14 @@ public:
Inst.addOperand(MCOperand::createImm(Val));
}
+ void addConstPoolAsmImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // This is a container for the immediate that we will create the constant
+ // pool from.
+ addExpr(Inst, getConstantPoolImm());
+ return;
+ }
+
void addMemTBBOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
@@ -2594,6 +2701,15 @@ public:
}
static std::unique_ptr<ARMOperand>
+ CreateConstantPoolImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+ auto Op = make_unique<ARMOperand>(k_ConstantPoolImmediate);
+ Op->Imm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static std::unique_ptr<ARMOperand>
CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) {
auto Op = make_unique<ARMOperand>(k_BitfieldDescriptor);
Op->Bitfield.LSB = LSB;
@@ -2850,6 +2966,9 @@ void ARMOperand::print(raw_ostream &OS) const {
OS << "<mod_imm #" << ModImm.Bits << ", #"
<< ModImm.Rot << ")>";
break;
+ case k_ConstantPoolImmediate:
+ OS << "<constant_pool_imm #" << *getConstantPoolImm();
+ break;
case k_BitfieldDescriptor:
OS << "<bitfield " << "lsb: " << Bitfield.LSB
<< ", width: " << Bitfield.Width << ">";
@@ -3969,6 +4088,18 @@ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
.Case("basepri_max", 0x812)
.Case("faultmask", 0x813)
.Case("control", 0x814)
+ .Case("msplim", 0x80a)
+ .Case("psplim", 0x80b)
+ .Case("msp_ns", 0x888)
+ .Case("psp_ns", 0x889)
+ .Case("msplim_ns", 0x88a)
+ .Case("psplim_ns", 0x88b)
+ .Case("primask_ns", 0x890)
+ .Case("basepri_ns", 0x891)
+ .Case("basepri_max_ns", 0x892)
+ .Case("faultmask_ns", 0x893)
+ .Case("control_ns", 0x894)
+ .Case("sp_ns", 0x898)
.Default(~0U);
if (FlagsVal == ~0U)
@@ -3983,6 +4114,14 @@ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
// basepri, basepri_max and faultmask only valid for V7m.
return MatchOperand_NoMatch;
+ if (!has8MSecExt() && (FlagsVal == 0x80a || FlagsVal == 0x80b ||
+ (FlagsVal > 0x814 && FlagsVal < 0xc00)))
+ return MatchOperand_NoMatch;
+
+ if (!hasV8MMainline() && (FlagsVal == 0x88a || FlagsVal == 0x88b ||
+ (FlagsVal > 0x890 && FlagsVal <= 0x893)))
+ return MatchOperand_NoMatch;
+
Parser.Lex(); // Eat identifier token.
Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S));
return MatchOperand_Success;
@@ -4673,14 +4812,14 @@ void ARMAsmParser::cvtThumbBranches(MCInst &Inst,
// classify tB as either t2B or t1B based on range of immediate operand
case ARM::tB: {
ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
- if (!op.isSignedOffset<11, 1>() && isThumbTwo())
+ if (!op.isSignedOffset<11, 1>() && isThumb() && hasV8MBaseline())
Inst.setOpcode(ARM::t2B);
break;
}
// classify tBcc as either t2Bcc or t1Bcc based on range of immediate operand
case ARM::tBcc: {
ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
- if (!op.isSignedOffset<8, 1>() && isThumbTwo())
+ if (!op.isSignedOffset<8, 1>() && isThumb() && hasV8MBaseline())
Inst.setOpcode(ARM::t2Bcc);
break;
}
@@ -4973,7 +5112,8 @@ ARMAsmParser::parseFPImm(OperandVector &Operands) {
// vmov.i{8|16|32|64} <dreg|qreg>, #imm
ARMOperand &TyOp = static_cast<ARMOperand &>(*Operands[2]);
bool isVmovf = TyOp.isToken() &&
- (TyOp.getToken() == ".f32" || TyOp.getToken() == ".f64");
+ (TyOp.getToken() == ".f32" || TyOp.getToken() == ".f64" ||
+ TyOp.getToken() == ".f16");
ARMOperand &Mnemonic = static_cast<ARMOperand &>(*Operands[0]);
bool isFconst = Mnemonic.isToken() && (Mnemonic.getToken() == "fconstd" ||
Mnemonic.getToken() == "fconsts");
@@ -5144,16 +5284,12 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
S = Parser.getTok().getLoc();
if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
return Error(S, "unexpected token in operand");
-
Parser.Lex(); // Eat '='
const MCExpr *SubExprVal;
if (getParser().parseExpression(SubExprVal))
return true;
E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
-
- const MCExpr *CPLoc =
- getTargetStreamer().addConstantPoolEntry(SubExprVal, S);
- Operands.push_back(ARMOperand::CreateImm(CPLoc, S, E));
+ Operands.push_back(ARMOperand::CreateConstantPoolImm(SubExprVal, S, E));
return false;
}
}
@@ -5265,7 +5401,8 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
Mnemonic == "vcvta" || Mnemonic == "vcvtn" || Mnemonic == "vcvtp" ||
Mnemonic == "vcvtm" || Mnemonic == "vrinta" || Mnemonic == "vrintn" ||
Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic == "hvc" ||
- Mnemonic.startswith("vsel"))
+ Mnemonic.startswith("vsel") || Mnemonic == "vins" || Mnemonic == "vmovx" ||
+ Mnemonic == "bxns" || Mnemonic == "blxns")
return Mnemonic;
// First, split out any predication code. Ignore mnemonics we know aren't
@@ -5311,6 +5448,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||
Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" ||
Mnemonic == "vfms" || Mnemonic == "vfnms" || Mnemonic == "fconsts" ||
+ Mnemonic == "bxns" || Mnemonic == "blxns" ||
(Mnemonic == "movs" && isThumb()))) {
Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
CarrySetting = true;
@@ -5369,7 +5507,8 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
Mnemonic == "vrintn" || Mnemonic == "vrintp" || Mnemonic == "vrintm" ||
Mnemonic.startswith("aes") || Mnemonic == "hvc" || Mnemonic == "setpan" ||
Mnemonic.startswith("sha1") || Mnemonic.startswith("sha256") ||
- (FullInst.startswith("vmull") && FullInst.endswith(".p64"))) {
+ (FullInst.startswith("vmull") && FullInst.endswith(".p64")) ||
+ Mnemonic == "vmovx" || Mnemonic == "vins") {
// These mnemonics are never predicable
CanAcceptPredicationCode = false;
} else if (!isThumb()) {
@@ -6405,6 +6544,20 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
"immediate expression for mov requires :lower16: or :upper16");
break;
}
+ case ARM::HINT:
+ case ARM::t2HINT: {
+ if (hasRAS()) {
+ // ESB is not predicable (pred must be AL)
+ unsigned Imm8 = Inst.getOperand(0).getImm();
+ unsigned Pred = Inst.getOperand(1).getImm();
+ if (Imm8 == 0x10 && Pred != ARMCC::AL)
+ return Error(Operands[1]->getStartLoc(), "instruction 'esb' is not "
+ "predicable, but condition "
+ "code specified");
+ }
+ // Without the RAS extension, this behaves as any other unallocated hint.
+ break;
+ }
}
return false;
@@ -6766,6 +6919,90 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
case ARM::t2LDRSHpcrel:
Inst.setOpcode(ARM::t2LDRSHpci);
return true;
+ case ARM::LDRConstPool:
+ case ARM::tLDRConstPool:
+ case ARM::t2LDRConstPool: {
+ // Pseudo instruction ldr rt, =immediate is converted to a
+ // MOV rt, immediate if immediate is known and representable
+ // otherwise we create a constant pool entry that we load from.
+ MCInst TmpInst;
+ if (Inst.getOpcode() == ARM::LDRConstPool)
+ TmpInst.setOpcode(ARM::LDRi12);
+ else if (Inst.getOpcode() == ARM::tLDRConstPool)
+ TmpInst.setOpcode(ARM::tLDRpci);
+ else if (Inst.getOpcode() == ARM::t2LDRConstPool)
+ TmpInst.setOpcode(ARM::t2LDRpci);
+ const ARMOperand &PoolOperand =
+ static_cast<ARMOperand &>(*Operands[3]);
+ const MCExpr *SubExprVal = PoolOperand.getConstantPoolImm();
+ // If SubExprVal is a constant we may be able to use a MOV
+ if (isa<MCConstantExpr>(SubExprVal) &&
+ Inst.getOperand(0).getReg() != ARM::PC &&
+ Inst.getOperand(0).getReg() != ARM::SP) {
+ int64_t Value =
+ (int64_t) (cast<MCConstantExpr>(SubExprVal))->getValue();
+ bool UseMov = true;
+ bool MovHasS = true;
+ if (Inst.getOpcode() == ARM::LDRConstPool) {
+ // ARM Constant
+ if (ARM_AM::getSOImmVal(Value) != -1) {
+ Value = ARM_AM::getSOImmVal(Value);
+ TmpInst.setOpcode(ARM::MOVi);
+ }
+ else if (ARM_AM::getSOImmVal(~Value) != -1) {
+ Value = ARM_AM::getSOImmVal(~Value);
+ TmpInst.setOpcode(ARM::MVNi);
+ }
+ else if (hasV6T2Ops() &&
+ Value >=0 && Value < 65536) {
+ TmpInst.setOpcode(ARM::MOVi16);
+ MovHasS = false;
+ }
+ else
+ UseMov = false;
+ }
+ else {
+ // Thumb/Thumb2 Constant
+ if (hasThumb2() &&
+ ARM_AM::getT2SOImmVal(Value) != -1)
+ TmpInst.setOpcode(ARM::t2MOVi);
+ else if (hasThumb2() &&
+ ARM_AM::getT2SOImmVal(~Value) != -1) {
+ TmpInst.setOpcode(ARM::t2MVNi);
+ Value = ~Value;
+ }
+ else if (hasV8MBaseline() &&
+ Value >=0 && Value < 65536) {
+ TmpInst.setOpcode(ARM::t2MOVi16);
+ MovHasS = false;
+ }
+ else
+ UseMov = false;
+ }
+ if (UseMov) {
+ TmpInst.addOperand(Inst.getOperand(0)); // Rt
+ TmpInst.addOperand(MCOperand::createImm(Value)); // Immediate
+ TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ if (MovHasS)
+ TmpInst.addOperand(MCOperand::createReg(0)); // S
+ Inst = TmpInst;
+ return true;
+ }
+ }
+ // No opportunity to use MOV/MVN create constant pool
+ const MCExpr *CPLoc =
+ getTargetStreamer().addConstantPoolEntry(SubExprVal,
+ PoolOperand.getStartLoc());
+ TmpInst.addOperand(Inst.getOperand(0)); // Rt
+ TmpInst.addOperand(MCOperand::createExpr(CPLoc)); // offset to constpool
+ if (TmpInst.getOpcode() == ARM::LDRi12)
+ TmpInst.addOperand(MCOperand::createImm(0)); // unused offset
+ TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ Inst = TmpInst;
+ return true;
+ }
// Handle NEON VST complex aliases.
case ARM::VST1LNdWB_register_Asm_8:
case ARM::VST1LNdWB_register_Asm_16:
@@ -9031,6 +9268,31 @@ bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) {
return false;
}
+// After changing arch/CPU, try to put the ARM/Thumb mode back to what it was
+// before, if supported by the new target, or emit mapping symbols for the mode
+// switch.
+void ARMAsmParser::FixModeAfterArchChange(bool WasThumb, SMLoc Loc) {
+ if (WasThumb != isThumb()) {
+ if (WasThumb && hasThumb()) {
+ // Stay in Thumb mode
+ SwitchMode();
+ } else if (!WasThumb && hasARM()) {
+ // Stay in ARM mode
+ SwitchMode();
+ } else {
+ // Mode switch forced, because the new arch doesn't support the old mode.
+ getParser().getStreamer().EmitAssemblerFlag(isThumb() ? MCAF_Code16
+ : MCAF_Code32);
+      // Warn about the implicit mode switch. GAS does not switch modes here,
+ // but instead stays in the old mode, reporting an error on any following
+ // instructions as the mode does not exist on the target.
+ Warning(Loc, Twine("new target does not support ") +
+ (WasThumb ? "thumb" : "arm") + " mode, switching to " +
+ (!WasThumb ? "thumb" : "arm") + " mode");
+ }
+ }
+}
+
/// parseDirectiveArch
/// ::= .arch token
bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
@@ -9043,10 +9305,12 @@ bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
return false;
}
+ bool WasThumb = isThumb();
Triple T;
MCSubtargetInfo &STI = copySTI();
STI.setDefaultFeatures("", ("+" + ARM::getArchName(ID)).str());
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ FixModeAfterArchChange(WasThumb, L);
getTargetStreamer().emitArch(ID);
return false;
@@ -9177,9 +9441,11 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
return false;
}
+ bool WasThumb = isThumb();
MCSubtargetInfo &STI = copySTI();
STI.setDefaultFeatures(CPU, "");
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ FixModeAfterArchChange(WasThumb, L);
return false;
}
@@ -9834,7 +10100,7 @@ bool ARMAsmParser::parseDirectiveObjectArch(SMLoc L) {
StringRef Arch = Parser.getTok().getString();
SMLoc ArchLoc = Parser.getTok().getLoc();
- getLexer().Lex();
+ Lex();
unsigned ID = ARM::parseArch(Arch);
@@ -9863,7 +10129,9 @@ bool ARMAsmParser::parseDirectiveAlign(SMLoc L) {
return true;
// '.align' is target specifically handled to mean 2**2 byte alignment.
- if (getStreamer().getCurrentSection().first->UseCodeAlign())
+ const MCSection *Section = getStreamer().getCurrentSection().first;
+ assert(Section && "must have section to emit alignment");
+ if (Section->UseCodeAlign())
getStreamer().EmitCodeAlignment(4, 0);
else
getStreamer().EmitValueToAlignment(4, 0, 1, 0);
@@ -9933,6 +10201,7 @@ static const struct {
// FIXME: Only available in A-class, isel not predicated
{ ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} },
{ ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} },
+ { ARM::AEK_RAS, Feature_HasV8, {ARM::FeatureRAS} },
// FIXME: Unsupported extensions.
{ ARM::AEK_OS, Feature_None, {} },
{ ARM::AEK_IWMMXT, Feature_None, {} },
@@ -9954,7 +10223,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
StringRef Name = Parser.getTok().getString();
SMLoc ExtLoc = Parser.getTok().getLoc();
- getLexer().Lex();
+ Lex();
bool EnableFeature = true;
if (Name.startswith_lower("no")) {
diff --git a/lib/Target/ARM/AsmParser/Makefile b/lib/Target/ARM/AsmParser/Makefile
deleted file mode 100644
index 841516fffbd5..000000000000
--- a/lib/Target/ARM/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMAsmParser
-
-# Hack: we need to include 'main' ARM target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index e63defed2288..3196a57ccc3e 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MCTargetDesc/ARMMCExpr.h"
@@ -210,6 +210,8 @@ static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn,
@@ -222,6 +224,8 @@ static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode5FP16Operand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
@@ -391,8 +395,8 @@ static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, unsigned Val,
static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val,
- uint64_t Address, const void *Decoder);
+static DecodeStatus DecoderForMRRC2AndMCRR2(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
#include "ARMGenDisassemblerTables.inc"
static MCDisassembler *createARMDisassembler(const Target &T,
@@ -590,6 +594,8 @@ MCDisassembler::DecodeStatus
ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
MCDisassembler::DecodeStatus S = Success;
+ const FeatureBitset &FeatureBits = getSubtargetInfo().getFeatureBits();
+
// A few instructions actually have predicates encoded in them. Don't
// try to overwrite it if we're seeing one of those.
switch (MI.getOpcode()) {
@@ -610,6 +616,10 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
else
return Success;
break;
+ case ARM::t2HINT:
+ if (MI.getOperand(0).getImm() == 0x10 && (FeatureBits[ARM::FeatureRAS]) != 0)
+ S = SoftFail;
+ break;
case ARM::tB:
case ARM::t2B:
case ARM::t2TBB:
@@ -1941,6 +1951,29 @@ static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst,
return S;
}
+// Check for UNPREDICTABLE predicated ESB instruction
+static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned pred = fieldFromInstruction(Insn, 28, 4);
+ unsigned imm8 = fieldFromInstruction(Insn, 0, 8);
+ const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+ const FeatureBitset &FeatureBits = Dis->getSubtargetInfo().getFeatureBits();
+
+ DecodeStatus S = MCDisassembler::Success;
+
+ Inst.addOperand(MCOperand::createImm(imm8));
+
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // ESB is unpredictable if pred != AL. Without the RAS extension, it is a NOP,
+ // so all predicates should be allowed.
+ if (imm8 == 0x10 && pred != 0xe && ((FeatureBits[ARM::FeatureRAS]) != 0))
+ S = MCDisassembler::SoftFail;
+
+ return S;
+}
+
static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
unsigned imod = fieldFromInstruction(Insn, 18, 2);
@@ -2183,6 +2216,7 @@ static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
DecodeStatus S = MCDisassembler::Success;
unsigned Rn = fieldFromInstruction(Val, 9, 4);
+ // U == 1 to add imm, 0 to subtract it.
unsigned U = fieldFromInstruction(Val, 8, 1);
unsigned imm = fieldFromInstruction(Val, 0, 8);
@@ -2197,6 +2231,26 @@ static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
return S;
}
+static DecodeStatus DecodeAddrMode5FP16Operand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction(Val, 9, 4);
+ // U == 1 to add imm, 0 to subtract it.
+ unsigned U = fieldFromInstruction(Val, 8, 1);
+ unsigned imm = fieldFromInstruction(Val, 0, 8);
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ if (U)
+ Inst.addOperand(MCOperand::createImm(ARM_AM::getAM5FP16Opc(ARM_AM::add, imm)));
+ else
+ Inst.addOperand(MCOperand::createImm(ARM_AM::getAM5FP16Opc(ARM_AM::sub, imm)));
+
+ return S;
+}
+
static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
return DecodeGPRRegisterClass(Inst, Val, Address, Decoder);
@@ -4096,6 +4150,24 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val,
// Values basepri, basepri_max and faultmask are only valid for v7m.
return MCDisassembler::Fail;
break;
+ case 0x8a: // msplim_ns
+ case 0x8b: // psplim_ns
+ case 0x91: // basepri_ns
+ case 0x92: // basepri_max_ns
+ case 0x93: // faultmask_ns
+ if (!(FeatureBits[ARM::HasV8MMainlineOps]))
+ return MCDisassembler::Fail;
+ // fall through
+ case 10: // msplim
+ case 11: // psplim
+ case 0x88: // msp_ns
+ case 0x89: // psp_ns
+ case 0x90: // primask_ns
+ case 0x94: // control_ns
+ case 0x98: // sp_ns
+ if (!(FeatureBits[ARM::Feature8MSecExt]))
+ return MCDisassembler::Fail;
+ break;
default:
return MCDisassembler::Fail;
}
@@ -5193,8 +5265,8 @@ static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
return S;
}
-static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val,
- uint64_t Address, const void *Decoder) {
+static DecodeStatus DecoderForMRRC2AndMCRR2(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -5210,12 +5282,30 @@ static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val,
if (Rt == Rt2)
S = MCDisassembler::SoftFail;
+ // We have to check if the instruction is MRRC2
+ // or MCRR2 when constructing the operands for
+ // Inst. Reason is because MRRC2 stores to two
+  // registers so its tablegen desc has two
+  // outputs whereas MCRR2 doesn't store to any
+  // registers so all of its operands are listed
+ // as inputs, therefore the operand order for
+ // MRRC2 needs to be [Rt, Rt2, cop, opc1, CRm]
+ // and MCRR2 operand order is [cop, opc1, Rt, Rt2, CRm]
+
+ if (Inst.getOpcode() == ARM::MRRC2) {
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
Inst.addOperand(MCOperand::createImm(cop));
Inst.addOperand(MCOperand::createImm(opc1));
- if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
- return MCDisassembler::Fail;
- if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
- return MCDisassembler::Fail;
+ if (Inst.getOpcode() == ARM::MCRR2) {
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
Inst.addOperand(MCOperand::createImm(CRm));
return S;
diff --git a/lib/Target/ARM/Disassembler/Makefile b/lib/Target/ARM/Disassembler/Makefile
deleted file mode 100644
index 031b6aca5a48..000000000000
--- a/lib/Target/ARM/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/ARM/Disassembler/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMDisassembler
-
-# Hack: we need to include 'main' arm target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 33fc85af9b19..e81bb77dbdfc 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -25,6 +25,7 @@ using namespace llvm;
#define DEBUG_TYPE "asm-printer"
+#define PRINT_ALIAS_INSTR
#include "ARMGenAsmWriter.inc"
/// translateShiftImm - Convert shift immediate from 0-31 to 1-32 for printing.
@@ -73,43 +74,6 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
switch (Opcode) {
- // Check for HINT instructions w/ canonical names.
- case ARM::HINT:
- case ARM::tHINT:
- case ARM::t2HINT:
- switch (MI->getOperand(0).getImm()) {
- case 0:
- O << "\tnop";
- break;
- case 1:
- O << "\tyield";
- break;
- case 2:
- O << "\twfe";
- break;
- case 3:
- O << "\twfi";
- break;
- case 4:
- O << "\tsev";
- break;
- case 5:
- if (STI.getFeatureBits()[ARM::HasV8Ops]) {
- O << "\tsevl";
- break;
- } // Fallthrough for non-v8
- default:
- // Anything else should just print normally.
- printInstruction(MI, STI, O);
- printAnnotation(O, Annot);
- return;
- }
- printPredicateOperand(MI, 1, STI, O);
- if (Opcode == ARM::t2HINT)
- O << ".w";
- printAnnotation(O, Annot);
- return;
-
// Check for MOVs and print canonical forms, instead.
case ARM::MOVsr: {
// FIXME: Thumb variants?
@@ -297,23 +261,11 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
}
break;
}
- // B9.3.3 ERET (Thumb)
- // For a target that has Virtualization Extensions, ERET is the preferred
- // disassembly of SUBS PC, LR, #0
- case ARM::t2SUBS_PC_LR: {
- if (MI->getNumOperands() == 3 && MI->getOperand(0).isImm() &&
- MI->getOperand(0).getImm() == 0 &&
- STI.getFeatureBits()[ARM::FeatureVirtualization]) {
- O << "\teret";
- printPredicateOperand(MI, 1, STI, O);
- printAnnotation(O, Annot);
- return;
- }
- break;
- }
}
- printInstruction(MI, STI, O);
+ if (!printAliasInstr(MI, STI, O))
+ printInstruction(MI, STI, O);
+
printAnnotation(O, Annot);
}
@@ -645,6 +597,34 @@ void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
O << "]" << markup(">");
}
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrMode5FP16Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &MO1 = MI->getOperand(OpNum);
+ const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+ if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right.
+ printOperand(MI, OpNum, STI, O);
+ return;
+ }
+
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
+
+ unsigned ImmOffs = ARM_AM::getAM5FP16Offset(MO2.getImm());
+ unsigned Op = ARM_AM::getAM5FP16Op(MO2.getImm());
+ if (AlwaysPrintImm0 || ImmOffs || Op == ARM_AM::sub) {
+ O << ", "
+ << markup("<imm:")
+ << "#"
+ << ARM_AM::getAddrOpcStr(ARM_AM::getAM5FP16Op(MO2.getImm()))
+ << ImmOffs * 2
+ << markup(">");
+ }
+ O << "]" << markup(">");
+}
+
void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -901,6 +881,42 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
case 20:
O << "control";
return;
+ case 10:
+ O << "msplim";
+ return;
+ case 11:
+ O << "psplim";
+ return;
+ case 0x88:
+ O << "msp_ns";
+ return;
+ case 0x89:
+ O << "psp_ns";
+ return;
+ case 0x8a:
+ O << "msplim_ns";
+ return;
+ case 0x8b:
+ O << "psplim_ns";
+ return;
+ case 0x90:
+ O << "primask_ns";
+ return;
+ case 0x91:
+ O << "basepri_ns";
+ return;
+ case 0x92:
+ O << "basepri_max_ns";
+ return;
+ case 0x93:
+ O << "faultmask_ns";
+ return;
+ case 0x94:
+ O << "control_ns";
+ return;
+ case 0x98:
+ O << "sp_ns";
+ return;
}
}
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 52f7115f0558..9d80eed84dc2 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -30,6 +30,12 @@ public:
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
raw_ostream &O);
+ virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -71,6 +77,9 @@ public:
template <bool AlwaysPrintImm0>
void printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ template <bool AlwaysPrintImm0>
+ void printAddrMode5FP16Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printAddrMode6Operand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printAddrMode7Operand(const MCInst *MI, unsigned OpNum,
diff --git a/lib/Target/ARM/InstPrinter/Makefile b/lib/Target/ARM/InstPrinter/Makefile
deleted file mode 100644
index 65d372e44b88..000000000000
--- a/lib/Target/ARM/InstPrinter/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM/AsmPrinter/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMAsmPrinter
-
-# Hack: we need to include 'main' arm target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index b03cada9a641..3959eab966a8 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -486,7 +486,7 @@ namespace ARM_AM {
// addrmode5 := reg +/- imm8*4
//
// The first operand is always a Reg. The second operand encodes the
- // operation in bit 8 and the immediate in bits 0-7.
+ // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
/// getAM5Opc - This function encodes the addrmode5 opc field.
static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
@@ -501,6 +501,29 @@ namespace ARM_AM {
}
//===--------------------------------------------------------------------===//
+ // Addressing Mode #5 FP16
+ //===--------------------------------------------------------------------===//
+ //
+ // This is used for coprocessor instructions, such as 16-bit FP load/stores.
+ //
+ // addrmode5fp16 := reg +/- imm8*2
+ //
+ // The first operand is always a Reg. The second operand encodes the
+ // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
+
+ /// getAM5FP16Opc - This function encodes the addrmode5fp16 opc field.
+ static inline unsigned getAM5FP16Opc(AddrOpc Opc, unsigned char Offset) {
+ bool isSub = Opc == sub;
+ return ((int)isSub << 8) | Offset;
+ }
+ static inline unsigned char getAM5FP16Offset(unsigned AM5Opc) {
+ return AM5Opc & 0xFF;
+ }
+ static inline AddrOpc getAM5FP16Op(unsigned AM5Opc) {
+ return ((AM5Opc >> 8) & 1) ? sub : add;
+ }
+
+ //===--------------------------------------------------------------------===//
// Addressing Mode #6
//===--------------------------------------------------------------------===//
//
@@ -650,6 +673,32 @@ namespace ARM_AM {
return FPUnion.F;
}
+ /// getFP16Imm - Return an 8-bit floating-point version of the 16-bit
+ /// floating-point value. If the value cannot be represented as an 8-bit
+ /// floating-point value, then return -1.
+ static inline int getFP16Imm(const APInt &Imm) {
+ uint32_t Sign = Imm.lshr(15).getZExtValue() & 1;
+ int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15
+ int64_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits
+
+ // We can handle 4 bits of mantissa.
+ // mantissa = (16+UInt(e:f:g:h))/16.
+ if (Mantissa & 0x3f)
+ return -1;
+ Mantissa >>= 6;
+
+ // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+ if (Exp < -3 || Exp > 4)
+ return -1;
+ Exp = ((Exp+3) & 0x7) ^ 4;
+
+ return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+ }
+
+ static inline int getFP16Imm(const APFloat &FPImm) {
+ return getFP16Imm(FPImm.bitcastToAPInt());
+ }
+
/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
/// floating-point value. If the value cannot be represented as an 8-bit
/// floating-point value, then return -1.
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index fa52c9354c17..0fc758201d47 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -46,6 +46,7 @@ public:
: MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI, ELF::EM_ARM,
/*HasRelocationAddend*/ false) {}
};
+} // end anonymous namespace
const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
const static MCFixupKindInfo InfosLE[ARM::NumTargetFixupKinds] = {
@@ -62,6 +63,10 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_t2_pcrel_10", 0, 32,
MCFixupKindInfo::FKF_IsPCRel |
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_pcrel_9", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_pcrel_9", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_thumb_adr_pcrel_10", 0, 8,
MCFixupKindInfo::FKF_IsPCRel |
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
@@ -78,7 +83,9 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_arm_condbl", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_arm_blx", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_thumb_blx", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_arm_thumb_cp", 0, 8,
MCFixupKindInfo::FKF_IsPCRel |
@@ -90,6 +97,7 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_arm_movw_lo16", 0, 20, 0},
{"fixup_t2_movt_hi16", 0, 20, 0},
{"fixup_t2_movw_lo16", 0, 20, 0},
+ {"fixup_arm_mod_imm", 0, 12, 0},
};
const static MCFixupKindInfo InfosBE[ARM::NumTargetFixupKinds] = {
// This table *must* be in the order that the fixup_* kinds are defined in
@@ -105,6 +113,10 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_t2_pcrel_10", 0, 32,
MCFixupKindInfo::FKF_IsPCRel |
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_pcrel_9", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_pcrel_9", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_thumb_adr_pcrel_10", 8, 8,
MCFixupKindInfo::FKF_IsPCRel |
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
@@ -121,7 +133,9 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_arm_condbl", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_arm_blx", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_thumb_blx", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_arm_thumb_cp", 8, 8,
MCFixupKindInfo::FKF_IsPCRel |
@@ -133,6 +147,7 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_arm_movw_lo16", 12, 20, 0},
{"fixup_t2_movt_hi16", 12, 20, 0},
{"fixup_t2_movw_lo16", 12, 20, 0},
+ {"fixup_arm_mod_imm", 20, 12, 0},
};
if (Kind < FirstTargetFixupKind)
@@ -155,10 +170,10 @@ void ARMAsmBackend::handleAssemblerFlag(MCAssemblerFlag Flag) {
break;
}
}
-} // end anonymous namespace
unsigned ARMAsmBackend::getRelaxedOpcode(unsigned Op) const {
bool HasThumb2 = STI->getFeatureBits()[ARM::FeatureThumb2];
+ bool HasV8MBaselineOps = STI->getFeatureBits()[ARM::HasV8MBaselineOps];
switch (Op) {
default:
@@ -170,7 +185,7 @@ unsigned ARMAsmBackend::getRelaxedOpcode(unsigned Op) const {
case ARM::tADR:
return HasThumb2 ? (unsigned)ARM::t2ADR : Op;
case ARM::tB:
- return HasThumb2 ? (unsigned)ARM::t2B : Op;
+ return HasV8MBaselineOps ? (unsigned)ARM::t2B : Op;
case ARM::tCBZ:
return ARM::tHINT;
case ARM::tCBNZ:
@@ -243,7 +258,9 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
return reasonForFixupRelaxation(Fixup, Value);
}
-void ARMAsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+void ARMAsmBackend::relaxInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI,
+ MCInst &Res) const {
unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
// Sanity check w/ diagnostic if we get here w/ a bogus instruction.
@@ -449,7 +466,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Offset by 8 just as above.
if (const MCSymbolRefExpr *SRE =
dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
- if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_TLSCALL)
+ if (SRE->getKind() == MCSymbolRefExpr::VK_TLSCALL)
return 0;
return 0xffffff & ((Value - 8) >> 2);
case ARM::fixup_t2_uncondbranch: {
@@ -524,10 +541,15 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
//
// Note that the halfwords are stored high first, low second; so we need
// to transpose the fixup value here to map properly.
- uint32_t offset = (Value - 2) >> 2;
+ if (Ctx && Value % 4 != 0) {
+ Ctx->reportError(Fixup.getLoc(), "misaligned ARM call destination");
+ return 0;
+ }
+
+ uint32_t offset = (Value - 4) >> 2;
if (const MCSymbolRefExpr *SRE =
dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
- if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_TLSCALL)
+ if (SRE->getKind() == MCSymbolRefExpr::VK_TLSCALL)
offset = 0;
uint32_t signBit = (offset & 0x400000) >> 22;
uint32_t I1Bit = (offset & 0x200000) >> 21;
@@ -563,7 +585,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
case ARM::fixup_arm_thumb_br:
// Offset by 4 and don't encode the lower bit, which is always 0.
- if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) {
+ if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] &&
+ !STI->getFeatureBits()[ARM::HasV8MBaselineOps]) {
const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
if (FixupDiagnostic) {
Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
@@ -624,6 +647,44 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
return Value;
}
+ case ARM::fixup_arm_pcrel_9:
+ Value = Value - 4; // ARM fixups offset by an additional word and don't
+ // need to adjust for the half-word ordering.
+ // Fall through.
+ case ARM::fixup_t2_pcrel_9: {
+ // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+ Value = Value - 4;
+ bool isAdd = true;
+ if ((int64_t)Value < 0) {
+ Value = -Value;
+ isAdd = false;
+ }
+ // These values don't encode the low bit since it's always zero.
+ if (Ctx && (Value & 1)) {
+ Ctx->reportError(Fixup.getLoc(), "invalid value for this fixup");
+ return 0;
+ }
+ Value >>= 1;
+ if (Ctx && Value >= 256) {
+ Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ return 0;
+ }
+ Value |= isAdd << 23;
+
+ // Same addressing mode as fixup_arm_pcrel_9, but with 16-bit halfwords
+ // swapped.
+ if (Kind == ARM::fixup_t2_pcrel_9)
+ return swapHalfWords(Value, IsLittleEndian);
+
+ return Value;
+ }
+ case ARM::fixup_arm_mod_imm:
+ Value = ARM_AM::getSOImmVal(Value);
+ if (Ctx && Value >> 12) {
+ Ctx->reportError(Fixup.getLoc(), "out of range immediate fixup value");
+ return 0;
+ }
+ return Value;
}
}
@@ -690,11 +751,13 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
case FK_Data_2:
case ARM::fixup_arm_thumb_br:
case ARM::fixup_arm_thumb_cb:
+ case ARM::fixup_arm_mod_imm:
return 2;
case ARM::fixup_arm_pcrel_10_unscaled:
case ARM::fixup_arm_ldst_pcrel_12:
case ARM::fixup_arm_pcrel_10:
+ case ARM::fixup_arm_pcrel_9:
case ARM::fixup_arm_adr_pcrel_12:
case ARM::fixup_arm_uncondbl:
case ARM::fixup_arm_condbl:
@@ -708,6 +771,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
case ARM::fixup_t2_condbranch:
case ARM::fixup_t2_uncondbranch:
case ARM::fixup_t2_pcrel_10:
+ case ARM::fixup_t2_pcrel_9:
case ARM::fixup_t2_adr_pcrel_12:
case ARM::fixup_arm_thumb_bl:
case ARM::fixup_arm_thumb_blx:
@@ -766,6 +830,7 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
case ARM::fixup_arm_movw_lo16:
case ARM::fixup_t2_movt_hi16:
case ARM::fixup_t2_movw_lo16:
+ case ARM::fixup_arm_mod_imm:
// Instruction size is 4 bytes.
return 4;
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 28a62132a419..84caaacc47d3 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -11,12 +11,12 @@
#define LLVM_LIB_TARGET_ARM_ARMASMBACKEND_H
#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
-using namespace llvm;
-
-namespace {
+namespace llvm {
class ARMAsmBackend : public MCAsmBackend {
const MCSubtargetInfo *STI;
@@ -63,7 +63,8 @@ public:
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override;
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
@@ -74,6 +75,6 @@ public:
void setIsThumb(bool it) { isThumbMode = it; }
bool isLittle() const { return IsLittleEndian; }
};
-} // end anonymous namespace
+} // end namespace llvm
#endif
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
index 995dd0fe08ee..09dc0173ade6 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -10,11 +10,10 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKENDDARWIN_H
#define LLVM_LIB_TARGET_ARM_ARMASMBACKENDDARWIN_H
+#include "ARMAsmBackend.h"
#include "llvm/Support/MachO.h"
-using namespace llvm;
-
-namespace {
+namespace llvm {
class ARMAsmBackendDarwin : public ARMAsmBackend {
const MCRegisterInfo &MRI;
public:
@@ -22,7 +21,6 @@ public:
ARMAsmBackendDarwin(const Target &T, const Triple &TT,
const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st)
: ARMAsmBackend(T, TT, /* IsLittleEndian */ true), MRI(MRI), Subtype(st) {
- HasDataInCodeSupport = true;
}
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
@@ -33,6 +31,6 @@ public:
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override;
};
-}
+} // end namespace llvm
#endif
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
index 68b12edd089e..748f915be17b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
@@ -10,7 +10,10 @@
#ifndef LLVM_LIB_TARGET_ARM_ELFARMASMBACKEND_H
#define LLVM_LIB_TARGET_ARM_ELFARMASMBACKEND_H
+#include "ARMAsmBackend.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
using namespace llvm;
+
namespace {
class ARMAsmBackendELF : public ARMAsmBackend {
public:
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
index 170f59a4c905..2a375be49a83 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
@@ -10,6 +10,7 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKENDWINCOFF_H
#define LLVM_LIB_TARGET_ARM_ARMASMBACKENDWINCOFF_H
+#include "ARMAsmBackend.h"
using namespace llvm;
namespace {
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index 4289a73e9d6b..088b4205ed62 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -289,18 +289,20 @@ namespace ARMII {
/// higher 16 bit of the address. Used only via movt instruction.
MO_HI16 = 0x2,
- /// MO_PLT - On a symbol operand, this represents an ELF PLT reference on a
- /// call operand.
- MO_PLT = 0x3,
-
/// MO_OPTION_MASK - Most flags are mutually exclusive; this mask selects
/// just that part of the flag set.
- MO_OPTION_MASK = 0x3f,
+ MO_OPTION_MASK = 0x1f,
/// MO_DLLIMPORT - On a symbol operand, this represents that the reference
/// to the symbol is for an import stub. This is used for DLL import
/// storage class indication on Windows.
- MO_DLLIMPORT = 0x40,
+ MO_DLLIMPORT = 0x20,
+
+ /// MO_SECREL - On a symbol operand this indicates that the immediate is
+ /// the offset from beginning of section.
+ ///
+ /// This is the TLS offset for the COFF/Windows TLS mechanism.
+ MO_SECREL = 0x40,
/// MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it
/// represents a symbol which, if indirect, will get special Darwin mangling
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 52eba8be288f..4118fe8e8cdb 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -34,8 +34,8 @@ namespace {
~ARMELFObjectWriter() override;
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const override;
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
bool needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned Type) const override;
@@ -67,7 +67,7 @@ bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
// Need to examine the Fixup when determining whether to
// emit the relocation as an explicit symbol or as a section relative
// offset
-unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
+unsigned ARMELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
return GetRelocTypeInner(Target, Fixup, IsPCRel);
@@ -98,6 +98,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_ARM_GOT_PREL:
Type = ELF::R_ARM_GOT_PREL;
break;
+ case MCSymbolRefExpr::VK_ARM_PREL31:
+ Type = ELF::R_ARM_PREL31;
+ break;
}
break;
case ARM::fixup_arm_blx:
@@ -106,7 +109,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_PLT:
Type = ELF::R_ARM_CALL;
break;
- case MCSymbolRefExpr::VK_ARM_TLSCALL:
+ case MCSymbolRefExpr::VK_TLSCALL:
Type = ELF::R_ARM_TLS_CALL;
break;
default:
@@ -120,6 +123,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
Type = ELF::R_ARM_JUMP24;
break;
case ARM::fixup_t2_condbranch:
+ Type = ELF::R_ARM_THM_JUMP19;
+ break;
case ARM::fixup_t2_uncondbranch:
Type = ELF::R_ARM_THM_JUMP24;
break;
@@ -138,7 +143,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case ARM::fixup_arm_thumb_bl:
case ARM::fixup_arm_thumb_blx:
switch (Modifier) {
- case MCSymbolRefExpr::VK_ARM_TLSCALL:
+ case MCSymbolRefExpr::VK_TLSCALL:
Type = ELF::R_ARM_THM_TLS_CALL;
break;
default:
@@ -210,10 +215,10 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_ARM_TLSLDO:
Type = ELF::R_ARM_TLS_LDO32;
break;
- case MCSymbolRefExpr::VK_ARM_TLSCALL:
+ case MCSymbolRefExpr::VK_TLSCALL:
Type = ELF::R_ARM_TLS_CALL;
break;
- case MCSymbolRefExpr::VK_ARM_TLSDESC:
+ case MCSymbolRefExpr::VK_TLSDESC:
Type = ELF::R_ARM_TLS_GOTDESC;
break;
case MCSymbolRefExpr::VK_ARM_TLSDESCSEQ:
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 57577dc834b7..36cb74765f3b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -763,6 +763,12 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
setAttributeItem(Virtualization_use, AllowTZVirtualization, false);
break;
+ case ARM::AK_ARMV8MBaseline:
+ case ARM::AK_ARMV8MMainline:
+ setAttributeItem(THUMB_ISA_use, AllowThumbDerived, false);
+ setAttributeItem(CPU_arch_profile, MicroControllerProfile, false);
+ break;
+
case ARM::AK_IWMMXT:
setAttributeItem(ARM_ISA_use, Allowed, false);
setAttributeItem(THUMB_ISA_use, Allowed, false);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
index 46ba57170db5..3fe2302bdd37 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
@@ -33,6 +33,13 @@ enum Fixups {
// fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for
// the short-swapped encoding of Thumb2 instructions.
fixup_t2_pcrel_10,
+ // fixup_arm_pcrel_9 - 9-bit PC relative relocation for symbol addresses
+ // used in VFP instructions where bit 0 is not encoded (so it's encoded as an
+ // 8-bit immediate).
+ fixup_arm_pcrel_9,
+ // fixup_t2_pcrel_9 - Equivalent to fixup_arm_pcrel_9, accounting for
+ // the short-swapped encoding of Thumb2 instructions.
+ fixup_t2_pcrel_9,
// fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol
// addresses where the lower 2 bits are not encoded (so it's encoded as an
// 8-bit immediate).
@@ -100,6 +107,9 @@ enum Fixups {
fixup_t2_movt_hi16, // :upper16:
fixup_t2_movw_lo16, // :lower16:
+ // fixup_arm_mod_imm - Fixup for mod_imm
+ fixup_arm_mod_imm,
+
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index bda37f6616a8..53cd29a6061e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -13,7 +13,6 @@
#include "ARMMCAsmInfo.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -33,7 +32,7 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) {
SupportsDebugInformation = true;
// Exceptions handling
- ExceptionsType = TheTriple.isOSDarwin() && !TheTriple.isWatchOS()
+ ExceptionsType = (TheTriple.isOSDarwin() && !TheTriple.isWatchABI())
? ExceptionHandling::SjLj
: ExceptionHandling::DwarfCFI;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index b88578309f08..9fca13eeea93 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -120,11 +120,11 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- /// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
+ /// getThumbBranchTargetOpValue - Return encoding info for 24-bit
/// immediate Thumb2 direct branch target.
- uint32_t getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ uint32_t getThumbBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getARMBranchTargetOpValue - Return encoding info for 24-bit immediate
/// branch target.
@@ -214,11 +214,6 @@ public:
llvm_unreachable("Invalid ShiftOpc!");
}
- /// getAddrMode2OpValue - Return encoding for addrmode2 operands.
- uint32_t getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
/// getAddrMode2OffsetOpValue - Return encoding for am2offset operands.
uint32_t getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
@@ -255,11 +250,16 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- /// getAddrMode5OpValue - Return encoding info for 'reg +/- imm8' operand.
+ /// getAddrMode5OpValue - Return encoding info for 'reg +/- (imm8 << 2)' operand.
uint32_t getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ /// getAddrMode5FP16OpValue - Return encoding info for 'reg +/- (imm8 << 1)' operand.
+ uint32_t getAddrMode5FP16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
/// getCCOutOpValue - Return encoding of the 's' bit.
unsigned getCCOutOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups,
@@ -312,12 +312,8 @@ public:
// Support for fixups (MCFixup)
if (MO.isExpr()) {
const MCExpr *Expr = MO.getExpr();
- // In instruction code this value always encoded as lowest 12 bits,
- // so we don't have to perform any specific adjustments.
- // Due to requirements of relocatable records we have to use FK_Data_4.
- // See ARMELFObjectWriter::ExplicitRelSym and
- // ARMELFObjectWriter::GetRelocTypeInner for more details.
- MCFixupKind Kind = MCFixupKind(FK_Data_4);
+ // Fixups resolve to plain values that need to be encoded.
+ MCFixupKind Kind = MCFixupKind(ARM::fixup_arm_mod_imm);
Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
return 0;
}
@@ -345,9 +341,6 @@ public:
unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- unsigned getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
/// getSORegOpValue - Return an encoded so_reg shifted register value.
unsigned getSORegRegOpValue(const MCInst &MI, unsigned Op,
@@ -757,10 +750,9 @@ getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
/// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
/// immediate branch target.
-uint32_t ARMMCCodeEmitter::
-getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+uint32_t ARMMCCodeEmitter::getThumbBranchTargetOpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
unsigned Val = 0;
const MCOperand MO = MI.getOperand(OpIdx);
@@ -1049,12 +1041,12 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
switch (ARM16Expr->getKind()) {
default: llvm_unreachable("Unsupported ARMFixup");
case ARMMCExpr::VK_ARM_HI16:
- Kind = MCFixupKind(isThumb2(STI) ? ARM::fixup_t2_movt_hi16
- : ARM::fixup_arm_movt_hi16);
+ Kind = MCFixupKind(isThumb(STI) ? ARM::fixup_t2_movt_hi16
+ : ARM::fixup_arm_movt_hi16);
break;
case ARMMCExpr::VK_ARM_LO16:
- Kind = MCFixupKind(isThumb2(STI) ? ARM::fixup_t2_movw_lo16
- : ARM::fixup_arm_movw_lo16);
+ Kind = MCFixupKind(isThumb(STI) ? ARM::fixup_t2_movw_lo16
+ : ARM::fixup_arm_movw_lo16);
break;
}
@@ -1105,21 +1097,6 @@ getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
}
uint32_t ARMMCCodeEmitter::
-getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- // {17-14} Rn
- // {13} 1 == imm12, 0 == Rm
- // {12} isAdd
- // {11-0} imm12/Rm
- const MCOperand &MO = MI.getOperand(OpIdx);
- unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
- uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups, STI);
- Binary |= Rn << 14;
- return Binary;
-}
-
-uint32_t ARMMCCodeEmitter::
getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -1252,7 +1229,7 @@ getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
return (MO.getImm() >> 2);
}
-/// getAddrMode5OpValue - Return encoding info for 'reg +/- imm10' operand.
+/// getAddrMode5OpValue - Return encoding info for 'reg +/- (imm8 << 2)' operand.
uint32_t ARMMCCodeEmitter::
getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
@@ -1292,6 +1269,46 @@ getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
return Binary;
}
+/// getAddrMode5FP16OpValue - Return encoding info for 'reg +/- (imm8 << 1)' operand.
+uint32_t ARMMCCodeEmitter::
+getAddrMode5FP16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // {12-9} = reg
+ // {8} = (U)nsigned (add == '1', sub == '0')
+ // {7-0} = imm8
+ unsigned Reg, Imm8;
+ bool isAdd;
+ // If The first operand isn't a register, we have a label reference.
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg()) {
+ Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC); // Rn is PC.
+ Imm8 = 0;
+ isAdd = false; // 'U' bit is handled as part of the fixup.
+
+ assert(MO.isExpr() && "Unexpected machine operand type!");
+ const MCExpr *Expr = MO.getExpr();
+ MCFixupKind Kind;
+ if (isThumb2(STI))
+ Kind = MCFixupKind(ARM::fixup_t2_pcrel_9);
+ else
+ Kind = MCFixupKind(ARM::fixup_arm_pcrel_9);
+ Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+ ++MCNumCPRelocations;
+ } else {
+ EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups, STI);
+ isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add;
+ }
+
+ uint32_t Binary = ARM_AM::getAM5Offset(Imm8);
+ // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+ if (isAdd)
+ Binary |= (1 << 8);
+ Binary |= (Reg << 9);
+ return Binary;
+}
+
unsigned ARMMCCodeEmitter::
getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
@@ -1446,23 +1463,6 @@ getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
}
unsigned ARMMCCodeEmitter::
-getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO1 = MI.getOperand(OpNum);
-
- // FIXME: Needs fixup support.
- unsigned Value = 0;
- int32_t tmp = (int32_t)MO1.getImm();
- if (tmp < 0)
- tmp = abs(tmp);
- else
- Value |= 4096; // Set the ADD bit
- Value |= tmp & 4095;
- return Value;
-}
-
-unsigned ARMMCCodeEmitter::
getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 8c8c249addb5..afb089ab0286 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -16,7 +16,6 @@
#include "ARMMCTargetDesc.h"
#include "InstPrinter/ARMInstPrinter.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -201,18 +200,6 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createARMMCCodeGenInfo(const Triple &TT, Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
- if (RM == Reloc::Default) {
- // Default relocation model on Darwin is PIC, not DynamicNoPIC.
- RM = TT.isOSDarwin() ? Reloc::PIC_ : Reloc::DynamicNoPIC;
- }
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
-}
-
static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
MCAsmBackend &MAB, raw_pwrite_stream &OS,
MCCodeEmitter *Emitter, bool RelaxAll) {
@@ -291,9 +278,6 @@ extern "C" void LLVMInitializeARMTargetMC() {
// Register the MC asm info.
RegisterMCAsmInfoFn X(*T, createARMMCAsmInfo);
- // Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(*T, createARMMCCodeGenInfo);
-
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createARMMCInstrInfo);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
index 4468132588cf..482bcf902518 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -12,7 +12,7 @@
#include "llvm-c/Disassembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
using namespace llvm;
using namespace object;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index cfd504e533af..cfa6ce7da65e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -389,7 +389,8 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
uint32_t Offset = Target.getConstant();
if (IsPCRel && RelocType == MachO::ARM_RELOC_VANILLA)
Offset += 1 << Log2Size;
- if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A))
+ if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A) &&
+ RelocType != MachO::ARM_RELOC_HALF)
return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
Target, RelocType, Log2Size,
FixedValue);
@@ -447,8 +448,10 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
// Even when it's not a scattered relocation, movw/movt always uses
// a PAIR relocation.
if (Type == MachO::ARM_RELOC_HALF) {
- // The other-half value only gets populated for the movt and movw
- // relocation entries.
+ // The entire addend is needed to correctly apply a relocation. One half is
+ // extracted from the instruction itself, the other comes from this
+ // PAIR. I.e. it's correct that we insert the high bits of the addend in the
+ // MOVW case here.
uint32_t Value = 0;
switch ((unsigned)Fixup.getKind()) {
default: break;
diff --git a/lib/Target/ARM/MCTargetDesc/Makefile b/lib/Target/ARM/MCTargetDesc/Makefile
deleted file mode 100644
index 448ed9df2bff..000000000000
--- a/lib/Target/ARM/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/ARM/TargetDesc/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index ed2deeaa24c0..7f2124033982 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -378,12 +378,14 @@ bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) {
}
bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
TII = static_cast<const ARMBaseInstrInfo *>(Fn.getSubtarget().getInstrInfo());
TRI = Fn.getSubtarget().getRegisterInfo();
MRI = &Fn.getRegInfo();
const ARMSubtarget *STI = &Fn.getSubtarget<ARMSubtarget>();
- // Only run this for CortexA9.
- if (!STI->isCortexA9())
+ if (!STI->expandMLx())
return false;
isLikeA9 = STI->isLikeA9() || STI->isSwift();
isSwift = STI->isSwift();
diff --git a/lib/Target/ARM/Makefile b/lib/Target/ARM/Makefile
deleted file mode 100644
index c1601a3f29dd..000000000000
--- a/lib/Target/ARM/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-##===- lib/Target/ARM/Makefile -----------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMARMCodeGen
-TARGET = ARM
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = ARMGenRegisterInfo.inc ARMGenInstrInfo.inc \
- ARMGenAsmWriter.inc ARMGenAsmMatcher.inc \
- ARMGenDAGISel.inc ARMGenSubtargetInfo.inc \
- ARMGenCallingConv.inc \
- ARMGenFastISel.inc ARMGenMCCodeEmitter.inc \
- ARMGenMCPseudoLowering.inc ARMGenDisassemblerTables.inc
-
-DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
index 5acb2d46f3e7..549af00fcc99 100644
--- a/lib/Target/ARM/README.txt
+++ b/lib/Target/ARM/README.txt
@@ -115,7 +115,7 @@ L6:
.long -858993459
.long 1074318540
-3) struct copies appear to be done field by field
+3) struct copies appear to be done field by field
instead of by words, at least sometimes:
struct foo { int x; short s; char c1; char c2; };
@@ -142,7 +142,7 @@ a good way to measure on ARM).
* Consider this silly example:
-double bar(double x) {
+double bar(double x) {
double r = foo(3.1);
return x+r;
}
@@ -162,7 +162,7 @@ _bar:
fmrrd r0, r1, d0
ldmfd sp!, {r4, r5, r7, pc}
-Ignore the prologue and epilogue stuff for a second. Note
+Ignore the prologue and epilogue stuff for a second. Note
mov r4, r0
mov r5, r1
the copys to callee-save registers and the fact they are only being used by the
@@ -269,7 +269,7 @@ LBB4:
b LBB2
If BB4 is the only predecessor of BB3, then we can emit BB3 after BB4. We can
-then eliminate beq and and turn the unconditional branch to LBB2 to a bne.
+then eliminate beq and turn the unconditional branch to LBB2 to a bne.
See McCat/18-imp/ComputeBoundingBoxes for an example.
@@ -391,10 +391,10 @@ void foo(signed char* p) {
}
llvm decides it's a good idea to turn the repeated if...else into a
-binary tree, as if it were a switch; the resulting code requires -1
+binary tree, as if it were a switch; the resulting code requires -1
compare-and-branches when *p<=2 or *p==5, the same number if *p==4
or *p>6, and +1 if *p==3. So it should be a speed win
-(on balance). However, the revised code is larger, with 4 conditional
+(on balance). However, the revised code is larger, with 4 conditional
branches instead of 3.
More seriously, there is a byte->word extend before
@@ -421,8 +421,8 @@ int foo(int a, int b, int c, int d) {
return (int)(acc >> 32);
}
-Should compile to use SMLAL (Signed Multiply Accumulate Long) which multiplies
-two signed 32-bit values to produce a 64-bit value, and accumulates this with
+Should compile to use SMLAL (Signed Multiply Accumulate Long) which multiplies
+two signed 32-bit values to produce a 64-bit value, and accumulates this with
a 64-bit value.
We currently get this with both v4 and v6:
@@ -513,7 +513,7 @@ Be careful though as the last attempt caused infinite looping on lencod.
//===---------------------------------------------------------------------===//
-Predication issue. This function:
+Predication issue. This function:
extern unsigned array[ 128 ];
int foo( int x ) {
diff --git a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
index df73554372d8..3f88eb818062 100644
--- a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
+++ b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -8,7 +8,6 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/ARMMCTargetDesc.h"
-#include "llvm/IR/Module.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/ARM/TargetInfo/Makefile b/lib/Target/ARM/TargetInfo/Makefile
deleted file mode 100644
index 6292ab14b346..000000000000
--- a/lib/Target/ARM/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM/TargetInfo/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 93e0ac4aa320..c0732e4b750a 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -38,18 +38,17 @@ bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const{
return !MF.getFrameInfo()->hasVarSizedObjects();
}
-static void
-emitSPUpdate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const TargetInstrInfo &TII, DebugLoc dl,
- const ThumbRegisterInfo &MRI,
- int NumBytes, unsigned MIFlags = MachineInstr::NoFlags) {
+static void emitSPUpdate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const TargetInstrInfo &TII, const DebugLoc &dl,
+ const ThumbRegisterInfo &MRI, int NumBytes,
+ unsigned MIFlags = MachineInstr::NoFlags) {
emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII,
MRI, MIFlags);
}
-void Thumb1FrameLowering::
+MachineBasicBlock::iterator Thumb1FrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
const Thumb1InstrInfo &TII =
@@ -60,9 +59,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// If we have alloca, convert as follows:
// ADJCALLSTACKDOWN -> sub, sp, sp, amount
// ADJCALLSTACKUP -> add, sp, sp, amount
- MachineInstr *Old = I;
- DebugLoc dl = Old->getDebugLoc();
- unsigned Amount = Old->getOperand(0).getImm();
+ MachineInstr &Old = *I;
+ DebugLoc dl = Old.getDebugLoc();
+ unsigned Amount = Old.getOperand(0).getImm();
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
@@ -71,7 +70,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
Amount = (Amount+Align-1)/Align*Align;
// Replace the pseudo instruction with a new instruction...
- unsigned Opc = Old->getOpcode();
+ unsigned Opc = Old.getOpcode();
if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
emitSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount);
} else {
@@ -80,7 +79,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
}
}
}
- MBB.erase(I);
+ return MBB.erase(I);
}
void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
@@ -151,7 +150,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
case ARM::R9:
case ARM::R10:
case ARM::R11:
- if (STI.isTargetMachO()) {
+ if (STI.splitFramePushPop()) {
GPRCS2Size += 4;
break;
}
@@ -189,7 +188,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
int FramePtrOffsetInBlock = 0;
unsigned adjustedGPRCS1Size = GPRCS1Size;
- if (tryFoldSPUpdateIntoPushPop(STI, MF, std::prev(MBBI), NumBytes)) {
+ if (tryFoldSPUpdateIntoPushPop(STI, MF, &*std::prev(MBBI), NumBytes)) {
FramePtrOffsetInBlock = NumBytes;
adjustedGPRCS1Size += NumBytes;
NumBytes = 0;
@@ -213,7 +212,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
case ARM::R10:
case ARM::R11:
case ARM::R12:
- if (STI.isTargetMachO())
+ if (STI.splitFramePushPop())
break;
// fallthough
case ARM::R0:
@@ -304,16 +303,15 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
AFI->setShouldRestoreSPFromFP(true);
}
-static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) {
- if (MI->getOpcode() == ARM::tLDRspi &&
- MI->getOperand(1).isFI() &&
- isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs))
+static bool isCSRestore(MachineInstr &MI, const MCPhysReg *CSRegs) {
+ if (MI.getOpcode() == ARM::tLDRspi && MI.getOperand(1).isFI() &&
+ isCalleeSavedRegister(MI.getOperand(0).getReg(), CSRegs))
return true;
- else if (MI->getOpcode() == ARM::tPOP) {
+ else if (MI.getOpcode() == ARM::tPOP) {
// The first two operands are predicates. The last two are
// imp-def and imp-use of SP. Check everything in between.
- for (int i = 2, e = MI->getNumOperands() - 2; i != e; ++i)
- if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs))
+ for (int i = 2, e = MI.getNumOperands() - 2; i != e; ++i)
+ if (!isCalleeSavedRegister(MI.getOperand(i).getReg(), CSRegs))
return false;
return true;
}
@@ -346,8 +344,8 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
if (MBBI != MBB.begin()) {
do
--MBBI;
- while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
- if (!isCSRestore(MBBI, CSRegs))
+ while (MBBI != MBB.begin() && isCSRestore(*MBBI, CSRegs));
+ if (!isCSRestore(*MBBI, CSRegs))
++MBBI;
}
@@ -376,11 +374,11 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
.addReg(FramePtr));
} else {
if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET &&
- &MBB.front() != MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) {
+ &MBB.front() != &*MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) {
MachineBasicBlock::iterator PMBBI = std::prev(MBBI);
- if (!tryFoldSPUpdateIntoPushPop(STI, MF, PMBBI, NumBytes))
+ if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*PMBBI, NumBytes))
emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
- } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, MBBI, NumBytes))
+ } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
}
}
@@ -467,7 +465,7 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
// Look for a temporary register to use.
// First, compute the liveness information.
LivePhysRegs UsedRegs(STI.getRegisterInfo());
- UsedRegs.addLiveOuts(&MBB, /*AddPristines*/ true);
+ UsedRegs.addLiveOuts(MBB);
// The semantic of pristines changed recently and now,
// the callee-saved registers that are touched in the function
// are not part of the pristines set anymore.
@@ -637,7 +635,7 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
Reg = ARM::PC;
(*MIB).setDesc(TII.get(ARM::tPOP_RET));
if (MI != MBB.end())
- MIB.copyImplicitOps(&*MI);
+ MIB.copyImplicitOps(*MI);
MI = MBB.erase(MI);
} else
// LR may only be popped into PC, as part of return sequence.
diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h
index 27faac63683a..9de1ba1d7009 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.h
+++ b/lib/Target/ARM/Thumb1FrameLowering.h
@@ -41,7 +41,7 @@ public:
bool hasReservedCallFrame(const MachineFunction &MF) const override;
- void
+ MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index 530e1d33839a..159731d8fc72 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -38,9 +38,9 @@ unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const {
}
void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
// Need to check the arch.
MachineFunction &MF = *MBB.getParent();
const ARMSubtarget &st = MF.getSubtarget<ARMSubtarget>();
@@ -118,11 +118,12 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
}
}
-void
-Thumb1InstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
- Reloc::Model RM) const {
- if (RM == Reloc::PIC_)
- expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_pcrel, ARM::tLDRi, RM);
+void Thumb1InstrInfo::expandLoadStackGuard(
+ MachineBasicBlock::iterator MI) const {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ const TargetMachine &TM = MF.getTarget();
+ if (TM.isPositionIndependent())
+ expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_pcrel, ARM::tLDRi);
else
- expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::tLDRi, RM);
+ expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::tLDRi);
}
diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h
index f3f493d89237..931914ad2799 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/lib/Target/ARM/Thumb1InstrInfo.h
@@ -38,9 +38,8 @@ public:
///
const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -55,8 +54,7 @@ public:
const TargetRegisterInfo *TRI) const override;
private:
- void expandLoadStackGuard(MachineBasicBlock::iterator MI,
- Reloc::Model RM) const override;
+ void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
};
}
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index bf0498dfda69..0c7055551632 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -36,6 +36,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "Thumb IT blocks insertion pass";
}
@@ -165,7 +170,7 @@ Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI,
++I;
if (I != E) {
unsigned NPredReg = 0;
- ARMCC::CondCodes NCC = getITInstrPredicate(I, NPredReg);
+ ARMCC::CondCodes NCC = getITInstrPredicate(*I, NPredReg);
if (NCC == CC || NCC == OCC)
return true;
}
@@ -182,7 +187,7 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
MachineInstr *MI = &*MBBI;
DebugLoc dl = MI->getDebugLoc();
unsigned PredReg = 0;
- ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg);
if (CC == ARMCC::AL) {
++MBBI;
continue;
@@ -222,7 +227,7 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
MI = NMI;
unsigned NPredReg = 0;
- ARMCC::CondCodes NCC = getITInstrPredicate(NMI, NPredReg);
+ ARMCC::CondCodes NCC = getITInstrPredicate(*NMI, NPredReg);
if (NCC == CC || NCC == OCC) {
Mask |= (NCC & 1) << Pos;
// Add implicit use of ITSTATE.
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 4da769f23280..e2e6dafd218a 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -50,7 +50,7 @@ Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
MachineBasicBlock *NewDest) const {
MachineBasicBlock *MBB = Tail->getParent();
ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
- if (!AFI->hasITBlocks()) {
+ if (!AFI->hasITBlocks() || Tail->isBranch()) {
TargetInstrInfo::ReplaceTailWithBranchTo(Tail, NewDest);
return;
}
@@ -58,7 +58,7 @@ Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
// If the first instruction of Tail is predicated, we may have to update
// the IT instruction.
unsigned PredReg = 0;
- ARMCC::CondCodes CC = getInstrPredicate(Tail, PredReg);
+ ARMCC::CondCodes CC = getInstrPredicate(*Tail, PredReg);
MachineBasicBlock::iterator MBBI = Tail;
if (CC != ARMCC::AL)
// Expecting at least the t2IT instruction before it.
@@ -106,13 +106,13 @@ Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
}
unsigned PredReg = 0;
- return getITInstrPredicate(MBBI, PredReg) == ARMCC::AL;
+ return getITInstrPredicate(*MBBI, PredReg) == ARMCC::AL;
}
void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
// Handle SPR, DPR, and QPR copies.
if (!ARM::GPRRegClass.contains(DestReg, SrcReg))
return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc);
@@ -148,8 +148,10 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
// Thumb2 STRD expects its dest-registers to be in rGPR. Not a problem for
// gsub_0, but needs an extra constraint for gsub_1 (which could be sp
// otherwise).
- MachineRegisterInfo *MRI = &MF.getRegInfo();
- MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+ }
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8));
AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
@@ -187,8 +189,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
// Thumb2 LDRD expects its dest-registers to be in rGPR. Not a problem for
// gsub_0, but needs an extra constraint for gsub_1 (which could be sp
// otherwise).
- MachineRegisterInfo *MRI = &MF.getRegInfo();
- MRI->constrainRegClass(DestReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+ if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ MRI->constrainRegClass(DestReg,
+ &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+ }
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8));
AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
@@ -204,20 +209,22 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI);
}
-void
-Thumb2InstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
- Reloc::Model RM) const {
- if (RM == Reloc::PIC_)
- expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12, RM);
+void Thumb2InstrInfo::expandLoadStackGuard(
+ MachineBasicBlock::iterator MI) const {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ if (MF.getTarget().isPositionIndependent())
+ expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12);
else
- expandLoadStackGuardBase(MI, ARM::t2MOVi32imm, ARM::t2LDRi12, RM);
+ expandLoadStackGuardBase(MI, ARM::t2MOVi32imm, ARM::t2LDRi12);
}
void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, DebugLoc dl,
- unsigned DestReg, unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
- const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII,
+ unsigned MIFlags) {
if (NumBytes == 0 && DestReg != BaseReg) {
BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
.addReg(BaseReg, RegState::Kill)
@@ -459,7 +466,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
Offset += MI.getOperand(FrameRegIdx+1).getImm();
unsigned PredReg;
- if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) {
+ if (Offset == 0 && getInstrPredicate(MI, PredReg) == ARMCC::AL) {
// Turn it into a move.
MI.setDesc(TII.get(ARM::tMOVr));
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
@@ -627,9 +634,9 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
return Offset == 0;
}
-ARMCC::CondCodes
-llvm::getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
- unsigned Opc = MI->getOpcode();
+ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI,
+ unsigned &PredReg) {
+ unsigned Opc = MI.getOpcode();
if (Opc == ARM::tBcc || Opc == ARM::t2Bcc)
return ARMCC::AL;
return getInstrPredicate(MI, PredReg);
diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h
index 916ab06ec305..15d63300b6a2 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/lib/Target/ARM/Thumb2InstrInfo.h
@@ -39,9 +39,8 @@ public:
bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const override;
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -63,16 +62,13 @@ public:
const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }
private:
- void expandLoadStackGuard(MachineBasicBlock::iterator MI,
- Reloc::Model RM) const override;
+ void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
};
/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
/// to llvm::getInstrPredicate except it returns AL for conditional branch
/// instructions which are "predicated", but are not in IT blocks.
-ARMCC::CondCodes getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
-
-
+ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
}
#endif
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index bcd0e5751258..c4fdb9b3147d 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -18,11 +18,12 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/Function.h" // To access Function attributes
+#include "llvm/IR/Function.h" // To access Function attributes
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "t2-reduce-size"
@@ -115,12 +116,14 @@ namespace {
{ ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2LDR_POST,ARM::tLDMIA_UPD,0, 0, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 0,1,0 },
{ ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2STR_POST,ARM::tSTMIA_UPD,0, 0, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
{ ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 },
@@ -143,6 +146,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "Thumb2 instruction size reduction pass";
}
@@ -208,7 +216,7 @@ namespace {
}
Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor)
- : MachineFunctionPass(ID), PredicateFtor(Ftor) {
+ : MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {
OptimizeSize = MinimizeSize = false;
for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) {
unsigned FromOpc = ReduceTable[i].WideOpc;
@@ -417,6 +425,46 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
HasShift = true;
OpNum = 4;
break;
+ case ARM::t2LDR_POST:
+ case ARM::t2STR_POST: {
+ if (!MBB.getParent()->getFunction()->optForMinSize())
+ return false;
+
+ // We're creating a completely different type of load/store - LDM from LDR.
+ // For this reason we can't reuse the logic at the end of this function; we
+ // have to implement the MI building here.
+ bool IsStore = Entry.WideOpc == ARM::t2STR_POST;
+ unsigned Rt = MI->getOperand(IsStore ? 1 : 0).getReg();
+ unsigned Rn = MI->getOperand(IsStore ? 0 : 1).getReg();
+ unsigned Offset = MI->getOperand(3).getImm();
+ unsigned PredImm = MI->getOperand(4).getImm();
+ unsigned PredReg = MI->getOperand(5).getReg();
+ assert(isARMLowRegister(Rt));
+ assert(isARMLowRegister(Rn));
+
+ if (Offset != 4)
+ return false;
+
+ // Add the 16-bit load / store instruction.
+ DebugLoc dl = MI->getDebugLoc();
+ auto MIB = BuildMI(MBB, MI, dl, TII->get(Entry.NarrowOpc1))
+ .addReg(Rn, RegState::Define)
+ .addReg(Rn)
+ .addImm(PredImm)
+ .addReg(PredReg)
+ .addReg(Rt, IsStore ? 0 : RegState::Define);
+
+ // Transfer memoperands.
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ // Transfer MI flags.
+ MIB.setMIFlags(MI->getFlags());
+
+ // Kill the old instruction.
+ MI->eraseFromBundle();
+ ++NumLdSts;
+ return true;
+ }
case ARM::t2LDMIA: {
unsigned BaseReg = MI->getOperand(0).getReg();
assert(isARMLowRegister(BaseReg));
@@ -597,7 +645,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
case ARM::t2ADDSri:
case ARM::t2ADDSrr: {
unsigned PredReg = 0;
- if (getInstrPredicate(MI, PredReg) == ARMCC::AL) {
+ if (getInstrPredicate(*MI, PredReg) == ARMCC::AL) {
switch (Opc) {
default: break;
case ARM::t2ADDSri: {
@@ -670,7 +718,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
if (Reg1 != Reg0)
return false;
// Try to commute the operands to make it a 2-address instruction.
- MachineInstr *CommutedMI = TII->commuteInstruction(MI);
+ MachineInstr *CommutedMI = TII->commuteInstruction(*MI);
if (!CommutedMI)
return false;
}
@@ -678,11 +726,11 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
// Try to commute the operands to make it a 2-address instruction.
unsigned CommOpIdx1 = 1;
unsigned CommOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;
- if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) ||
+ if (!TII->findCommutedOpIndices(*MI, CommOpIdx1, CommOpIdx2) ||
MI->getOperand(CommOpIdx2).getReg() != Reg0)
return false;
MachineInstr *CommutedMI =
- TII->commuteInstruction(MI, false, CommOpIdx1, CommOpIdx2);
+ TII->commuteInstruction(*MI, false, CommOpIdx1, CommOpIdx2);
if (!CommutedMI)
return false;
}
@@ -702,7 +750,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
// Check if it's possible / necessary to transfer the predicate.
const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc2);
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
bool SkipPred = false;
if (Pred != ARMCC::AL) {
if (!NewMCID.isPredicable())
@@ -798,7 +846,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
// Check if it's possible / necessary to transfer the predicate.
const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc1);
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
bool SkipPred = false;
if (Pred != ARMCC::AL) {
if (!NewMCID.isPredicable())
@@ -983,7 +1031,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
NextMII->bundleWithPred();
}
- if (!NextInSameBundle && MI->isInsideBundle()) {
+ if (BundleMI && !NextInSameBundle && MI->isInsideBundle()) {
// FIXME: Since post-ra scheduler operates on bundles, the CPSR kill
// marker is only on the BUNDLE instruction. Process the BUNDLE
// instruction as we finish with the bundled instruction to work around
@@ -1050,5 +1098,5 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
/// reduction pass.
FunctionPass *llvm::createThumb2SizeReductionPass(
std::function<bool(const Function &)> Ftor) {
- return new Thumb2SizeReduce(Ftor);
+ return new Thumb2SizeReduce(std::move(Ftor));
}
diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp
index b5f9d7e38f27..6c26c8843865 100644
--- a/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -61,7 +61,7 @@ ThumbRegisterInfo::getPointerRegClass(const MachineFunction &MF,
static void emitThumb1LoadConstPool(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- DebugLoc dl, unsigned DestReg,
+ const DebugLoc &dl, unsigned DestReg,
unsigned SubIdx, int Val,
ARMCC::CondCodes Pred, unsigned PredReg,
unsigned MIFlags) {
@@ -81,7 +81,7 @@ static void emitThumb1LoadConstPool(MachineBasicBlock &MBB,
static void emitThumb2LoadConstPool(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- DebugLoc dl, unsigned DestReg,
+ const DebugLoc &dl, unsigned DestReg,
unsigned SubIdx, int Val,
ARMCC::CondCodes Pred, unsigned PredReg,
unsigned MIFlags) {
@@ -101,9 +101,9 @@ static void emitThumb2LoadConstPool(MachineBasicBlock &MBB,
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
void ThumbRegisterInfo::emitLoadConstPool(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl,
- unsigned DestReg, unsigned SubIdx, int Val, ARMCC::CondCodes Pred,
- unsigned PredReg, unsigned MIFlags) const {
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const {
MachineFunction &MF = *MBB.getParent();
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
if (STI.isThumb1Only()) {
@@ -120,57 +120,55 @@ void ThumbRegisterInfo::emitLoadConstPool(
/// a destreg = basereg + immediate in Thumb code. Materialize the immediate
/// in a register using mov / mvn sequences or load the immediate from a
/// constpool entry.
-static
-void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- DebugLoc dl,
- unsigned DestReg, unsigned BaseReg,
- int NumBytes, bool CanChangeCC,
- const TargetInstrInfo &TII,
- const ARMBaseRegisterInfo& MRI,
- unsigned MIFlags = MachineInstr::NoFlags) {
- MachineFunction &MF = *MBB.getParent();
- bool isHigh = !isARMLowRegister(DestReg) ||
- (BaseReg != 0 && !isARMLowRegister(BaseReg));
- bool isSub = false;
- // Subtract doesn't have high register version. Load the negative value
- // if either base or dest register is a high register. Also, if do not
- // issue sub as part of the sequence if condition register is to be
- // preserved.
- if (NumBytes < 0 && !isHigh && CanChangeCC) {
- isSub = true;
- NumBytes = -NumBytes;
- }
- unsigned LdReg = DestReg;
- if (DestReg == ARM::SP)
- assert(BaseReg == ARM::SP && "Unexpected!");
- if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg))
- LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
-
- if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) {
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
- .addImm(NumBytes).setMIFlags(MIFlags);
- } else if (NumBytes < 0 && NumBytes >= -255 && CanChangeCC) {
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
- .addImm(NumBytes).setMIFlags(MIFlags);
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
- .addReg(LdReg, RegState::Kill).setMIFlags(MIFlags);
- } else
- MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes,
- ARMCC::AL, 0, MIFlags);
-
- // Emit add / sub.
- int Opc = (isSub) ? ARM::tSUBrr : ((isHigh || !CanChangeCC) ? ARM::tADDhirr
- : ARM::tADDrr);
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
- if (Opc != ARM::tADDhirr)
- MIB = AddDefaultT1CC(MIB);
- if (DestReg == ARM::SP || isSub)
- MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
- else
- MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
- AddDefaultPred(MIB);
+static void emitThumbRegPlusImmInReg(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg, unsigned BaseReg, int NumBytes,
+ bool CanChangeCC, const TargetInstrInfo &TII,
+ const ARMBaseRegisterInfo &MRI, unsigned MIFlags = MachineInstr::NoFlags) {
+ MachineFunction &MF = *MBB.getParent();
+ bool isHigh = !isARMLowRegister(DestReg) ||
+ (BaseReg != 0 && !isARMLowRegister(BaseReg));
+ bool isSub = false;
+ // Subtract doesn't have high register version. Load the negative value
+ // if either base or dest register is a high register. Also, if do not
+ // issue sub as part of the sequence if condition register is to be
+ // preserved.
+ if (NumBytes < 0 && !isHigh && CanChangeCC) {
+ isSub = true;
+ NumBytes = -NumBytes;
+ }
+ unsigned LdReg = DestReg;
+ if (DestReg == ARM::SP)
+ assert(BaseReg == ARM::SP && "Unexpected!");
+ if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg))
+ LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
+
+ if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) {
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ .addImm(NumBytes)
+ .setMIFlags(MIFlags);
+ } else if (NumBytes < 0 && NumBytes >= -255 && CanChangeCC) {
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ .addImm(NumBytes)
+ .setMIFlags(MIFlags);
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
+ .addReg(LdReg, RegState::Kill)
+ .setMIFlags(MIFlags);
+ } else
+ MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes, ARMCC::AL, 0,
+ MIFlags);
+
+ // Emit add / sub.
+ int Opc = (isSub) ? ARM::tSUBrr
+ : ((isHigh || !CanChangeCC) ? ARM::tADDhirr : ARM::tADDrr);
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+ if (Opc != ARM::tADDhirr)
+ MIB = AddDefaultT1CC(MIB);
+ if (DestReg == ARM::SP || isSub)
+ MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
+ else
+ MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
+ AddDefaultPred(MIB);
}
/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
@@ -179,10 +177,10 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
/// be too long. This is allowed to modify the condition flags.
void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- DebugLoc dl,
- unsigned DestReg, unsigned BaseReg,
- int NumBytes, const TargetInstrInfo &TII,
- const ARMBaseRegisterInfo& MRI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
+ const TargetInstrInfo &TII,
+ const ARMBaseRegisterInfo &MRI,
unsigned MIFlags) {
bool isSub = NumBytes < 0;
unsigned Bytes = (unsigned)NumBytes;
@@ -281,7 +279,7 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
unsigned RequiredExtraInstrs;
if (ExtraRange)
- RequiredExtraInstrs = RoundUpToAlignment(RangeAfterCopy, ExtraRange) / ExtraRange;
+ RequiredExtraInstrs = alignTo(RangeAfterCopy, ExtraRange) / ExtraRange;
else if (RangeAfterCopy > 0)
// We need an extra instruction but none is available
RequiredExtraInstrs = 1000000;
diff --git a/lib/Target/ARM/ThumbRegisterInfo.h b/lib/Target/ARM/ThumbRegisterInfo.h
index 23aaff37f409..e6b06959e428 100644
--- a/lib/Target/ARM/ThumbRegisterInfo.h
+++ b/lib/Target/ARM/ThumbRegisterInfo.h
@@ -39,8 +39,9 @@ public:
/// specified immediate.
void
emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- DebugLoc dl, unsigned DestReg, unsigned SubIdx, int Val,
- ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0,
+ const DebugLoc &dl, unsigned DestReg, unsigned SubIdx,
+ int Val, ARMCC::CondCodes Pred = ARMCC::AL,
+ unsigned PredReg = 0,
unsigned MIFlags = MachineInstr::NoFlags) const override;
// rewrite MI to access 'Offset' bytes from the FP. Update Offset to be