Diffstat (limited to 'lib/Target/ARM')
 -rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.h        3
 -rw-r--r--  lib/Target/ARM/ARMFastISel.cpp          38
 -rw-r--r--  lib/Target/ARM/ARMFrameLowering.cpp     16
 -rw-r--r--  lib/Target/ARM/ARMHazardRecognizer.cpp  17
 -rw-r--r--  lib/Target/ARM/ARMISelDAGToDAG.cpp      15
 -rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp     102
 -rw-r--r--  lib/Target/ARM/ARMInstrFormats.td        7
 -rw-r--r--  lib/Target/ARM/ARMInstrInfo.td           4
 -rw-r--r--  lib/Target/ARM/ARMInstrNEON.td          31
 -rw-r--r--  lib/Target/ARM/ARMInstrVFP.td          162
 -rw-r--r--  lib/Target/ARM/ARMSubtarget.cpp          4
 -rw-r--r--  lib/Target/ARM/MLxExpansionPass.cpp     20
 -rw-r--r--  lib/Target/ARM/NEONMoveFix.cpp           9
 -rw-r--r--  lib/Target/ARM/Thumb2InstrInfo.cpp       6
 14 files changed, 263 insertions, 171 deletions
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 1fb88726d0de..7e2183d7cd5e 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -155,10 +155,11 @@ namespace ARMII {
     //===------------------------------------------------------------------===//
     // Code domain.
     DomainShift   = 18,
-    DomainMask    = 3 << DomainShift,
+    DomainMask    = 7 << DomainShift,
     DomainGeneral = 0 << DomainShift,
     DomainVFP     = 1 << DomainShift,
     DomainNEON    = 2 << DomainShift,
+    DomainNEONA8  = 4 << DomainShift,
 
     //===------------------------------------------------------------------===//
     // Field shifts - such shifts are used to set field while generating
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 9f295302db0e..26f48b308316 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -172,6 +172,7 @@ class ARMFastISel : public FastISel {
     unsigned ARMMaterializeGV(const GlobalValue *GV, EVT VT);
     unsigned ARMMoveToFPReg(EVT VT, unsigned SrcReg);
     unsigned ARMMoveToIntReg(EVT VT, unsigned SrcReg);
+    unsigned ARMSelectCallOp(const GlobalValue *GV);
 
     // Call handling routines.
   private:
@@ -1633,6 +1634,25 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
   return true;
 }
 
+unsigned ARMFastISel::ARMSelectCallOp(const GlobalValue *GV) {
+
+  // Depend our opcode for thumb on whether or not we're targeting an
+  // externally callable function. For libcalls we'll just pass a NULL GV
+  // in here.
+  bool isExternal = false;
+  if (!GV || GV->hasExternalLinkage()) isExternal = true;
+
+  // Darwin needs the r9 versions of the opcodes.
+  bool isDarwin = Subtarget->isTargetDarwin();
+  if (isThumb && isExternal) {
+    return isDarwin ? ARM::tBLXi_r9 : ARM::tBLXi;
+  } else if (isThumb) {
+    return isDarwin ? ARM::tBLr9 : ARM::tBL;
+  } else {
+    return isDarwin ? ARM::BLr9 : ARM::BL;
+  }
+}
+
 // A quick function that will emit a call for a named libcall in F with the
 // vector of passed arguments for the Instruction in I. We can assume that we
 // can emit a call for any libcall we can produce. This is an abridged version
@@ -1694,20 +1714,17 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
   // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops.
   // TODO: Turn this into the table of arm call ops.
   MachineInstrBuilder MIB;
-  unsigned CallOpc;
-  if(isThumb) {
-    CallOpc = Subtarget->isTargetDarwin() ? ARM::tBLXi_r9 : ARM::tBLXi;
+  unsigned CallOpc = ARMSelectCallOp(NULL);
+  if(isThumb)
     // Explicitly adding the predicate here.
     MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                  TII.get(CallOpc)))
                          .addExternalSymbol(TLI.getLibcallName(Call));
-  } else {
-    CallOpc = Subtarget->isTargetDarwin() ? ARM::BLr9 : ARM::BL;
+  else
     // Explicitly adding the predicate here.
     MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                  TII.get(CallOpc))
                          .addExternalSymbol(TLI.getLibcallName(Call)));
-  }
 
   // Add implicit physical register uses to the call.
   for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
@@ -1813,21 +1830,18 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
   // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops.
   // TODO: Turn this into the table of arm call ops.
   MachineInstrBuilder MIB;
-  unsigned CallOpc;
+  unsigned CallOpc = ARMSelectCallOp(GV);
   // Explicitly adding the predicate here.
-  if(isThumb) {
-    CallOpc = Subtarget->isTargetDarwin() ? ARM::tBLXi_r9 : ARM::tBLXi;
+  if(isThumb)
     // Explicitly adding the predicate here.
     MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                  TII.get(CallOpc)))
                          .addGlobalAddress(GV, 0, 0);
-  } else {
-    CallOpc = Subtarget->isTargetDarwin() ? ARM::BLr9 : ARM::BL;
+  else
     // Explicitly adding the predicate here.
     MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                  TII.get(CallOpc))
                          .addGlobalAddress(GV, 0, 0));
-  }
 
   // Add implicit physical register uses to the call.
   for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index f42c6db84fd3..68c33f098ec9 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -215,7 +215,13 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
   AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
 
   // Move past area 3.
-  if (DPRCSSize > 0) MBBI++;
+  if (DPRCSSize > 0) {
+    MBBI++;
+    // Since vpush register list cannot have gaps, there may be multiple vpush
+    // instructions in the prologue.
+    while (MBBI->getOpcode() == ARM::VSTMDDB_UPD)
+      MBBI++;
+  }
 
   NumBytes = DPRCSOffset;
   if (NumBytes) {
@@ -370,7 +376,13 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
       emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
 
     // Increment past our save areas.
-    if (AFI->getDPRCalleeSavedAreaSize()) MBBI++;
+    if (AFI->getDPRCalleeSavedAreaSize()) {
+      MBBI++;
+      // Since vpop register list cannot have gaps, there may be multiple vpop
+      // instructions in the epilogue.
+      while (MBBI->getOpcode() == ARM::VLDMDIA_UPD)
+        MBBI++;
+    }
     if (AFI->getGPRCalleeSavedArea2Size()) MBBI++;
     if (AFI->getGPRCalleeSavedArea1Size()) MBBI++;
   }
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index 676b01e91c53..e97ce50bc429 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -21,17 +21,14 @@ static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI,
   // FIXME: Detect integer instructions properly.
   const TargetInstrDesc &TID = MI->getDesc();
   unsigned Domain = TID.TSFlags & ARMII::DomainMask;
-  if (Domain == ARMII::DomainVFP) {
-    unsigned Opcode = MI->getOpcode();
-    if (Opcode == ARM::VSTRS || Opcode == ARM::VSTRD ||
-        Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
-      return false;
-  } else if (Domain == ARMII::DomainNEON) {
-    if (MI->getDesc().mayStore() || MI->getDesc().mayLoad())
-      return false;
-  } else
+  if (TID.mayStore())
     return false;
-  return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI);
+  unsigned Opcode = TID.getOpcode();
+  if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+    return false;
+  if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
+    return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI);
+  return false;
 }
 
 ScheduleHazardRecognizer::HazardType
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index a506cffdba34..f0d5a7d7c2e7 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -126,6 +126,7 @@ public:
   bool SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset);
   bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
+  bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset);
 
   bool SelectAddrModePC(SDValue N, SDValue &Offset, SDValue &Label);
 
@@ -886,6 +887,20 @@ bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
   return true;
 }
 
+bool ARMDAGToDAGISel::SelectAddrMode6Offset(SDNode *Op, SDValue N,
+                                            SDValue &Offset) {
+  LSBaseSDNode *LdSt = cast<LSBaseSDNode>(Op);
+  ISD::MemIndexedMode AM = LdSt->getAddressingMode();
+  if (AM != ISD::POST_INC)
+    return false;
+  Offset = N;
+  if (ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N)) {
+    if (NC->getZExtValue() * 8 == LdSt->getMemoryVT().getSizeInBits())
+      Offset = CurDAG->getRegister(0, MVT::i32);
+  }
+  return true;
+}
+
 bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N,
                                        SDValue &Offset, SDValue &Label) {
   if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 1835ec0f0054..ab9f9e1571e3 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -2236,7 +2236,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
     RC = ARM::GPRRegisterClass;
 
   // Transform the arguments stored in physical registers into virtual ones.
-  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC, dl);
+  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
   SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
 
   SDValue ArgValue2;
@@ -2250,7 +2250,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
                             MachinePointerInfo::getFixedStack(FI),
                             false, false, 0);
   } else {
-    Reg = MF.addLiveIn(NextVA.getLocReg(), RC, dl);
+    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
     ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
   }
 
@@ -2331,7 +2331,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
         llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
 
       // Transform the arguments in physical registers into virtual ones.
-      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC, dl);
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
     }
 
@@ -2408,7 +2408,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
         else
          RC = ARM::GPRRegisterClass;
 
-        unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC, dl);
+        unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC);
         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
         SDValue Store =
           DAG.getStore(Val.getValue(1), dl, Val, FIN,
@@ -2838,8 +2838,51 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc dl = Op.getDebugLoc();
   EVT VT = Op.getValueType();
   EVT SrcVT = Tmp1.getValueType();
-  bool F2IisFast = Subtarget->isCortexA9() ||
-    Tmp0.getOpcode() == ISD::BITCAST || Tmp0.getOpcode() == ARMISD::VMOVDRR;
+  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
+    Tmp0.getOpcode() == ARMISD::VMOVDRR;
+  bool UseNEON = !InGPR && Subtarget->hasNEON();
+
+  if (UseNEON) {
+    // Use VBSL to copy the sign bit.
+    unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
+    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
+                               DAG.getTargetConstant(EncodedVal, MVT::i32));
+    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
+    if (VT == MVT::f64)
+      Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
+                         DAG.getConstant(32, MVT::i32));
+    else /*if (VT == MVT::f32)*/
+      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
+    if (SrcVT == MVT::f32) {
+      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
+      if (VT == MVT::f64)
+        Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
+                           DAG.getConstant(32, MVT::i32));
+    }
+    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
+    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
+
+    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
+                                            MVT::i32);
+    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
+    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
+                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
+
+    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
+                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
+                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
+    if (SrcVT == MVT::f32) {
+      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
+      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+                        DAG.getConstant(0, MVT::i32));
+    } else {
+      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
+    }
+
+    return Res;
+  }
 
   // Bitcast operand 1 to i32.
   if (SrcVT == MVT::f64)
@@ -2847,37 +2890,24 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
                        &Tmp1, 1).getValue(1);
   Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
 
-  // If float to int conversion isn't going to be super expensive, then simply
-  // or in the signbit.
-  if (F2IisFast) {
-    SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
-    SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
-    Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
-    if (VT == MVT::f32) {
-      Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
-                         DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
-      return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
-                         DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
-    }
-
-    // f64: Or the high part with signbit and then combine two parts.
-    Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
-                       &Tmp0, 1);
-    SDValue Lo = Tmp0.getValue(0);
-    SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
-    Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
-    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+  // Or in the signbit with integer operations.
+  SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
+  SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
+  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
+  if (VT == MVT::f32) {
+    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
+                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
+                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
   }
 
-  // Remove the signbit of operand 0.
-  Tmp0 = DAG.getNode(ISD::FABS, dl, VT, Tmp0);
-
-  // If operand 1 signbit is one, then negate operand 0.
-  SDValue ARMcc;
-  SDValue Cmp = getARMCmp(Tmp1, DAG.getConstant(0, MVT::i32),
-                          ISD::SETLT, ARMcc, DAG, dl);
-  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
-  return DAG.getNode(ARMISD::CNEG, dl, VT, Tmp0, Tmp0, ARMcc, CCR, Cmp);
+  // f64: Or the high part with signbit and then combine two parts.
+  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                     &Tmp0, 1);
+  SDValue Lo = Tmp0.getValue(0);
+  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
+  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
+  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
 }
 
 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
@@ -2897,7 +2927,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
   }
 
   // Return LR, which contains the return address. Mark it an implicit live-in.
-  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32), dl);
+  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
 
   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
 }
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 765cba42d0bd..359ac45cee1d 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -127,13 +127,14 @@ def IndexModePost : IndexMode<2>;
 def IndexModeUpd  : IndexMode<3>;
 
 // Instruction execution domain.
-class Domain<bits<2> val> {
-  bits<2> Value = val;
+class Domain<bits<3> val> {
+  bits<3> Value = val;
 }
 def GenericDomain : Domain<0>;
 def VFPDomain     : Domain<1>; // Instructions in VFP domain only
 def NeonDomain    : Domain<2>; // Instructions in Neon domain only
 def VFPNeonDomain : Domain<3>; // Instructions in both VFP & Neon domains
+def VFPNeonA8Domain : Domain<5>; // Instructions in VFP & Neon under A8
 
 //===----------------------------------------------------------------------===//
 // ARM special operands.
@@ -249,7 +250,7 @@ class InstTemplate<AddrMode am, SizeFlagVal sz, IndexMode im,
   let TSFlags{15-10} = Form;
   let TSFlags{16}    = isUnaryDataProc;
   let TSFlags{17}    = canXformTo16Bit;
-  let TSFlags{19-18} = D.Value;
+  let TSFlags{20-18} = D.Value;
 
   let Constraints = cstr;
   let Itinerary = itin;
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index c827ce3da97c..6e3fe2e039f5 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -561,7 +561,9 @@ def addrmode6 : Operand<i32>,
   let EncoderMethod = "getAddrMode6AddressOpValue";
 }
 
-def am6offset : Operand<i32> {
+def am6offset : Operand<i32>,
+                ComplexPattern<i32, 1, "SelectAddrMode6Offset",
+                               [], [SDNPWantRoot]> {
   let PrintMethod = "printAddrMode6OffsetOperand";
   let MIOperandInfo = (ops GPR);
   let EncoderMethod = "getAddrMode6OffsetOpValue";
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 1e2e5504e662..dc3d63e26ef5 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -1402,31 +1402,42 @@ def : Pat<(store (extractelt (v2f32 DPR:$src), imm:$lane), addrmode6:$addr),
 def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr),
           (VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
 
-let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
-
 // ...with address register writeback:
-class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+               PatFrag StoreOp, SDNode ExtractOp>
   : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
             (ins addrmode6:$Rn, am6offset:$Rm,
              DPR:$Vd, nohash_imm:$lane), IIC_VST1lnu, "vst1", Dt,
             "\\{$Vd[$lane]\\}, $Rn$Rm",
-            "$Rn.addr = $wb", []>;
+            "$Rn.addr = $wb",
+            [(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane),
+                                    addrmode6:$Rn, am6offset:$Rm))]>;
+class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
+  : VSTQLNWBPseudo<IIC_VST1lnu> {
+  let Pattern = [(set GPR:$wb, (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
+                                        addrmode6:$addr, am6offset:$offset))];
+}
 
-def VST1LNd8_UPD  : VST1LNWB<0b0000, {?,?,?,0}, "8"> {
+def VST1LNd8_UPD  : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8,
+                             NEONvgetlaneu> {
   let Inst{7-5} = lane{2-0};
 }
-def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16"> {
+def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16,
+                             NEONvgetlaneu> {
   let Inst{7-6} = lane{1-0};
   let Inst{4}   = Rn{5};
 }
-def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32"> {
+def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store,
+                             extractelt> {
   let Inst{7}   = lane{0};
   let Inst{5-4} = Rn{5-4};
 }
 
-def VST1LNq8Pseudo_UPD  : VSTQLNWBPseudo<IIC_VST1lnu>;
-def VST1LNq16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST1lnu>;
-def VST1LNq32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST1lnu>;
+def VST1LNq8Pseudo_UPD  : VST1QLNWBPseudo<v16i8, post_truncsti8, NEONvgetlaneu>;
+def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo<v8i16, post_truncsti16,NEONvgetlaneu>;
+def VST1LNq32Pseudo_UPD : VST1QLNWBPseudo<v4i32, post_store, extractelt>;
+
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
 
 // VST2LN   : Vector Store (single 2-element structure from one lane)
 class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 920c5c98002a..29902833f2bb 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -197,9 +197,9 @@ def VADDS  : ASbIn<0b11100, 0b11, 0, 0,
                    (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
                    IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm",
                    [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def VSUBD  : ADbI<0b11100, 0b11, 1, 0,
@@ -211,9 +211,9 @@ def VSUBS  : ASbIn<0b11100, 0b11, 1, 0,
                    (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
                    IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm",
                    [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def VDIVD  : ADbI<0b11101, 0b00, 0, 0,
@@ -235,9 +235,9 @@ def VMULS  : ASbIn<0b11100, 0b10, 0, 0,
                    (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
                    IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm",
                    [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def VNMULD : ADbI<0b11100, 0b10, 1, 0,
@@ -249,9 +249,9 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
                   IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm",
                   [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 // Match reassociated forms only if not sign dependent rounding.
@@ -271,9 +271,9 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
                   (outs), (ins SPR:$Sd, SPR:$Sm),
                   IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm",
                   [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 // FIXME: Verify encoding after integrated assembler is working.
@@ -286,9 +286,9 @@ def VCMPS  : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
                   (outs), (ins SPR:$Sd, SPR:$Sm),
                   IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm",
                   [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 } // Defs = [FPSCR]
 
@@ -305,9 +305,9 @@ def VABSS  : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,
                    (outs SPR:$Sd), (ins SPR:$Sm),
                    IIC_fpUNA32, "vabs", ".f32\t$Sd, $Sm",
                    [(set SPR:$Sd, (fabs SPR:$Sm))]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 let Defs = [FPSCR] in {
@@ -326,9 +326,9 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 // FIXME: Verify encoding after integrated assembler is working.
@@ -347,9 +347,9 @@ def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 } // Defs = [FPSCR]
 
@@ -423,9 +423,9 @@ def VNEGS  : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
                    (outs SPR:$Sd), (ins SPR:$Sm),
                    IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm",
                    [(set SPR:$Sd, (fneg SPR:$Sm))]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
@@ -598,9 +598,9 @@ def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
                                 [(set SPR:$Sd, (arm_sitof SPR:$Sm))]> {
   let Inst{7} = 1; // s32
 
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
@@ -616,9 +616,9 @@ def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
                                 [(set SPR:$Sd, (arm_uitof SPR:$Sm))]> {
   let Inst{7} = 0; // u32
 
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 // FP -> Int:
@@ -671,9 +671,9 @@ def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
                                  [(set SPR:$Sd, (arm_ftosi SPR:$Sm))]> {
   let Inst{7} = 1; // Z bit
 
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
@@ -689,9 +689,9 @@ def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
                                 [(set SPR:$Sd, (arm_ftoui SPR:$Sm))]> {
   let Inst{7} = 1; // Z bit
 
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 // And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
@@ -743,36 +743,36 @@ def VTOSHS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 0,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def VTOUHS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 0,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def VTOSLS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 1,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def VTOULS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 1,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def VTOSHD : AVConv1XI<0b11101, 0b11, 0b1110, 0b1011, 0,
@@ -801,36 +801,36 @@ def VSHTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 0,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def VUHTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 0,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def VSLTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 1,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def VULTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 1,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def VSHTOD : AVConv1XI<0b11101, 0b11, 0b1010, 0b1011, 0,
@@ -874,9 +874,9 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
                                  SPR:$Sdin))]>,
               RegConstraint<"$Sdin = $Sd">,
               Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
@@ -901,9 +901,9 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
                                  SPR:$Sdin))]>,
               RegConstraint<"$Sdin = $Sd">,
              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
@@ -928,9 +928,9 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
                                  SPR:$Sdin))]>,
                 RegConstraint<"$Sdin = $Sd">,
                 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
@@ -954,9 +954,9 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
                 [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm),
                                          SPR:$Sdin))]>,
                 RegConstraint<"$Sdin = $Sd">,
                 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 
 def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
@@ -995,9 +995,9 @@ def VNEGScc : ASuI<0b11101, 0b11, 0b0001, 0b01, 0,
                    IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm",
                    [/*(set SPR:$Sd, (ARMcneg SPR:$Sn, SPR:$Sm, imm:$cc))*/]>,
                    RegConstraint<"$Sn = $Sd"> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 } // neverHasSideEffects
 
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 0bd740cfb28c..1465984899c6 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -171,7 +171,9 @@ ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,
 
   // Materializable GVs (in JIT lazy compilation mode) do not require an extra
   // load from stub.
-  bool isDecl = GV->isDeclaration() && !GV->isMaterializable();
+  bool isDecl = GV->hasAvailableExternallyLinkage();
+  if (GV->isDeclaration() && !GV->isMaterializable())
+    isDecl = true;
 
   if (!isTargetDarwin()) {
     // Extra load is needed for all externally visible.
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index f9e86eb36e04..9a27e2f47064 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -132,22 +132,16 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
 }
 
 bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
-  const TargetInstrDesc &TID = MI->getDesc();
   // FIXME: Detect integer instructions properly.
+  const TargetInstrDesc &TID = MI->getDesc();
   unsigned Domain = TID.TSFlags & ARMII::DomainMask;
-  if (Domain == ARMII::DomainVFP) {
-    unsigned Opcode = TID.getOpcode();
-    if (Opcode == ARM::VSTRS || Opcode == ARM::VSTRD ||
-        Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
-      return false;
-  } else if (Domain == ARMII::DomainNEON) {
-    if (TID.mayStore() || TID.mayLoad())
-      return false;
-  } else {
+  if (TID.mayStore())
     return false;
-  }
-
-  return MI->readsRegister(Reg, TRI);
+  unsigned Opcode = TID.getOpcode();
+  if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+    return false;
+  if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
+    return MI->readsRegister(Reg, TRI);
+  return false;
 }
 
diff --git a/lib/Target/ARM/NEONMoveFix.cpp b/lib/Target/ARM/NEONMoveFix.cpp
index 97e54bfaed9e..965665c2821a 100644
--- a/lib/Target/ARM/NEONMoveFix.cpp
+++ b/lib/Target/ARM/NEONMoveFix.cpp
@@ -35,6 +35,7 @@ namespace {
   private:
     const TargetRegisterInfo *TRI;
    const ARMBaseInstrInfo *TII;
+    bool isA8;
 
     typedef DenseMap<unsigned, const MachineInstr*> RegMap;
 
@@ -43,6 +44,11 @@ namespace {
   char NEONMoveFixPass::ID = 0;
 }
 
+static bool inNEONDomain(unsigned Domain, bool isA8) {
+  return (Domain & ARMII::DomainNEON) ||
+         (isA8 && (Domain & ARMII::DomainNEONA8));
+}
+
 bool NEONMoveFixPass::InsertMoves(MachineBasicBlock &MBB) {
   RegMap Defs;
   bool Modified = false;
@@ -70,7 +76,7 @@ bool NEONMoveFixPass::InsertMoves(MachineBasicBlock &MBB) {
       Domain = ARMII::DomainNEON;
     }
 
-    if (Domain & ARMII::DomainNEON) {
+    if (inNEONDomain(Domain, isA8)) {
       // Convert VMOVD to VMOVDneon
       unsigned DestReg = MI->getOperand(0).getReg();
 
@@ -123,6 +129,7 @@ bool NEONMoveFixPass::runOnMachineFunction(MachineFunction &Fn) {
 
   TRI = TM.getRegisterInfo();
   TII = static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+  isA8 = TM.getSubtarget<ARMSubtarget>().isCortexA8();
 
   bool Modified = false;
   for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 2f67257f8fa1..9b1073be3c8e 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -95,6 +95,12 @@ Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
 bool
 Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MBBI) const {
+  while (MBBI->isDebugValue()) {
+    ++MBBI;
+    if (MBBI == MBB.end())
+      return false;
+  }
+
   unsigned PredReg = 0;
   return llvm::getITInstrPredicate(MBBI, PredReg) == ARMCC::AL;
 }
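
The ARMBaseInstrInfo.h and ARMInstrFormats.td changes widen the per-instruction execution-domain field in TSFlags from two bits to three so the new DomainNEONA8 flag can be combined with DomainVFP (VFPNeonA8Domain is Domain<5>, i.e. VFP | NEONA8). NEONMoveFix then treats such instructions as NEON-domain only when the subtarget is Cortex-A8. A minimal standalone sketch of that encoding and check; the constants mirror the patch, the surrounding driver is invented for illustration:

#include <cstdint>
#include <cstdio>

// Execution-domain field, bits 20:18 of TSFlags after this change.
namespace ARMII {
  enum {
    DomainShift   = 18,
    DomainMask    = 7 << DomainShift,   // was 3 << DomainShift (two bits)
    DomainGeneral = 0 << DomainShift,
    DomainVFP     = 1 << DomainShift,
    DomainNEON    = 2 << DomainShift,
    DomainNEONA8  = 4 << DomainShift    // VFP instruction that A8 may run on NEON
  };
}

// Same predicate NEONMoveFix uses: NEON-domain instructions always qualify,
// VFPNeonA8Domain instructions only on Cortex-A8.
static bool inNEONDomain(unsigned Domain, bool isA8) {
  return (Domain & ARMII::DomainNEON) ||
         (isA8 && (Domain & ARMII::DomainNEONA8));
}

int main() {
  // A VFPNeonA8Domain instruction: Domain<5> = VFP | NEONA8.
  unsigned TSFlags = ARMII::DomainVFP | ARMII::DomainNEONA8;
  unsigned Domain = TSFlags & ARMII::DomainMask;
  std::printf("on A8: %d, elsewhere: %d\n",
              inNEONDomain(Domain, true), inNEONDomain(Domain, false));
  return 0;
}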
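
ARMFastISel now funnels call-opcode selection through ARMSelectCallOp, keyed on Thumb vs. ARM, Darwin (the r9-preserving variants) vs. other targets, and whether the callee is externally visible; libcalls pass a null GlobalValue and so take the external path. The code's own TODO asks for "the table of arm call ops"; one possible shape for such a table is sketched below. Only the opcode names come from the patch, the struct and helper are hypothetical:

// Hypothetical sketch of the call-op table the TODO in ARMFastISel.cpp
// mentions; not an LLVM API.
enum CallOpcode { BL, BLr9, tBL, tBLr9, tBLXi, tBLXi_r9 };

struct CallOpKey {
  bool Thumb;      // isThumb
  bool Darwin;     // Subtarget->isTargetDarwin()
  bool External;   // !GV || GV->hasExternalLinkage()
};

static CallOpcode selectCallOp(CallOpKey K) {
  if (K.Thumb && K.External) return K.Darwin ? tBLXi_r9 : tBLXi;
  if (K.Thumb)               return K.Darwin ? tBLr9    : tBL;
  return K.Darwin ? BLr9 : BL;
}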
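
The ARMFrameLowering change makes the prologue and epilogue walkers step over every consecutive VSTMDDB_UPD / VLDMDIA_UPD instruction instead of exactly one, because a vpush/vpop register list must be a contiguous run of D registers: a callee-saved set with a gap is saved with one instruction per run. A small standalone illustration of that splitting (register numbers are invented; assumes the set is sorted):

#include <cstdio>
#include <vector>
#include <utility>

// Group sorted D-register indices into contiguous runs; each run becomes one
// vpush in the prologue (and one vpop in the epilogue).
static std::vector<std::pair<int, int>> contiguousRuns(const std::vector<int> &Regs) {
  std::vector<std::pair<int, int>> Runs;   // [first, last] of each run
  for (int R : Regs) {
    if (!Runs.empty() && Runs.back().second + 1 == R)
      Runs.back().second = R;
    else
      Runs.push_back({R, R});
  }
  return Runs;
}

int main() {
  // e.g. d8, d9, d10 and d12 are callee-saved but d11 is not: two vpushes.
  for (auto [Lo, Hi] : contiguousRuns({8, 9, 10, 12})) {
    if (Lo == Hi)
      std::printf("vpush {d%d}\n", Lo);
    else
      std::printf("vpush {d%d-d%d}\n", Lo, Hi);
  }
  return 0;
}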
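
SelectAddrMode6Offset, together with the am6offset ComplexPattern and the new VST1 lane-store writeback patterns, lets a post-increment be folded into the store: when the increment is a constant equal to the access size in bytes, the offset operand becomes register 0 and the fixed post-increment form is selected; otherwise the increment stays in a register. A hedged sketch of just that decision; the struct and helper below are hypothetical, not LLVM APIs:

#include <cstdio>

struct PostInc {
  bool IsConstant;      // step is a compile-time constant
  unsigned StepBytes;   // value of that constant, if IsConstant
};

// Mirrors the check NC->getZExtValue() * 8 == MemoryVT.getSizeInBits().
static const char *selectAM6OffsetForm(PostInc Inc, unsigned AccessBits) {
  if (Inc.IsConstant && Inc.StepBytes * 8 == AccessBits)
    return "[rN]!";     // offset register 0: fixed post-increment form
  return "[rN], rM";    // explicit register increment
}

int main() {
  std::printf("%s\n", selectAM6OffsetForm({true, 4}, 32));   // [rN]!
  std::printf("%s\n", selectAM6OffsetForm({true, 8}, 32));   // [rN], rM
  std::printf("%s\n", selectAM6OffsetForm({false, 0}, 32));  // [rN], rM
  return 0;
}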
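
LowerFCOPYSIGN now keeps f32/f64 copysign in the vector unit when NEON is available, building a per-lane 0x80000000 sign mask with a NEON modified immediate and combining the operands with AND/XOR/OR (a bit-select, per the "Use VBSL" comment); without NEON it does the same masking with integer operations instead of the old compare-and-negate (CNEG) fallback. Both paths compute the bit-level identity below, shown as a standalone C++ sketch rather than DAG nodes:

#include <cstdint>
#include <cstring>
#include <cstdio>

// copysign as bits: result = (magnitude of x) | (sign of y).
static float copysignf_bits(float x, float y) {
  uint32_t xb, yb;
  std::memcpy(&xb, &x, sizeof xb);
  std::memcpy(&yb, &y, sizeof yb);
  const uint32_t SignMask = 0x80000000u;            // the per-lane mask
  uint32_t rb = (xb & ~SignMask) | (yb & SignMask);
  float r;
  std::memcpy(&r, &rb, sizeof r);
  return r;
}

int main() {
  std::printf("%f\n", copysignf_bits(3.5f, -0.0f));  // -3.500000
  std::printf("%f\n", copysignf_bits(-2.0f, 1.0f));  //  2.000000
  return 0;
}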