28 files changed, 1722 insertions, 464 deletions
diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h
index 183915335192..c603708652f6 100644
--- a/lib/Target/ARM/ARMAddressingModes.h
+++ b/lib/Target/ARM/ARMAddressingModes.h
@@ -15,7 +15,6 @@
 #define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
 
 #include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include <cassert>
 
@@ -38,7 +37,7 @@ namespace ARM_AM {
 
   static inline const char *getShiftOpcStr(ShiftOpc Op) {
     switch (Op) {
-    default: llvm_unreachable("Unknown shift opc!");
+    default: assert(0 && "Unknown shift opc!");
     case ARM_AM::asr: return "asr";
     case ARM_AM::lsl: return "lsl";
     case ARM_AM::lsr: return "lsr";
@@ -71,7 +70,7 @@ namespace ARM_AM {
 
   static inline const char *getAMSubModeStr(AMSubMode Mode) {
     switch (Mode) {
-    default: llvm_unreachable("Unknown addressing sub-mode!");
+    default: assert(0 && "Unknown addressing sub-mode!");
     case ARM_AM::ia: return "ia";
     case ARM_AM::ib: return "ib";
     case ARM_AM::da: return "da";
@@ -81,7 +80,7 @@ namespace ARM_AM {
 
   static inline const char *getAMSubModeAltStr(AMSubMode Mode, bool isLD) {
     switch (Mode) {
-    default: llvm_unreachable("Unknown addressing sub-mode!");
+    default: assert(0 && "Unknown addressing sub-mode!");
     case ARM_AM::ia: return isLD ? "fd" : "ea";
     case ARM_AM::ib: return isLD ? "ed" : "fa";
     case ARM_AM::da: return isLD ? "fa" : "ed";
@@ -342,6 +341,66 @@ namespace ARM_AM {
     return -1;
   }
 
+  /// Rotate amount for V: 0 when V fits in 8 bits, else (32 - ctz(V)) & 31.
+  static inline unsigned getT2SOImmValRotate(unsigned V) {
+    if (V <= 255U) return 0;
+    const unsigned TrailingZeros = CountTrailingZeros_32(V);
+    return (32 - TrailingZeros) & 31;
+  }
+
+  /// isT2SOImmTwoPartVal - Return true if Imm is neither a single splat nor a
+  /// single shifter value but can be split into two immediates, each being a
+  /// splat or shifter value.
+  static inline bool isT2SOImmTwoPartVal (unsigned Imm) {
+    unsigned V = Imm;
+    // A lone splat should be handled directly, not with a two-part val.
+    if (getT2SOImmValSplatVal(V) != -1)
+      return false;
+    // Mask out the 8-bit field selected by getT2SOImmValRotate. If nothing is
+    // left, a single shifter value covers Imm, so bail out here as well.
+    V = rotr32 (~255U, getT2SOImmValRotate(V)) & V;
+    if (V == 0)
+      return false;
+
+    // If the remainder can be handled as an immediate, accept.
+    if (getT2SOImmVal(V) != -1) return true;
+    // Likewise, try masking out a splat value first.
+    V = Imm;
+    if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1)
+      V &= ~0xff00ff00U;
+    else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1)
+      V &= ~0x00ff00ffU;
+    // If what's left can be handled as an immediate, accept.
+    if (getT2SOImmVal(V) != -1) return true;
+    return false; // Otherwise, do not accept.
+  }
+
+  /// Return the first part of Imm, which must satisfy isT2SOImmTwoPartVal.
+  static inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) {
+    assert (isT2SOImmTwoPartVal(Imm) &&
+            "Immediate cannot be encoded as two part immediate!");
+    // Try a shifter operand as one part: mask its 8-bit field out of Imm.
+    unsigned V = rotr32 (~255U, getT2SOImmValRotate(Imm)) & Imm;
+    // If the rest is encodable as an immediate, then return it.
+    if (getT2SOImmVal(V) != -1) return V;
+    // Otherwise try masking out a splat value first.
+    if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1)
+      return Imm & 0xff00ff00U;
+    // The other splat is all that's left as an option.
+    assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1 &&
+            "No encodable first part found for two part immediate!");
+    return Imm & 0x00ff00ffU;
+  }
+
+  /// getT2SOImmTwoPartSecond - Return the bits of Imm that remain once the
+  /// part chosen by getT2SOImmTwoPartFirst has been removed.
+  static inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) {
+    const unsigned Rest = Imm ^ getT2SOImmTwoPartFirst(Imm);
+    assert (getT2SOImmVal(Rest) != -1 &&
+            "Unable to encode second part of T2 two part SO immediate");
+    return Rest;
+  }
+
 
   //===--------------------------------------------------------------------===//
   // Addressing Mode #2
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 42ef183e5261..00e75310b6fb 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -36,8 +36,18 @@
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/CommandLine.h"
 using namespace llvm;
 
+static cl::opt<bool>
+ScavengeFrameIndexVals("arm-virtual-frame-index-vals", cl::Hidden,
+          cl::init(false),
+          cl::desc("Resolve frame index values via scavenging in PEI"));
+
+static cl::opt<bool>
+ReuseFrameIndexVals("arm-reuse-frame-index-vals", cl::Hidden, cl::init(false),
+          cl::desc("Reuse repeated frame index values"));
+
 unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum,
                                                    bool *isSPVFP) {
   if (isSPVFP)
@@ -740,8 +750,7 @@ unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg,
   case ARM::R1:
     return ARM::R0;
   case ARM::R3:
-    // FIXME!
-    return STI.isThumb1Only() ? 0 : ARM::R2;
+    return ARM::R2;
   case ARM::R5:
     return ARM::R4;
   case ARM::R7:
@@ -830,8 +839,7 @@ unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg,
   case ARM::R0:
     return ARM::R1;
   case ARM::R2:
-    // FIXME!
-    return STI.isThumb1Only() ? 0 : ARM::R3;
+    return ARM::R3;
   case ARM::R4:
     return ARM::R5;
   case ARM::R6:
@@ -937,6 +945,11 @@ requiresRegisterScavenging(const MachineFunction &MF) const {
   return true;
 }
 
+bool ARMBaseRegisterInfo::
+requiresFrameIndexScavenging(const MachineFunction &MF) const {
+  return ScavengeFrameIndexVals; // -arm-virtual-frame-index-vals; MF unused.
+}
+
 // hasReservedCallFrame - Under normal circumstances, when a frame pointer is
 // not required, we reserve argument space for call sites in the function
 // immediately on entry to the current function. This eliminates the need for
@@ -1077,14 +1090,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
           (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4) &&
          "This code isn't needed if offset already handled!");
 
-  // Insert a set of r12 with the full address: r12 = sp + offset
-  // If the offset we have is too large to fit into the instruction, we need
-  // to form it with a series of ADDri's.  Do this by taking 8-bit chunks
-  // out of 'Offset'.
-  unsigned ScratchReg = findScratchRegister(RS, ARM::GPRRegisterClass, AFI);
-  if (ScratchReg == 0)
-    // No register is "free". Scavenge a register.
-    ScratchReg = RS->scavengeRegister(ARM::GPRRegisterClass, II, SPAdj);
+  unsigned ScratchReg = 0;
   int PIdx = MI.findFirstPredOperandIdx();
   ARMCC::CondCodes Pred = (PIdx == -1)
     ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
@@ -1093,6 +1099,19 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     // Must be addrmode4.
     MI.getOperand(i).ChangeToRegister(FrameReg, false, false, false);
   else {
+    if (!ScavengeFrameIndexVals) {
+      // Insert a set of r12 with the full address: r12 = sp + offset
+      // If the offset we have is too large to fit into the instruction, we need
+      // to form it with a series of ADDri's.  Do this by taking 8-bit chunks
+      // out of 'Offset'.
+      ScratchReg = findScratchRegister(RS, ARM::GPRRegisterClass, AFI);
+      if (ScratchReg == 0)
+        // No register is "free". Scavenge a register.
+        ScratchReg = RS->scavengeRegister(ARM::GPRRegisterClass, II, SPAdj);
+    } else {
+      ScratchReg = MF.getRegInfo().createVirtualRegister(ARM::GPRRegisterClass);
+      *Value = Offset;
+    }
     if (!AFI->isThumbFunction())
       emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
                               Offset, Pred, PredReg, TII);
@@ -1102,8 +1121,10 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                              Offset, Pred, PredReg, TII);
     }
     MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true);
+    if (!ReuseFrameIndexVals || !ScavengeFrameIndexVals)
+      ScratchReg = 0;
   }
-  return 0;
+  return ScratchReg;
 }
 
 /// Move iterator pass the next bunch of callee save load / store ops for
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index da703fbc8c19..f7d38e540def 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -122,6 +122,8 @@ public:
 
   virtual bool requiresRegisterScavenging(const MachineFunction &MF) const;
 
+  virtual bool requiresFrameIndexScavenging(const MachineFunction &MF) const;
+
   virtual bool hasReservedCallFrame(MachineFunction &MF) const;
 
   virtual void eliminateCallFramePseudoInstr(MachineFunction &MF,
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index b0bd04bade40..c995ff2d9906 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -28,9 +28,11 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
+#include <algorithm>
 using namespace llvm;
 
 STATISTIC(NumCPEs,       "Number of constpool entries");
@@ -70,6 +72,10 @@ namespace {
     /// to a return, unreachable, or unconditional branch).
     std::vector<MachineBasicBlock*> WaterList;
 
+    /// NewWaterList - The subset of WaterList that was created since the
+    /// previous iteration by inserting unconditional branches.
+    SmallSet<MachineBasicBlock*, 4> NewWaterList;
+
     typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
 
     /// CPUser - One user of a constant pool, keeping the machine instruction
@@ -175,9 +181,7 @@ namespace {
     void AdjustBBOffsetsAfter(MachineBasicBlock *BB, int delta);
     bool DecrementOldEntry(unsigned CPI, MachineInstr* CPEMI);
     int LookForExistingCPEntry(CPUser& U, unsigned UserOffset);
-    bool LookForWater(CPUser&U, unsigned UserOffset,
-                      MachineBasicBlock *&NewMBB);
-    MachineBasicBlock *AcceptWater(water_iterator IP);
+    bool LookForWater(CPUser&U, unsigned UserOffset, water_iterator &WaterIter);
     void CreateNewWater(unsigned CPUserIndex, unsigned UserOffset,
                         MachineBasicBlock *&NewMBB);
     bool HandleConstantPoolUser(MachineFunction &MF, unsigned CPUserIndex);
@@ -297,6 +301,10 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) {
     if (CPChange && ++NoCPIters > 30)
       llvm_unreachable("Constant Island pass failed to converge!");
     DEBUG(dumpBBs());
+    
+    // Clear NewWaterList now.  If we split a block for branches, it should
+    // appear as "new water" for the next iteration of constant pool placement.
+    NewWaterList.clear();
 
     bool BRChange = false;
     for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
@@ -629,7 +637,7 @@ void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
 
 
 /// Split the basic block containing MI into two blocks, which are joined by
-/// an unconditional branch.  Update datastructures and renumber blocks to
+/// an unconditional branch.  Update data structures and renumber blocks to
 /// account for this change and returns the newly created block.
 MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
   MachineBasicBlock *OrigBB = MI->getParent();
@@ -691,6 +699,7 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
     WaterList.insert(next(IP), NewBB);
   else
     WaterList.insert(IP, OrigBB);
+  NewWaterList.insert(OrigBB);
 
   // Figure out how large the first NewMBB is.  (It cannot
   // contain a constpool_entry or tablejump.)
@@ -941,30 +950,16 @@ static inline unsigned getUnconditionalBrDisp(int Opc) {
   return ((1<<23)-1)*4;
 }
 
-/// AcceptWater - Small amount of common code factored out of the following.
-///
-MachineBasicBlock *ARMConstantIslands::AcceptWater(water_iterator IP) {
-  DEBUG(errs() << "found water in range\n");
-  MachineBasicBlock *WaterBB = *IP;
-  // Remove the original WaterList entry; we want subsequent
-  // insertions in this vicinity to go after the one we're
-  // about to insert.  This considerably reduces the number
-  // of times we have to move the same CPE more than once.
-  WaterList.erase(IP);
-  // CPE goes before following block (NewMBB).
-  return next(MachineFunction::iterator(WaterBB));
-}
-
-/// LookForWater - look for an existing entry in the WaterList in which
+/// LookForWater - Look for an existing entry in the WaterList in which
 /// we can place the CPE referenced from U so it's within range of U's MI.
-/// Returns true if found, false if not.  If it returns true, NewMBB
+/// Returns true if found, false if not.  If it returns true, WaterIter
 /// is set to the WaterList entry.  For Thumb, prefer water that will not
 /// introduce padding to water that will.  To ensure that this pass
 /// terminates, the CPE location for a particular CPUser is only allowed to
 /// move to a lower address, so search backward from the end of the list and
 /// prefer the first water that is in range.
 bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
-                                      MachineBasicBlock *&NewMBB) {
+                                      water_iterator &WaterIter) {
   if (WaterList.empty())
     return false;
 
@@ -973,9 +968,17 @@ bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
   for (water_iterator IP = prior(WaterList.end()),
          B = WaterList.begin();; --IP) {
     MachineBasicBlock* WaterBB = *IP;
-    // Check if water is in range and at a lower address than the current one.
-    if (WaterBB->getNumber() < U.HighWaterMark->getNumber() &&
-        WaterIsInRange(UserOffset, WaterBB, U)) {
+    // Check if water is in range and is either at a lower address than the
+    // current "high water mark" or a new water block that was created since
+    // the previous iteration by inserting an unconditional branch.  In the
+    // latter case, we want to allow resetting the high water mark back to
+    // this new water since we haven't seen it before.  Inserting branches
+    // should be relatively uncommon and when it does happen, we want to be
+    // sure to take advantage of it for all the CPEs near that block, so that
+    // we don't insert more branches than necessary.
+    if (WaterIsInRange(UserOffset, WaterBB, U) &&
+        (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
+         NewWaterList.count(WaterBB))) {
       unsigned WBBId = WaterBB->getNumber();
       if (isThumb &&
           (BBOffsets[WBBId] + BBSizes[WBBId])%4 != 0) {
@@ -986,7 +989,7 @@ bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
           IPThatWouldPad = IP;
         }
       } else {
-        NewMBB = AcceptWater(IP);
+        WaterIter = IP;
         return true;
       }
     }
@@ -994,7 +997,7 @@ bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
       break;
   }
   if (FoundWaterThatWouldPad) {
-    NewMBB = AcceptWater(IPThatWouldPad);
+    WaterIter = IPThatWouldPad;
     return true;
   }
   return false;
@@ -1107,7 +1110,6 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF,
   MachineInstr *CPEMI  = U.CPEMI;
   unsigned CPI = CPEMI->getOperand(1).getIndex();
   unsigned Size = CPEMI->getOperand(2).getImm();
-  MachineBasicBlock *NewMBB;
   // Compute this only once, it's expensive.  The 4 or 8 is the value the
   // hardware keeps in the PC.
   unsigned UserOffset = GetOffsetOf(UserMI) + (isThumb ? 4 : 8);
@@ -1123,14 +1125,50 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF,
   unsigned ID = AFI->createConstPoolEntryUId();
 
   // Look for water where we can place this CPE.
-  if (!LookForWater(U, UserOffset, NewMBB)) {
+  MachineBasicBlock *NewIsland = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *NewMBB;
+  water_iterator IP;
+  if (LookForWater(U, UserOffset, IP)) {
+    DEBUG(errs() << "found water in range\n");
+    MachineBasicBlock *WaterBB = *IP;
+
+    // If the original WaterList entry was "new water" on this iteration,
+    // propagate that to the new island.  This is just keeping NewWaterList
+    // updated to match the WaterList, which will be updated below.
+    if (NewWaterList.count(WaterBB)) {
+      NewWaterList.erase(WaterBB);
+      NewWaterList.insert(NewIsland);
+    }
+    // The new CPE goes before the following block (NewMBB).
+    NewMBB = next(MachineFunction::iterator(WaterBB));
+
+  } else {
     // No water found.
     DEBUG(errs() << "No water found\n");
     CreateNewWater(CPUserIndex, UserOffset, NewMBB);
+
+    // SplitBlockBeforeInstr adds to WaterList, which is important when it is
+    // called while handling branches so that the water will be seen on the
+    // next iteration for constant pools, but in this context, we don't want
+    // it.  Check for this so it will be removed from the WaterList.
+    // Also remove any entry from NewWaterList.
+    MachineBasicBlock *WaterBB = prior(MachineFunction::iterator(NewMBB));
+    IP = std::find(WaterList.begin(), WaterList.end(), WaterBB);
+    if (IP != WaterList.end())
+      NewWaterList.erase(WaterBB);
+
+    // We are adding new water.  Update NewWaterList.
+    NewWaterList.insert(NewIsland);
   }
 
+  // Remove the original WaterList entry; we want subsequent insertions in
+  // this vicinity to go after the one we're about to insert.  This
+  // considerably reduces the number of times we have to move the same CPE
+  // more than once and is also important to ensure the algorithm terminates.
+  if (IP != WaterList.end())
+    WaterList.erase(IP);
+
   // Okay, we know we can put an island before NewMBB now, do it!
-  MachineBasicBlock *NewIsland = MF.CreateMachineBasicBlock();
   MF.insert(NewMBB, NewIsland);
 
   // Update internal data structures to account for the newly inserted MBB.
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index c39de0a12b8f..5c1835b46a22 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1287,7 +1287,7 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDValue Op,
       assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
       unsigned Width = 32 - Srl_imm;
       int LSB = Srl_imm - Shl_imm;
-      if ((LSB + Width) > 32)
+      if (LSB < 0)
         return NULL;
       SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
       SDValue Ops[] = { Op.getOperand(0).getOperand(0),
@@ -1427,6 +1427,43 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
       }
     }
     break;
+  case ISD::AND: {
+    // (and (or x, c2), c1) and top 16-bits of c1 and c2 match, lower 16-bits
+    // of c1 are 0xffff, and lower 16-bit of c2 are 0. That is, the top 16-bits
+    // are entirely contributed by c2 and lower 16-bits are entirely contributed
+    // by x. That's equal to (or (and x, 0xffff), (and c1, 0xffff0000)).
+    // Select it to: "movt x, ((c1 & 0xffff0000) >> 16)"
+    EVT VT = Op.getValueType();
+    if (VT != MVT::i32)
+      break;
+    unsigned Opc = (Subtarget->isThumb() && Subtarget->hasThumb2())
+      ? ARM::t2MOVTi16
+      : (Subtarget->hasV6T2Ops() ? ARM::MOVTi16 : 0);
+    if (!Opc)
+      break;
+    SDValue N0 = Op.getOperand(0), N1 = Op.getOperand(1);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    if (!N1C)
+      break;
+    if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) {
+      SDValue N2 = N0.getOperand(1);
+      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+      if (!N2C)
+        break;
+      unsigned N1CVal = N1C->getZExtValue();
+      unsigned N2CVal = N2C->getZExtValue();
+      if ((N1CVal & 0xffff0000U) == (N2CVal & 0xffff0000U) &&
+          (N1CVal & 0xffffU) == 0xffffU &&
+          (N2CVal & 0xffffU) == 0x0U) {
+        SDValue Imm16 = CurDAG->getTargetConstant((N2CVal & 0xFFFF0000U) >> 16,
+                                                  MVT::i32);
+        SDValue Ops[] = { N0.getOperand(0), Imm16,
+                          getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+        return CurDAG->getMachineNode(Opc, dl, VT, Ops, 4);
+      }
+    }
+    break;
+  }
   case ARMISD::FMRRD:
     return CurDAG->getMachineNode(ARM::FMRRD, dl, MVT::i32, MVT::i32,
                                   Op.getOperand(0), getAL(CurDAG),
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 426cecb28eb7..6a264fdfc44f 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -25,9 +25,10 @@
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
 #include "llvm/Function.h"
+#include "llvm/GlobalValue.h"
 #include "llvm/Instruction.h"
 #include "llvm/Intrinsics.h"
-#include "llvm/GlobalValue.h"
+#include "llvm/Type.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -2360,8 +2361,11 @@ static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT,
   assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
          "Only possible block sizes for VREV are: 16, 32, 64");
 
-  unsigned NumElts = VT.getVectorNumElements();
   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
   unsigned BlockElts = M[0] + 1;
 
   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
@@ -2378,6 +2382,10 @@ static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT,
 
 static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT,
                        unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
   unsigned NumElts = VT.getVectorNumElements();
   WhichResult = (M[0] == 0 ? 0 : 1);
   for (unsigned i = 0; i < NumElts; i += 2) {
@@ -2390,6 +2398,10 @@ static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT,
 
 static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT,
                        unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
   unsigned NumElts = VT.getVectorNumElements();
   WhichResult = (M[0] == 0 ? 0 : 1);
   for (unsigned i = 0; i != NumElts; ++i) {
@@ -2398,7 +2410,7 @@ static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT,
   }
 
   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
-  if (VT.is64BitVector() && VT.getVectorElementType().getSizeInBits() == 32)
+  if (VT.is64BitVector() && EltSz == 32)
     return false;
 
   return true;
@@ -2406,6 +2418,10 @@ static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT,
 
 static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT,
                        unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
   unsigned NumElts = VT.getVectorNumElements();
   WhichResult = (M[0] == 0 ? 0 : 1);
   unsigned Idx = WhichResult * NumElts / 2;
@@ -2417,7 +2433,7 @@ static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT,
   }
 
   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
-  if (VT.is64BitVector() && VT.getVectorElementType().getSizeInBits() == 32)
+  if (VT.is64BitVector() && EltSz == 32)
     return false;
 
   return true;
@@ -2695,18 +2711,10 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
   SDValue Vec = Op.getOperand(0);
   SDValue Lane = Op.getOperand(1);
-
-  // FIXME: This is invalid for 8 and 16-bit elements - the information about
-  // sign / zero extension is lost!
-  Op = DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
-  Op = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Op, DAG.getValueType(VT));
-
-  if (VT.bitsLT(MVT::i32))
-    Op = DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
-  else if (VT.bitsGT(MVT::i32))
-    Op = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Op);
-
-  return Op;
+  assert(VT == MVT::i32 &&
+         Vec.getValueType().getVectorElementType().getSizeInBits() < 32 &&
+         "unexpected type for custom-lowering vector extract");
+  return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
 }
 
 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
@@ -3029,7 +3037,6 @@ static SDValue PerformSUBCombine(SDNode *N,
   return SDValue();
 }
 
-
 /// PerformFMRRDCombine - Target-specific dag combine xforms for ARMISD::FMRRD.
 static SDValue PerformFMRRDCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 3d19f2345d30..8225fd741bb8 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -1220,6 +1220,10 @@ class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
             string asm, string cstr, list<dag> pattern>
   : NeonI<oops, iops, AddrMode6, IndexModeNone, itin, asm, cstr, pattern> {
   let Inst{31-24} = 0b11110100;
+  let Inst{23} = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8} = op11_8;
+  let Inst{7-4} = op7_4;
 }
 
 class NDataI<dag oops, dag iops, InstrItinClass itin,
@@ -1258,15 +1262,26 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
   let Inst{4} = op4;
 }
 
+// NEON Vector Duplicate (scalar).  Fixes Inst{24-23}, {21-20}, {11-7}, {6}
+// and {4} from the template arguments; Inst{19-16} is left to subclasses.
+class N2VDup<bits<2> op24_23, bits<2> op21_20, bits<5> op11_7, bit op6, bit op4,
+             dag oops, dag iops, InstrItinClass itin,
+             string asm, string cstr, list<dag> pattern>
+  : NDataI<oops, iops, itin, asm, cstr, pattern> {
+  let Inst{24-23} = op24_23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-7} = op11_7;
+  let Inst{6} = op6;
+  let Inst{4} = op4;
+}
+
 // NEON 2 vector register with immediate.
-class N2VImm<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-             bit op6, bit op4,
+class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
              dag oops, dag iops, InstrItinClass itin,
              string asm, string cstr, list<dag> pattern>
   : NDataI<oops, iops, itin, asm, cstr, pattern> {
   let Inst{24} = op24;
   let Inst{23} = op23;
-  let Inst{21-16} = op21_16;
   let Inst{11-8} = op11_8;
   let Inst{7} = op7;
   let Inst{6} = op6;
@@ -1286,6 +1301,20 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
   let Inst{4} = op4;
 }
 
+// NEON 3 vector register with immediate.  This is only used for VEXT, whose
+// Inst{11-8} holds the starting byte index of the extracted result within the
+// concatenated operands; this class leaves those bits unspecified.
+class N3VImm<bit op24, bit op23, bits<2> op21_20, bit op6, bit op4,
+             dag oops, dag iops, InstrItinClass itin,
+             string asm, string cstr, list<dag> pattern>
+  : NDataI<oops, iops, itin, asm, cstr, pattern> {
+  let Inst{24} = op24;
+  let Inst{23} = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{6} = op6;
+  let Inst{4} = op4;
+}
+
 // NEON VMOVs between scalar and core registers.
 class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
                dag oops, dag iops, Format f, InstrItinClass itin,
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 4c92891c82bd..dd4123bfa0a3 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -22,7 +22,6 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Support/CommandLine.h"
 using namespace llvm;
 
 ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 6ec78bc72ee7..384b98cf540c 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -980,6 +980,9 @@ def MOVTi16 : AI1<0b1010, (outs GPR:$dst), (ins GPR:$src, i32imm:$imm),
   let Inst{25} = 1;
 }
 
+def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>,
+      Requires<[IsARM, HasV6T2]>;
+
 let Uses = [CPSR] in
 def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, IIC_iMOVsi,
                  "mov", " $dst, $src, rrx",
@@ -1580,10 +1583,17 @@ def : ARMPat<(or GPR:$LHS, so_imm2part:$RHS),
 def : ARMPat<(xor GPR:$LHS, so_imm2part:$RHS),
              (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
                     (so_imm2part_2 imm:$RHS))>;
+def : ARMPat<(add GPR:$LHS, so_imm2part:$RHS),
+             (ADDri (ADDri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+                    (so_imm2part_2 imm:$RHS))>;
+def : ARMPat<(sub GPR:$LHS, so_imm2part:$RHS),
+             (SUBri (SUBri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+                    (so_imm2part_2 imm:$RHS))>;
 
 // 32-bit immediate using movw + movt.
-// This is a single pseudo instruction to make it re-materializable. Remove
-// when we can do generalized remat.
+// This is a single pseudo instruction, the benefit is that it can be remat'd
+// as a single unit instead of having to handle reg inputs.
+// FIXME: Remove this when we can do generalized remat.
 let isReMaterializable = 1 in
 def MOVi32imm : AI1x2<(outs GPR:$dst), (ins i32imm:$src), Pseudo, IIC_iMOVi,
                      "movw", " $dst, ${src:lo16}\n\tmovt${p} $dst, ${src:hi16}",
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index b34aff7dcb3e..822950c52836 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -456,17 +456,17 @@ class VST2LN<bits<4> op11_8, string OpcodeStr>
           !strconcat(OpcodeStr, "\t\\{$src1[$lane],$src2[$lane]\\}, $addr"),
           "", []>;
 
-def VST2LNd8  : VST2LN<0b0000, "vst2.8">;
-def VST2LNd16 : VST2LN<0b0100, "vst2.16">;
-def VST2LNd32 : VST2LN<0b1000, "vst2.32">;
+def VST2LNd8  : VST2LN<0b0001, "vst2.8">;
+def VST2LNd16 : VST2LN<0b0101, "vst2.16">;
+def VST2LNd32 : VST2LN<0b1001, "vst2.32">;
 
 // vst2 to double-spaced even registers.
-def VST2LNq16a: VST2LN<0b0100, "vst2.16">;
-def VST2LNq32a: VST2LN<0b1000, "vst2.32">;
+def VST2LNq16a: VST2LN<0b0101, "vst2.16">;
+def VST2LNq32a: VST2LN<0b1001, "vst2.32">;
 
 // vst2 to double-spaced odd registers.
-def VST2LNq16b: VST2LN<0b0100, "vst2.16">;
-def VST2LNq32b: VST2LN<0b1000, "vst2.32">;
+def VST2LNq16b: VST2LN<0b0101, "vst2.16">;
+def VST2LNq32b: VST2LN<0b1001, "vst2.32">;
 
 //   VST3LN   : Vector Store (single 3-element structure from one lane)
 class VST3LN<bits<4> op11_8, string OpcodeStr>
@@ -623,12 +623,12 @@ class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
         (ins QPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "",
         [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src))))]>;
 
-// Long 2-register intrinsics.  (This is currently only used for VMOVL and is
-// derived from N2VImm instead of N2V because of the way the size is encoded.)
-class N2VLInt<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-              bit op6, bit op4, InstrItinClass itin, string OpcodeStr,
+// Long 2-register intrinsics (currently only used for VMOVL).
+class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+              InstrItinClass itin, string OpcodeStr,
               ValueType TyQ, ValueType TyD, Intrinsic IntOp>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4, (outs QPR:$dst),
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$dst),
         (ins DPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "",
         [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src))))]>;
 
@@ -1016,36 +1016,33 @@ class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
 
 // Shift by immediate,
 // both double- and quad-register.
-class N2VDSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-             bit op4, InstrItinClass itin, string OpcodeStr,
-             ValueType Ty, SDNode OpNode>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
+class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+             InstrItinClass itin, string OpcodeStr, ValueType Ty, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4,
            (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), itin,
            !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
            [(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>;
-class N2VQSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-             bit op4, InstrItinClass itin, string OpcodeStr,
-             ValueType Ty, SDNode OpNode>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
+class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+             InstrItinClass itin, string OpcodeStr, ValueType Ty, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4,
            (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin,
            !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
            [(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>;
 
 // Long shift by immediate.
-class N2VLSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-             bit op6, bit op4, string OpcodeStr, ValueType ResTy,
-             ValueType OpTy, SDNode OpNode>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4,
+class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, op6, op4,
            (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VSHLiD,
            !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
            [(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src),
                                           (i32 imm:$SIMM))))]>;
 
 // Narrow shift by immediate.
-class N2VNSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-             bit op6, bit op4, InstrItinClass itin, string OpcodeStr,
+class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             InstrItinClass itin, string OpcodeStr,
              ValueType ResTy, ValueType OpTy, SDNode OpNode>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4,
+  : N2VImm<op24, op23, op11_8, op7, op6, op4,
            (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin,
            !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
            [(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src),
@@ -1053,53 +1050,49 @@ class N2VNSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
 
 // Shift right by immediate and accumulate,
 // both double- and quad-register.
-class N2VDShAdd<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-                bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
-           (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM),
-           IIC_VPALiD, 
+class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                string OpcodeStr, ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst),
+           (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), IIC_VPALiD, 
            !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
            [(set DPR:$dst, (Ty (add DPR:$src1,
                                 (Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>;
-class N2VQShAdd<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-                bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
-           (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM),
-           IIC_VPALiD, 
+class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                string OpcodeStr, ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst),
+           (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), IIC_VPALiD, 
            !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
            [(set QPR:$dst, (Ty (add QPR:$src1,
                                 (Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>;
 
 // Shift by immediate and insert,
 // both double- and quad-register.
-class N2VDShIns<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-                bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
-           (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM),
-           IIC_VSHLiD, 
+class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                string OpcodeStr, ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst),
+           (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), IIC_VSHLiD, 
            !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
            [(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>;
-class N2VQShIns<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-                bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
-           (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM),
-           IIC_VSHLiQ, 
+class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                string OpcodeStr, ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst),
+           (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), IIC_VSHLiQ, 
            !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
            [(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>;
 
 // Convert, with fractional bits immediate,
 // both double- and quad-register.
-class N2VCvtD<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-              bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy,
+class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+              string OpcodeStr, ValueType ResTy, ValueType OpTy,
               Intrinsic IntOp>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
+  : N2VImm<op24, op23, op11_8, op7, 0, op4,
            (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VUNAD, 
            !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
            [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>;
-class N2VCvtQ<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-              bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy,
+class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+              string OpcodeStr, ValueType ResTy, ValueType OpTy,
               Intrinsic IntOp>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
+  : N2VImm<op24, op23, op11_8, op7, 1, op4,
            (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), IIC_VUNAQ, 
            !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
            [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>;
@@ -1175,14 +1168,14 @@ multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
 
 // Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL).
 //   source operand element sizes of 16, 32 and 64 bits:
-multiclass N2VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
-                       bit op4, string OpcodeStr, Intrinsic IntOp> {
-  def v8i16 : N2VLInt<op24, op23, 0b001000, op11_8, op7, op6, op4,
-                      IIC_VQUNAiD, !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
-  def v4i32 : N2VLInt<op24, op23, 0b010000, op11_8, op7, op6, op4,
-                      IIC_VQUNAiD, !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
-  def v2i64 : N2VLInt<op24, op23, 0b100000, op11_8, op7, op6, op4,
-                      IIC_VQUNAiD, !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
+multiclass N2VLInt_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4,
+                       string OpcodeStr, Intrinsic IntOp> {
+  def v8i16 : N2VLInt<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                      !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
+  def v4i32 : N2VLInt<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                      !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
+  def v2i64 : N2VLInt<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                      !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
 }
 
 
@@ -1381,7 +1374,7 @@ multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
 multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
                         string OpcodeStr, Intrinsic IntOp>
   : N3VLInt3_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp> {
-  def v8i16 : N3VLInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D,
+  def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16D,
                        !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
 }
 
@@ -1461,24 +1454,38 @@ multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
 multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
                       InstrItinClass itin, string OpcodeStr, SDNode OpNode> {
   // 64-bit vector types.
-  def v8i8  : N2VDSh<op24, op23, 0b001000, op11_8, 0, op4, itin,
-                     !strconcat(OpcodeStr, "8"), v8i8, OpNode>;
-  def v4i16 : N2VDSh<op24, op23, 0b010000, op11_8, 0, op4, itin,
-                     !strconcat(OpcodeStr, "16"), v4i16, OpNode>;
-  def v2i32 : N2VDSh<op24, op23, 0b100000, op11_8, 0, op4, itin,
-                     !strconcat(OpcodeStr, "32"), v2i32, OpNode>;
-  def v1i64 : N2VDSh<op24, op23, 0b000000, op11_8, 1, op4, itin,
+  def v8i8  : N2VDSh<op24, op23, op11_8, 0, op4, itin,
+                     !strconcat(OpcodeStr, "8"), v8i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, itin,
+                     !strconcat(OpcodeStr, "16"), v4i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, itin,
+                     !strconcat(OpcodeStr, "32"), v2i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, itin,
                      !strconcat(OpcodeStr, "64"), v1i64, OpNode>;
+                             // imm6 = xxxxxx
 
   // 128-bit vector types.
-  def v16i8 : N2VQSh<op24, op23, 0b001000, op11_8, 0, op4, itin,
-                     !strconcat(OpcodeStr, "8"), v16i8, OpNode>;
-  def v8i16 : N2VQSh<op24, op23, 0b010000, op11_8, 0, op4, itin,
-                     !strconcat(OpcodeStr, "16"), v8i16, OpNode>;
-  def v4i32 : N2VQSh<op24, op23, 0b100000, op11_8, 0, op4, itin,
-                     !strconcat(OpcodeStr, "32"), v4i32, OpNode>;
-  def v2i64 : N2VQSh<op24, op23, 0b000000, op11_8, 1, op4, itin,
+  def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, itin,
+                     !strconcat(OpcodeStr, "8"), v16i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, itin,
+                     !strconcat(OpcodeStr, "16"), v8i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, itin,
+                     !strconcat(OpcodeStr, "32"), v4i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, itin,
                      !strconcat(OpcodeStr, "64"), v2i64, OpNode>;
+                             // imm6 = xxxxxx
 }
 
 
@@ -1487,24 +1494,38 @@ multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
 multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
                          string OpcodeStr, SDNode ShOp> {
   // 64-bit vector types.
-  def v8i8  : N2VDShAdd<op24, op23, 0b001000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "8"), v8i8, ShOp>;
-  def v4i16 : N2VDShAdd<op24, op23, 0b010000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "16"), v4i16, ShOp>;
-  def v2i32 : N2VDShAdd<op24, op23, 0b100000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "32"), v2i32, ShOp>;
-  def v1i64 : N2VDShAdd<op24, op23, 0b000000, op11_8, 1, op4,
+  def v8i8  : N2VDShAdd<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "8"), v8i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShAdd<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "16"), v4i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShAdd<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "32"), v2i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShAdd<op24, op23, op11_8, 1, op4,
                         !strconcat(OpcodeStr, "64"), v1i64, ShOp>;
+                             // imm6 = xxxxxx
 
   // 128-bit vector types.
-  def v16i8 : N2VQShAdd<op24, op23, 0b001000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "8"), v16i8, ShOp>;
-  def v8i16 : N2VQShAdd<op24, op23, 0b010000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "16"), v8i16, ShOp>;
-  def v4i32 : N2VQShAdd<op24, op23, 0b100000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "32"), v4i32, ShOp>;
-  def v2i64 : N2VQShAdd<op24, op23, 0b000000, op11_8, 1, op4,
+  def v16i8 : N2VQShAdd<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "8"), v16i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShAdd<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "16"), v8i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShAdd<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "32"), v4i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShAdd<op24, op23, op11_8, 1, op4,
                         !strconcat(OpcodeStr, "64"), v2i64, ShOp>;
+                             // imm6 = xxxxxx
 }
 
 
@@ -1513,24 +1534,75 @@ multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
 multiclass N2VShIns_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
                          string OpcodeStr, SDNode ShOp> {
   // 64-bit vector types.
-  def v8i8  : N2VDShIns<op24, op23, 0b001000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "8"), v8i8, ShOp>;
-  def v4i16 : N2VDShIns<op24, op23, 0b010000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "16"), v4i16, ShOp>;
-  def v2i32 : N2VDShIns<op24, op23, 0b100000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "32"), v2i32, ShOp>;
-  def v1i64 : N2VDShIns<op24, op23, 0b000000, op11_8, 1, op4,
+  def v8i8  : N2VDShIns<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "8"), v8i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "16"), v4i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "32"), v2i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4,
                         !strconcat(OpcodeStr, "64"), v1i64, ShOp>;
+                             // imm6 = xxxxxx
 
   // 128-bit vector types.
-  def v16i8 : N2VQShIns<op24, op23, 0b001000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "8"), v16i8, ShOp>;
-  def v8i16 : N2VQShIns<op24, op23, 0b010000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "16"), v8i16, ShOp>;
-  def v4i32 : N2VQShIns<op24, op23, 0b100000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "32"), v4i32, ShOp>;
-  def v2i64 : N2VQShIns<op24, op23, 0b000000, op11_8, 1, op4,
+  def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "8"), v16i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "16"), v8i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "32"), v4i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4,
                         !strconcat(OpcodeStr, "64"), v2i64, ShOp>;
+                             // imm6 = xxxxxx
+}
+
+// Neon Shift Long operations (Q-register result from a D-register source),
+//   source element sizes of 8, 16, 32 bits:
+multiclass N2VLSh_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+                      bit op4, string OpcodeStr, SDNode OpNode> {
+  def v8i16 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+                     !strconcat(OpcodeStr, "8"), v8i16, v8i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i32 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+                     !strconcat(OpcodeStr, "16"), v4i32, v4i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i64 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+                     !strconcat(OpcodeStr, "32"), v2i64, v2i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+}
+
+// Neon Shift Narrow operations (D-register result from a Q-register source);
+//   source element sizes of 16, 32, 64 bits (defs named by result type):
+multiclass N2VNSh_HSD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+                      bit op4, InstrItinClass itin, string OpcodeStr,
+                      SDNode OpNode> {
+  def v8i8 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                    !strconcat(OpcodeStr, "16"), v8i8, v8i16, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                     !strconcat(OpcodeStr, "32"), v4i16, v4i32, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                     !strconcat(OpcodeStr, "64"), v2i32, v2i64, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -1903,8 +1975,8 @@ defm VABDLs   : N3VLInt_QHS<0,1,0b0111,0, IIC_VBINi4Q, "vabdl.s", int_arm_neon_v
 defm VABDLu   : N3VLInt_QHS<1,1,0b0111,0, IIC_VBINi4Q, "vabdl.u", int_arm_neon_vabdlu, 0>;
 
 //   VABA     : Vector Absolute Difference and Accumulate
-defm VABAs    : N3VInt3_QHS<0,1,0b0101,0, "vaba.s", int_arm_neon_vabas>;
-defm VABAu    : N3VInt3_QHS<1,1,0b0101,0, "vaba.u", int_arm_neon_vabau>;
+defm VABAs    : N3VInt3_QHS<0,0,0b0111,1, "vaba.s", int_arm_neon_vabas>;
+defm VABAu    : N3VInt3_QHS<1,0,0b0111,1, "vaba.u", int_arm_neon_vabau>;
 
 //   VABAL    : Vector Absolute Difference and Accumulate Long (Q += | D - D |)
 defm VABALs   : N3VLInt3_QHS<0,1,0b0101,0, "vabal.s", int_arm_neon_vabals>;
@@ -2044,34 +2116,25 @@ defm VSHRs    : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr.s", NEONvshrs>;
 defm VSHRu    : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr.u", NEONvshru>;
 
 //   VSHLL    : Vector Shift Left Long
-def  VSHLLs8  : N2VLSh<0, 1, 0b001000, 0b1010, 0, 0, 1, "vshll.s8",
-                       v8i16, v8i8, NEONvshlls>;
-def  VSHLLs16 : N2VLSh<0, 1, 0b010000, 0b1010, 0, 0, 1, "vshll.s16",
-                       v4i32, v4i16, NEONvshlls>;
-def  VSHLLs32 : N2VLSh<0, 1, 0b100000, 0b1010, 0, 0, 1, "vshll.s32",
-                       v2i64, v2i32, NEONvshlls>;
-def  VSHLLu8  : N2VLSh<1, 1, 0b001000, 0b1010, 0, 0, 1, "vshll.u8",
-                       v8i16, v8i8, NEONvshllu>;
-def  VSHLLu16 : N2VLSh<1, 1, 0b010000, 0b1010, 0, 0, 1, "vshll.u16",
-                       v4i32, v4i16, NEONvshllu>;
-def  VSHLLu32 : N2VLSh<1, 1, 0b100000, 0b1010, 0, 0, 1, "vshll.u32",
-                       v2i64, v2i32, NEONvshllu>;
+defm VSHLLs   : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll.s", NEONvshlls>;
+defm VSHLLu   : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll.u", NEONvshllu>;
 
 //   VSHLL    : Vector Shift Left Long (with maximum shift count)
-def  VSHLLi8  : N2VLSh<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll.i8",
-                       v8i16, v8i8, NEONvshlli>;
-def  VSHLLi16 : N2VLSh<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll.i16",
-                       v4i32, v4i16, NEONvshlli>;
-def  VSHLLi32 : N2VLSh<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll.i32",
-                       v2i64, v2i32, NEONvshlli>;
+class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+                bit op6, bit op4, string OpcodeStr, ValueType ResTy,
+                ValueType OpTy, SDNode OpNode>
+  : N2VLSh<op24, op23, op11_8, op7, op6, op4, OpcodeStr, ResTy, OpTy, OpNode> {
+  let Inst{21-16} = op21_16; // whole 6-bit field fixed, unlike N2VLSh_QHS
+}
+def  VSHLLi8  : N2VLShMax<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll.i8",
+                          v8i16, v8i8, NEONvshlli>;
+def  VSHLLi16 : N2VLShMax<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll.i16",
+                          v4i32, v4i16, NEONvshlli>;
+def  VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll.i32",
+                          v2i64, v2i32, NEONvshlli>;
 
 //   VSHRN    : Vector Shift Right and Narrow
-def  VSHRN16  : N2VNSh<0, 1, 0b001000, 0b1000, 0, 0, 1, 
-                       IIC_VSHLiD, "vshrn.i16", v8i8, v8i16, NEONvshrn>;
-def  VSHRN32  : N2VNSh<0, 1, 0b010000, 0b1000, 0, 0, 1,
-                       IIC_VSHLiD, "vshrn.i32", v4i16, v4i32, NEONvshrn>;
-def  VSHRN64  : N2VNSh<0, 1, 0b100000, 0b1000, 0, 0, 1,
-                       IIC_VSHLiD, "vshrn.i64", v2i32, v2i64, NEONvshrn>;
+defm VSHRN    : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn.i", NEONvshrn>;
 
 //   VRSHL    : Vector Rounding Shift
 defm VRSHLs   : N3VInt_QHSD<0,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
@@ -2083,12 +2146,8 @@ defm VRSHRs   : N2VSh_QHSD<0, 1, 0b0010, 1, IIC_VSHLi4D, "vrshr.s", NEONvrshrs>;
 defm VRSHRu   : N2VSh_QHSD<1, 1, 0b0010, 1, IIC_VSHLi4D, "vrshr.u", NEONvrshru>;
 
 //   VRSHRN   : Vector Rounding Shift Right and Narrow
-def  VRSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 1, 1,
-                       IIC_VSHLi4D, "vrshrn.i16", v8i8, v8i16, NEONvrshrn>;
-def  VRSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 1, 1, 
-                       IIC_VSHLi4D, "vrshrn.i32", v4i16, v4i32, NEONvrshrn>;
-def  VRSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 1, 1,
-                       IIC_VSHLi4D, "vrshrn.i64", v2i32, v2i64, NEONvrshrn>;
+defm VRSHRN   : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn.i",
+                           NEONvrshrn>;
 
 //   VQSHL    : Vector Saturating Shift
 defm VQSHLs   : N3VInt_QHSD<0,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
@@ -2102,26 +2161,14 @@ defm VQSHLui  : N2VSh_QHSD<1, 1, 0b0111, 1, IIC_VSHLi4D, "vqshl.u", NEONvqshlu>;
 defm VQSHLsu  : N2VSh_QHSD<1, 1, 0b0110, 1, IIC_VSHLi4D, "vqshlu.s", NEONvqshlsu>;
 
 //   VQSHRN   : Vector Saturating Shift Right and Narrow
-def VQSHRNs16 : N2VNSh<0, 1, 0b001000, 0b1001, 0, 0, 1, 
-                       IIC_VSHLi4D, "vqshrn.s16", v8i8, v8i16, NEONvqshrns>;
-def VQSHRNs32 : N2VNSh<0, 1, 0b010000, 0b1001, 0, 0, 1,
-                       IIC_VSHLi4D, "vqshrn.s32", v4i16, v4i32, NEONvqshrns>;
-def VQSHRNs64 : N2VNSh<0, 1, 0b100000, 0b1001, 0, 0, 1, 
-                       IIC_VSHLi4D, "vqshrn.s64", v2i32, v2i64, NEONvqshrns>;
-def VQSHRNu16 : N2VNSh<1, 1, 0b001000, 0b1001, 0, 0, 1,
-                       IIC_VSHLi4D, "vqshrn.u16", v8i8, v8i16, NEONvqshrnu>;
-def VQSHRNu32 : N2VNSh<1, 1, 0b010000, 0b1001, 0, 0, 1,
-                       IIC_VSHLi4D, "vqshrn.u32", v4i16, v4i32, NEONvqshrnu>;
-def VQSHRNu64 : N2VNSh<1, 1, 0b100000, 0b1001, 0, 0, 1,
-                       IIC_VSHLi4D, "vqshrn.u64", v2i32, v2i64, NEONvqshrnu>;
+defm VQSHRNs  : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn.s",
+                           NEONvqshrns>;
+defm VQSHRNu  : N2VNSh_HSD<1, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn.u",
+                           NEONvqshrnu>;
 
 //   VQSHRUN  : Vector Saturating Shift Right and Narrow (Unsigned)
-def VQSHRUN16 : N2VNSh<1, 1, 0b001000, 0b1000, 0, 0, 1,
-                       IIC_VSHLi4D, "vqshrun.s16", v8i8, v8i16, NEONvqshrnsu>;
-def VQSHRUN32 : N2VNSh<1, 1, 0b010000, 0b1000, 0, 0, 1,
-                       IIC_VSHLi4D, "vqshrun.s32", v4i16, v4i32, NEONvqshrnsu>;
-def VQSHRUN64 : N2VNSh<1, 1, 0b100000, 0b1000, 0, 0, 1,
-                       IIC_VSHLi4D, "vqshrun.s64", v2i32, v2i64, NEONvqshrnsu>;
+defm VQSHRUN  : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun.s",
+                           NEONvqshrnsu>;
 
 //   VQRSHL   : Vector Saturating Rounding Shift
 defm VQRSHLs  : N3VInt_QHSD<0, 0, 0b0101, 1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
@@ -2130,26 +2177,14 @@ defm VQRSHLu  : N3VInt_QHSD<1, 0, 0b0101, 1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi
                             IIC_VSHLi4Q, "vqrshl.u", int_arm_neon_vqrshiftu, 0>;
 
 //   VQRSHRN  : Vector Saturating Rounding Shift Right and Narrow
-def VQRSHRNs16: N2VNSh<0, 1, 0b001000, 0b1001, 0, 1, 1,
-                       IIC_VSHLi4D, "vqrshrn.s16", v8i8, v8i16, NEONvqrshrns>;
-def VQRSHRNs32: N2VNSh<0, 1, 0b010000, 0b1001, 0, 1, 1,
-                       IIC_VSHLi4D, "vqrshrn.s32", v4i16, v4i32, NEONvqrshrns>;
-def VQRSHRNs64: N2VNSh<0, 1, 0b100000, 0b1001, 0, 1, 1,
-                       IIC_VSHLi4D, "vqrshrn.s64", v2i32, v2i64, NEONvqrshrns>;
-def VQRSHRNu16: N2VNSh<1, 1, 0b001000, 0b1001, 0, 1, 1,
-                       IIC_VSHLi4D, "vqrshrn.u16", v8i8, v8i16, NEONvqrshrnu>;
-def VQRSHRNu32: N2VNSh<1, 1, 0b010000, 0b1001, 0, 1, 1,
-                       IIC_VSHLi4D, "vqrshrn.u32", v4i16, v4i32, NEONvqrshrnu>;
-def VQRSHRNu64: N2VNSh<1, 1, 0b100000, 0b1001, 0, 1, 1, 
-                       IIC_VSHLi4D, "vqrshrn.u64", v2i32, v2i64, NEONvqrshrnu>;
+defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn.s",
+                           NEONvqrshrns>;
+defm VQRSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn.u",
+                           NEONvqrshrnu>;
 
 //   VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned)
-def VQRSHRUN16: N2VNSh<1, 1, 0b001000, 0b1000, 0, 1, 1,
-                       IIC_VSHLi4D, "vqrshrun.s16", v8i8, v8i16, NEONvqrshrnsu>;
-def VQRSHRUN32: N2VNSh<1, 1, 0b010000, 0b1000, 0, 1, 1, 
-                       IIC_VSHLi4D, "vqrshrun.s32", v4i16, v4i32, NEONvqrshrnsu>;
-def VQRSHRUN64: N2VNSh<1, 1, 0b100000, 0b1000, 0, 1, 1,
-                       IIC_VSHLi4D, "vqrshrun.s64", v2i32, v2i64, NEONvqrshrnsu>;
+defm VQRSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vqrshrun.s",
+                           NEONvqrshrnsu>;
 
 //   VSRA     : Vector Shift Right and Accumulate
 defm VSRAs    : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra.s", NEONvshrs>;
@@ -2491,27 +2526,28 @@ def  VDUPfq   : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src),
 
 //   VDUP     : Vector Duplicate Lane (from scalar to all elements)
 
-class VDUPLND<bits<2> op19_18, bits<2> op17_16, string OpcodeStr, ValueType Ty>
-  : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 0, 0,
+class VDUPLND<string OpcodeStr, ValueType Ty>
+  : N2VDup<0b11, 0b11, 0b11000, 0, 0,
         (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD,
         !strconcat(OpcodeStr, "\t$dst, $src[$lane]"), "",
         [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>;
 
-class VDUPLNQ<bits<2> op19_18, bits<2> op17_16, string OpcodeStr,
-              ValueType ResTy, ValueType OpTy>
-  : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 1, 0,
+class VDUPLNQ<string OpcodeStr, ValueType ResTy, ValueType OpTy>
+  : N2VDup<0b11, 0b11, 0b11000, 1, 0,
         (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD,
         !strconcat(OpcodeStr, "\t$dst, $src[$lane]"), "",
         [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), imm:$lane)))]>;
 
-def VDUPLN8d  : VDUPLND<0b00, 0b01, "vdup.8", v8i8>;
-def VDUPLN16d : VDUPLND<0b00, 0b10, "vdup.16", v4i16>;
-def VDUPLN32d : VDUPLND<0b01, 0b00, "vdup.32", v2i32>;
-def VDUPLNfd  : VDUPLND<0b01, 0b00, "vdup.32", v2f32>;
-def VDUPLN8q  : VDUPLNQ<0b00, 0b01, "vdup.8", v16i8, v8i8>;
-def VDUPLN16q : VDUPLNQ<0b00, 0b10, "vdup.16", v8i16, v4i16>;
-def VDUPLN32q : VDUPLNQ<0b01, 0b00, "vdup.32", v4i32, v2i32>;
-def VDUPLNfq  : VDUPLNQ<0b01, 0b00, "vdup.32", v4f32, v2f32>;
+// Inst{19-16} is fixed only in part, by element size: xxx1, xx10 or x100.
+
+def VDUPLN8d  : VDUPLND<"vdup.8", v8i8> { let Inst{16} = 1; }
+def VDUPLN16d : VDUPLND<"vdup.16", v4i16> { let Inst{17-16} = 0b10; }
+def VDUPLN32d : VDUPLND<"vdup.32", v2i32> { let Inst{18-16} = 0b100; }
+def VDUPLNfd  : VDUPLND<"vdup.32", v2f32> { let Inst{18-16} = 0b100; }
+def VDUPLN8q  : VDUPLNQ<"vdup.8", v16i8, v8i8> { let Inst{16} = 1; }
+def VDUPLN16q : VDUPLNQ<"vdup.16", v8i16, v4i16> { let Inst{17-16} = 0b10; }
+def VDUPLN32q : VDUPLNQ<"vdup.32", v4i32, v2i32> { let Inst{18-16} = 0b100; }
+def VDUPLNfq  : VDUPLNQ<"vdup.32", v4f32, v2f32> { let Inst{18-16} = 0b100; }
 
 def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)),
           (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src,
@@ -2530,15 +2566,19 @@ def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
                                    (DSubReg_i32_reg imm:$lane))),
                            (SubReg_i32_lane imm:$lane)))>;
 
-def VDUPfdf   : N2V<0b11, 0b11, 0b01, 0b00, 0b11000, 0, 0,
-                    (outs DPR:$dst), (ins SPR:$src),
-                    IIC_VMOVD, "vdup.32\t$dst, ${src:lane}", "",
-                    [(set DPR:$dst, (v2f32 (NEONvdup (f32 SPR:$src))))]>;
+def  VDUPfdf  : N2VDup<0b11, 0b11, 0b11000, 0, 0,
+                       (outs DPR:$dst), (ins SPR:$src),
+                       IIC_VMOVD, "vdup.32\t$dst, ${src:lane}", "",
+                       [(set DPR:$dst, (v2f32 (NEONvdup (f32 SPR:$src))))]> {
+  let Inst{18-16} = 0b100; // vdup.32: Inst{19-16} = x100
+}
 
-def VDUPfqf   : N2V<0b11, 0b11, 0b01, 0b00, 0b11000, 1, 0,
-                    (outs QPR:$dst), (ins SPR:$src),
-                    IIC_VMOVD, "vdup.32\t$dst, ${src:lane}", "",
-                    [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>;
+def  VDUPfqf  : N2VDup<0b11, 0b11, 0b11000, 1, 0,
+                       (outs QPR:$dst), (ins SPR:$src),
+                       IIC_VMOVD, "vdup.32\t$dst, ${src:lane}", "",
+                       [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]> {
+  let Inst{18-16} = 0b100; // vdup.32: Inst{19-16} = x100
+}
 
 def : Pat<(v2i64 (NEONvduplane (v2i64 QPR:$src), imm:$lane)),
           (INSERT_SUBREG QPR:$src, 
@@ -2560,8 +2600,8 @@ defm VQMOVNu  : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD, "vqmovn.u",
 defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD, "vqmovun.s",
                             int_arm_neon_vqmovnsu>;
 //   VMOVL    : Vector Lengthening Move
-defm VMOVLs   : N2VLInt_QHS<0,1,0b1010,0,0,1, "vmovl.s", int_arm_neon_vmovls>;
-defm VMOVLu   : N2VLInt_QHS<1,1,0b1010,0,0,1, "vmovl.u", int_arm_neon_vmovlu>;
+defm VMOVLs   : N2VLInt_QHS<0b01,0b10100,0,1, "vmovl.s", int_arm_neon_vmovls>;
+defm VMOVLu   : N2VLInt_QHS<0b11,0b10100,0,1, "vmovl.u", int_arm_neon_vmovlu>;
 
 // Vector Conversions.
 
@@ -2585,24 +2625,22 @@ def  VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt.f32.u32",
                      v4f32, v4i32, uint_to_fp>;
 
 //   VCVT     : Vector Convert Between Floating-Point and Fixed-Point.
-// Note: Some of the opcode bits in the following VCVT instructions need to
-// be encoded based on the immed values.
-def VCVTf2xsd : N2VCvtD<0, 1, 0b000000, 0b1111, 0, 1, "vcvt.s32.f32",
+def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt.s32.f32",
                         v2i32, v2f32, int_arm_neon_vcvtfp2fxs>;
-def VCVTf2xud : N2VCvtD<1, 1, 0b000000, 0b1111, 0, 1, "vcvt.u32.f32",
+def VCVTf2xud : N2VCvtD<1, 1, 0b1111, 0, 1, "vcvt.u32.f32",
                         v2i32, v2f32, int_arm_neon_vcvtfp2fxu>;
-def VCVTxs2fd : N2VCvtD<0, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.s32",
+def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt.f32.s32",
                         v2f32, v2i32, int_arm_neon_vcvtfxs2fp>;
-def VCVTxu2fd : N2VCvtD<1, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.u32",
+def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt.f32.u32",
                         v2f32, v2i32, int_arm_neon_vcvtfxu2fp>;
 
-def VCVTf2xsq : N2VCvtQ<0, 1, 0b000000, 0b1111, 0, 1, "vcvt.s32.f32",
+def VCVTf2xsq : N2VCvtQ<0, 1, 0b1111, 0, 1, "vcvt.s32.f32",
                         v4i32, v4f32, int_arm_neon_vcvtfp2fxs>;
-def VCVTf2xuq : N2VCvtQ<1, 1, 0b000000, 0b1111, 0, 1, "vcvt.u32.f32",
+def VCVTf2xuq : N2VCvtQ<1, 1, 0b1111, 0, 1, "vcvt.u32.f32",
                         v4i32, v4f32, int_arm_neon_vcvtfp2fxu>;
-def VCVTxs2fq : N2VCvtQ<0, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.s32",
+def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt.f32.s32",
                         v4f32, v4i32, int_arm_neon_vcvtfxs2fp>;
-def VCVTxu2fq : N2VCvtQ<1, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.u32",
+def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt.f32.u32",
                         v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
 
 // Vector Reverse.
@@ -2670,18 +2708,18 @@ def VREV16q8  : VREV16Q<0b00, "vrev16.8", v16i8>;
 //   VEXT     : Vector Extract
 
 class VEXTd<string OpcodeStr, ValueType Ty>
-  : N3V<0,1,0b11,0b0000,0,0, (outs DPR:$dst),
-        (ins DPR:$lhs, DPR:$rhs, i32imm:$index), IIC_VEXTD,
-        !strconcat(OpcodeStr, "\t$dst, $lhs, $rhs, $index"), "",
-        [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs),
-                                      (Ty DPR:$rhs), imm:$index)))]>;
+  : N3VImm<0,1,0b11,0,0, (outs DPR:$dst),
+           (ins DPR:$lhs, DPR:$rhs, i32imm:$index), IIC_VEXTD,
+           !strconcat(OpcodeStr, "\t$dst, $lhs, $rhs, $index"), "",
+           [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs),
+                                         (Ty DPR:$rhs), imm:$index)))]>;
 
 class VEXTq<string OpcodeStr, ValueType Ty>
-  : N3V<0,1,0b11,0b0000,1,0, (outs QPR:$dst),
-        (ins QPR:$lhs, QPR:$rhs, i32imm:$index), IIC_VEXTQ,
-        !strconcat(OpcodeStr, "\t$dst, $lhs, $rhs, $index"), "",
-        [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs),
-                                      (Ty QPR:$rhs), imm:$index)))]>;
+  : N3VImm<0,1,0b11,1,0, (outs QPR:$dst),
+           (ins QPR:$lhs, QPR:$rhs, i32imm:$index), IIC_VEXTQ,
+           !strconcat(OpcodeStr, "\t$dst, $lhs, $rhs, $index"), "",
+           [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs),
+                                         (Ty QPR:$rhs), imm:$index)))]>;
 
 def VEXTd8  : VEXTd<"vext.8",  v8i8>;
 def VEXTd16 : VEXTd<"vext.16", v4i16>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 0750dcc7fdc4..2b6fa98ed3c3 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -69,6 +69,25 @@ def t2_so_imm_neg : Operand<i32>,
   return ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())) != -1;
 }], t2_so_imm_neg_XFORM>;
 
+// Break t2_so_imm's up into two pieces.  This handles immediates with up to 16
+// bits set in them.  This uses t2_so_imm2part to match and t2_so_imm2part_[12]
+// to get the first/second pieces.
+def t2_so_imm2part : Operand<i32>,
+                  PatLeaf<(imm), [{
+      return ARM_AM::isT2SOImmTwoPartVal((unsigned)N->getZExtValue());
+    }]> {
+}
+
+def t2_so_imm2part_1 : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getT2SOImmTwoPartFirst((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+def t2_so_imm2part_2 : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getT2SOImmTwoPartSecond((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
 /// imm1_31 predicate - True if the 32-bit immediate is in the range [1,31].
 def imm1_31 : PatLeaf<(i32 imm), [{
   return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 32;
@@ -666,6 +685,8 @@ def t2MOVTi16 : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$imm), IIC_iMOVi,
                     [(set GPR:$dst,
                           (or (and GPR:$src, 0xffff), lo16AllZero:$imm))]>;
 
+def : T2Pat<(or GPR:$src, 0xffff0000), (t2MOVTi16 GPR:$src, 0xffff)>;
+
 //===----------------------------------------------------------------------===//
 //  Extend Instructions.
 //
@@ -1129,6 +1150,20 @@ def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
 // Non-Instruction Patterns
 //
 
+// Two piece so_imms.
+def : T2Pat<(or GPR:$LHS, t2_so_imm2part:$RHS),
+             (t2ORRri (t2ORRri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)),
+                    (t2_so_imm2part_2 imm:$RHS))>;
+def : T2Pat<(xor GPR:$LHS, t2_so_imm2part:$RHS),
+             (t2EORri (t2EORri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)),
+                    (t2_so_imm2part_2 imm:$RHS))>;
+def : T2Pat<(add GPR:$LHS, t2_so_imm2part:$RHS),
+             (t2ADDri (t2ADDri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)),
+                    (t2_so_imm2part_2 imm:$RHS))>;
+def : T2Pat<(sub GPR:$LHS, t2_so_imm2part:$RHS),
+             (t2SUBri (t2SUBri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)),
+                    (t2_so_imm2part_2 imm:$RHS))>;
+
 // ConstantPool, GlobalAddress, and JumpTable
 def : T2Pat<(ARMWrapper  tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>;
 def : T2Pat<(ARMWrapper  tconstpool  :$dst), (t2LEApcrel tconstpool  :$dst)>;
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index d2ec9ee6cdf9..c9b9e84c2a3b 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -974,6 +974,9 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
     if (Advance) {
       ++Position;
       ++MBBI;
+      if (MBBI == E)
+        // Reached the end of the block; try merging the memory instructions.
+        TryMerge = true;
     } else
       TryMerge = true;
 
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 20a7355b7653..e0be78432973 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -222,12 +222,9 @@ def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {
     iterator allocation_order_begin(const MachineFunction &MF) const;
     iterator allocation_order_end(const MachineFunction &MF) const;
   }];
-  // FIXME: We are reserving r3 in Thumb mode in case the PEI needs to use it
-  // to generate large stack offset. Make it available once we have register
-  // scavenging.
   let MethodBodies = [{
     static const unsigned THUMB_tGPR_AO[] = {
-      ARM::R0, ARM::R1, ARM::R2,
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3,
       ARM::R4, ARM::R5, ARM::R6, ARM::R7 };
 
     // FP is R7, only low registers available.
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index cf1ee3f02953..5af95c33b930 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -27,11 +27,11 @@ UseNEONFP("arm-use-neon-fp",
           cl::init(false), cl::Hidden);
 
 ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS,
-                           bool isThumb)
+                           bool isT)
   : ARMArchVersion(V4T)
   , ARMFPUType(None)
   , UseNEONForSinglePrecisionFP(UseNEONFP)
-  , IsThumb(isThumb)
+  , IsThumb(isT)
   , ThumbMode(Thumb1)
   , PostRAScheduler(false)
   , IsR9Reserved(ReserveR9)
@@ -98,9 +98,13 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS,
   if (isTargetDarwin())
     IsR9Reserved = ReserveR9 | (ARMArchVersion < V6);
 
+  if (!isThumb() || hasThumb2())
+    PostRAScheduler = true;
+
   // Set CPU specific features.
   if (CPUString == "cortex-a8") {
-    PostRAScheduler = true;
+    // On Cortex-a8, it's faster to perform some single-precision FP
+    // operations with NEON instructions.
     if (UseNEONFP.getPosition() == 0)
       UseNEONForSinglePrecisionFP = true;
   }
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 7098fd4f36ba..74781593a0d9 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -126,9 +126,13 @@ protected:
 
   const std::string & getCPUString() const { return CPUString; }
   
-  /// enablePostRAScheduler - From TargetSubtarget, return true to
-  /// enable post-RA scheduler.
-  bool enablePostRAScheduler() const { return PostRAScheduler; }
+  /// enablePostRAScheduler - True when PostRAScheduler is set and OptLevel is
+  /// CodeGenOpt::Default or higher; 'mode' is always set to ANTIDEP_NONE.
+  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                             TargetSubtarget::AntiDepBreakMode& mode) const {
+    mode = TargetSubtarget::ANTIDEP_NONE;
+    return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
+  }
 
   /// getInstrItins - Return the instruction itineraies based on subtarget
   /// selection.
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 32ddc20a5604..c1da6ce88b9a 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -16,7 +16,6 @@
 #include "ARM.h"
 #include "llvm/PassManager.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegistry.h"
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 7438ea9c79f3..403f96c69e58 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -56,6 +56,14 @@ private:
 
   bool ParseDirectiveWord(unsigned Size, SMLoc L);
 
+  bool ParseDirectiveThumb(SMLoc L);
+
+  bool ParseDirectiveThumbFunc(SMLoc L);
+
+  bool ParseDirectiveCode(SMLoc L);
+
+  bool ParseDirectiveSyntax(SMLoc L);
+
   // TODO - For now hacked versions of the next two are in here in this file to
   // allow some parser testing until the table gen versions are implemented.
 
@@ -230,8 +238,8 @@ bool ARMAsmParser::ParseRegister(ARMOperand &Op) {
   return false;
 }
 
-// Try to parse a register list.  The first token must be a '{' when called
-// for now.
+// Parse a register list, return false if successful else return true or an 
+// error.  The first token must be a '{' when called.
 bool ARMAsmParser::ParseRegisterList(ARMOperand &Op) {
   assert(getLexer().getTok().is(AsmToken::LCurly) &&
          "Token is not an Left Curly Brace");
@@ -277,7 +285,8 @@ bool ARMAsmParser::ParseRegisterList(ARMOperand &Op) {
   return false;
 }
 
-// Try to parse an arm memory expression.  It must start with a '[' token.
+// Parse an arm memory expression, return false if successful else return true
+// or an error.  The first token must be a '[' when called.
 // TODO Only preindexing and postindexing addressing are started, unindexed
 // with option, etc are still to do.
 bool ARMAsmParser::ParseMemory(ARMOperand &Op) {
@@ -374,50 +383,55 @@ bool ARMAsmParser::ParseMemory(ARMOperand &Op) {
     Writeback = true;
     getLexer().Lex(); // Eat right bracket token.
 
-    const AsmToken &CommaTok = getLexer().getTok();
-    if (CommaTok.isNot(AsmToken::Comma))
-      return Error(CommaTok.getLoc(), "',' expected");
-    getLexer().Lex(); // Eat comma token.
-
-    const AsmToken &NextTok = getLexer().getTok();
-    if (NextTok.is(AsmToken::Plus))
-      getLexer().Lex(); // Eat plus token.
-    else if (NextTok.is(AsmToken::Minus)) {
-      Negative = true;
-      getLexer().Lex(); // Eat minus token
-    }
-
-    // See if there is a register following the "[Rn]," we have so far.
-    const AsmToken &OffsetRegTok = getLexer().getTok();
-    int OffsetRegNum = MatchRegisterName(OffsetRegTok.getString());
+    int OffsetRegNum = 0;
     bool OffsetRegShifted = false;
     enum ShiftType ShiftType;
     const MCExpr *ShiftAmount;
     const MCExpr *Offset;
-    if (OffsetRegNum != -1) {
-      OffsetIsReg = true;
-      getLexer().Lex(); // Eat identifier token for the offset register.
-      // Look for a comma then a shift
-      const AsmToken &Tok = getLexer().getTok();
-      if (Tok.is(AsmToken::Comma)) {
-        getLexer().Lex(); // Eat comma token.
 
-        const AsmToken &Tok = getLexer().getTok();
-        if (ParseShift(&ShiftType, ShiftAmount))
-          return Error(Tok.getLoc(), "shift expected");
-        OffsetRegShifted = true;
+    const AsmToken &NextTok = getLexer().getTok();
+    if (NextTok.isNot(AsmToken::EndOfStatement)) {
+      if (NextTok.isNot(AsmToken::Comma))
+	return Error(NextTok.getLoc(), "',' expected");
+      getLexer().Lex(); // Eat comma token.
+
+      const AsmToken &PlusMinusTok = getLexer().getTok();
+      if (PlusMinusTok.is(AsmToken::Plus))
+	getLexer().Lex(); // Eat plus token.
+      else if (PlusMinusTok.is(AsmToken::Minus)) {
+	Negative = true;
+	getLexer().Lex(); // Eat minus token
       }
-    }
-    else { // "[Rn]," we have so far was not followed by "Rm"
-      // Look for #offset following the "[Rn],"
-      const AsmToken &HashTok = getLexer().getTok();
-      if (HashTok.isNot(AsmToken::Hash))
-        return Error(HashTok.getLoc(), "'#' expected");
-      getLexer().Lex(); // Eat hash token.
 
-      if (getParser().ParseExpression(Offset))
-       return true;
+      // See if there is a register following the "[Rn]," we have so far.
+      const AsmToken &OffsetRegTok = getLexer().getTok();
+      OffsetRegNum = MatchRegisterName(OffsetRegTok.getString());
+      if (OffsetRegNum != -1) {
+	OffsetIsReg = true;
+	getLexer().Lex(); // Eat identifier token for the offset register.
+	// Look for a comma then a shift
+	const AsmToken &Tok = getLexer().getTok();
+	if (Tok.is(AsmToken::Comma)) {
+	  getLexer().Lex(); // Eat comma token.
+
+	  const AsmToken &Tok = getLexer().getTok();
+	  if (ParseShift(&ShiftType, ShiftAmount))
+	    return Error(Tok.getLoc(), "shift expected");
+	  OffsetRegShifted = true;
+	}
+      }
+      else { // "[Rn]," we have so far was not followed by "Rm"
+	// Look for #offset following the "[Rn],"
+	const AsmToken &HashTok = getLexer().getTok();
+	if (HashTok.isNot(AsmToken::Hash))
+	  return Error(HashTok.getLoc(), "'#' expected");
+	getLexer().Lex(); // Eat hash token.
+
+	if (getParser().ParseExpression(Offset))
+	 return true;
+      }
     }
+
     Op = ARMOperand::CreateMem(BaseRegNum, OffsetIsReg, Offset, OffsetRegNum,
                                OffsetRegShifted, ShiftType, ShiftAmount,
                                Preindexed, Postindexed, Negative, Writeback);
@@ -465,7 +479,7 @@ bool ARMAsmParser::ParseShift(ShiftType *St, const MCExpr *&ShiftAmount) {
   return false;
 }
 
-// A hack to allow some testing
+// A hack to allow some testing, to be replaced by a real table gen version.
 int ARMAsmParser::MatchRegisterName(const StringRef &Name) {
   if (Name == "r0" || Name == "R0")
     return 0;
@@ -504,7 +518,7 @@ int ARMAsmParser::MatchRegisterName(const StringRef &Name) {
   return -1;
 }
 
-// A hack to allow some testing
+// A hack to allow some testing, to be replaced by a real table gen version.
 bool ARMAsmParser::MatchInstruction(SmallVectorImpl<ARMOperand> &Operands,
                                     MCInst &Inst) {
   struct ARMOperand Op0 = Operands[0];
@@ -516,40 +530,58 @@ bool ARMAsmParser::MatchInstruction(SmallVectorImpl<ARMOperand> &Operands,
       Mnemonic == "ldmfd" ||
       Mnemonic == "ldr" ||
       Mnemonic == "mov" ||
-      Mnemonic == "sub")
+      Mnemonic == "sub" ||
+      Mnemonic == "bl" ||
+      Mnemonic == "push" ||
+      Mnemonic == "blx" ||
+      Mnemonic == "pop") {
+    // Hard-coded to a valid instruction, till we have a real matcher.
+    Inst = MCInst();
+    Inst.setOpcode(ARM::MOVr);
+    Inst.addOperand(MCOperand::CreateReg(2));
+    Inst.addOperand(MCOperand::CreateReg(2));
+    Inst.addOperand(MCOperand::CreateImm(0));
+    Inst.addOperand(MCOperand::CreateImm(0));
+    Inst.addOperand(MCOperand::CreateReg(0));
     return false;
+  }
 
   return true;
 }
 
-// TODO - this is a work in progress
+// Parse an ARM instruction operand.  For now this parses the operand regardless
+// of the mnemonic.
 bool ARMAsmParser::ParseOperand(ARMOperand &Op) {
   switch (getLexer().getKind()) {
   case AsmToken::Identifier:
     if (!ParseRegister(Op))
       return false;
-    // TODO parse other operands that start with an identifier like labels
-    return Error(getLexer().getTok().getLoc(), "labels not yet supported");
+    // This was not a register so parse other operands that start with an
+    // identifier (like labels) as expressions and create them as immediates.
+    const MCExpr *IdVal;
+    if (getParser().ParseExpression(IdVal))
+      return true;
+    Op = ARMOperand::CreateImm(IdVal);
+    return false;
   case AsmToken::LBrac:
-    if (!ParseMemory(Op))
-      return false;
+    return ParseMemory(Op);
   case AsmToken::LCurly:
-    if (!ParseRegisterList(Op))
-      return false;
+    return ParseRegisterList(Op);
   case AsmToken::Hash:
     // #42 -> immediate.
     // TODO: ":lower16:" and ":upper16:" modifiers after # before immediate
     getLexer().Lex();
-    const MCExpr *Val;
-    if (getParser().ParseExpression(Val))
+    const MCExpr *ImmVal;
+    if (getParser().ParseExpression(ImmVal))
       return true;
-    Op = ARMOperand::CreateImm(Val);
+    Op = ARMOperand::CreateImm(ImmVal);
     return false;
   default:
     return Error(getLexer().getTok().getLoc(), "unexpected token in operand");
   }
 }
 
+// Parse an arm instruction mnemonic followed by its operands.
 bool ARMAsmParser::ParseInstruction(const StringRef &Name, MCInst &Inst) {
   SmallVector<ARMOperand, 7> Operands;
 
@@ -579,10 +611,19 @@ bool ARMAsmParser::ParseInstruction(const StringRef &Name, MCInst &Inst) {
   return true;
 }
 
+/// ParseDirective parses the arm specific directives
 bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
   StringRef IDVal = DirectiveID.getIdentifier();
   if (IDVal == ".word")
     return ParseDirectiveWord(4, DirectiveID.getLoc());
+  else if (IDVal == ".thumb")
+    return ParseDirectiveThumb(DirectiveID.getLoc());
+  else if (IDVal == ".thumb_func")
+    return ParseDirectiveThumbFunc(DirectiveID.getLoc());
+  else if (IDVal == ".code")
+    return ParseDirectiveCode(DirectiveID.getLoc());
+  else if (IDVal == ".syntax")
+    return ParseDirectiveSyntax(DirectiveID.getLoc());
   return true;
 }
 
@@ -611,6 +652,93 @@ bool ARMAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
   return false;
 }
 
+/// ParseDirectiveThumb - Returns false on success, else reports an error.
+///  ::= .thumb
+bool ARMAsmParser::ParseDirectiveThumb(SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(L, "unexpected token in directive");
+  getLexer().Lex(); // Eat the end-of-statement token.
+
+  // TODO: set thumb mode
+  // TODO: tell the MC streamer the mode
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
+/// ParseDirectiveThumbFunc - Returns false on success, else reports an error.
+///  ::= .thumb_func symbol_name
+bool ARMAsmParser::ParseDirectiveThumbFunc(SMLoc L) {
+  const AsmToken &Tok = getLexer().getTok();
+  if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String))
+    return Error(L, "unexpected token in .thumb_func directive");
+  StringRef SymbolName = getLexer().getTok().getIdentifier();
+  getLexer().Lex(); // Consume the identifier token.
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(L, "unexpected token in directive");
+  getLexer().Lex(); // Eat the end-of-statement token.
+
+  // TODO: mark symbol as a thumb symbol
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
+/// ParseDirectiveSyntax - Returns false on success, else reports an error.
+///  ::= .syntax unified | divided
+bool ARMAsmParser::ParseDirectiveSyntax(SMLoc L) {
+  const AsmToken &Tok = getLexer().getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return Error(L, "unexpected token in .syntax directive");
+  const StringRef &Mode = Tok.getString();
+  bool unified_syntax; // Assigned below but not read yet; see the TODO.
+  if (Mode == "unified" || Mode == "UNIFIED") {
+    getLexer().Lex(); // Eat the mode identifier.
+    unified_syntax = true;
+  }
+  else if (Mode == "divided" || Mode == "DIVIDED") {
+    getLexer().Lex(); // Eat the mode identifier.
+    unified_syntax = false;
+  }
+  else
+    return Error(L, "unrecognized syntax mode in .syntax directive");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getTok().getLoc(), "unexpected token in directive");
+  getLexer().Lex(); // Eat the end-of-statement token.
+
+  // TODO tell the MC streamer the mode
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
+/// ParseDirectiveCode - Returns false on success, else reports an error.
+///  ::= .code 16 | 32
+bool ARMAsmParser::ParseDirectiveCode(SMLoc L) {
+  const AsmToken &Tok = getLexer().getTok();
+  if (Tok.isNot(AsmToken::Integer))
+    return Error(L, "unexpected token in .code directive");
+  int64_t Val = getLexer().getTok().getIntVal();
+  bool thumb_mode; // Assigned below but not read yet; see the TODO.
+  if (Val == 16) {
+    getLexer().Lex(); // Eat the integer operand.
+    thumb_mode = true;
+  }
+  else if (Val == 32) {
+    getLexer().Lex(); // Eat the integer operand.
+    thumb_mode = false;
+  }
+  else
+    return Error(L, "invalid operand to .code directive");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getTok().getLoc(), "unexpected token in directive");
+  getLexer().Lex(); // Eat the end-of-statement token.
+
+  // TODO tell the MC streamer the mode
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
 // Force static initialization.
 extern "C" void LLVMInitializeARMAsmParser() {
   RegisterAsmParser<ARMAsmParser> X(TheARMTarget);
diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
index 546731b00d3c..8719e4c33903 100644
--- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
@@ -1,3 +1,5 @@
+//===-- ARMAsmPrinter.cpp - Print machine code to an ARM .s file ----------===//
+//
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
@@ -13,21 +15,25 @@
 #define DEBUG_TYPE "asm-printer"
 #include "ARM.h"
 #include "ARMBuildAttrs.h"
-#include "ARMTargetMachine.h"
 #include "ARMAddressingModes.h"
 #include "ARMConstantPoolValue.h"
+#include "ARMInstPrinter.h"
 #include "ARMMachineFunctionInfo.h"
+#include "ARMMCInstLower.h"
+#include "ARMTargetMachine.h"
 #include "llvm/Constants.h"
 #include "llvm/Module.h"
 #include "llvm/Assembly/Writer.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/DwarfWriter.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -38,18 +44,22 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringSet.h"
-#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/Mangler.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Support/FormattedStream.h"
 #include <cctype>
 using namespace llvm;
 
 STATISTIC(EmittedInsts, "Number of machine instrs printed");
 
+static cl::opt<bool>
+EnableMCInst("enable-arm-mcinst-printer", cl::Hidden,
+            cl::desc("enable experimental asmprinter gunk in the arm backend"));
+
 namespace {
-  class VISIBILITY_HIDDEN ARMAsmPrinter : public AsmPrinter {
+  class ARMAsmPrinter : public AsmPrinter {
 
     /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
     /// make the right decision when printing asm code for different targets.
@@ -63,34 +73,23 @@ namespace {
     /// MachineFunction.
     const MachineConstantPool *MCP;
 
-    /// We name each basic block in a Function with a unique number, so
-    /// that we can consistently refer to them later. This is cleared
-    /// at the beginning of each call to runOnMachineFunction().
-    ///
-    typedef std::map<const Value *, unsigned> ValueMapTy;
-    ValueMapTy NumberForBB;
-
-    /// GVNonLazyPtrs - Keeps the set of GlobalValues that require
-    /// non-lazy-pointers for indirect access.
-    StringMap<std::string> GVNonLazyPtrs;
-
-    /// HiddenGVNonLazyPtrs - Keeps the set of GlobalValues with hidden
-    /// visibility that require non-lazy-pointers for indirect access.
-    StringMap<std::string> HiddenGVNonLazyPtrs;
-
-    /// True if asm printer is printing a series of CONSTPOOL_ENTRY.
-    bool InCPMode;
   public:
     explicit ARMAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
                            const MCAsmInfo *T, bool V)
-      : AsmPrinter(O, TM, T, V), AFI(NULL), MCP(NULL),
-        InCPMode(false) {
+      : AsmPrinter(O, TM, T, V), AFI(NULL), MCP(NULL) {
       Subtarget = &TM.getSubtarget<ARMSubtarget>();
     }
 
     virtual const char *getPassName() const {
       return "ARM Assembly Printer";
     }
+    
+    void printMCInst(const MCInst *MI) {
+      ARMInstPrinter(O, *MAI, VerboseAsm).printInstruction(MI);
+    }
+    
+    void printInstructionThroughMCStreamer(const MachineInstr *MI);
+    
 
     void printOperand(const MachineInstr *MI, int OpNum,
                       const char *Modifier = 0);
@@ -149,8 +148,8 @@ namespace {
 
     void printMachineInstruction(const MachineInstr *MI);
     bool runOnMachineFunction(MachineFunction &F);
-    bool doFinalization(Module &M);
     void EmitStartOfAsmFile(Module &M);
+    void EmitEndOfAsmFile(Module &M);
 
     /// EmitMachineConstantPoolValue - Print a machine constantpool value to
     /// the .s file.
@@ -173,12 +172,19 @@ namespace {
           Name = Mang->getMangledName(GV);
         else {
           // FIXME: Remove this when Darwin transition to @GOT like syntax.
-          std::string SymName = Mang->getMangledName(GV);
           Name = Mang->getMangledName(GV, "$non_lazy_ptr", true);
-          if (GV->hasHiddenVisibility())
-            HiddenGVNonLazyPtrs[SymName] = Name;
-          else
-            GVNonLazyPtrs[SymName] = Name;
+          MCSymbol *Sym = OutContext.GetOrCreateSymbol(StringRef(Name));
+          
+          MachineModuleInfoMachO &MMIMachO =
+            MMI->getObjFileInfo<MachineModuleInfoMachO>();
+          const MCSymbol *&StubSym =
+            GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(Sym) :
+                                        MMIMachO.getGVStubEntry(Sym);
+          if (StubSym == 0) {
+            SmallString<128> NameStr;
+            Mang->getNameWithPrefix(NameStr, GV, false);
+            StubSym = OutContext.GetOrCreateSymbol(NameStr.str());
+          }
         }
       } else
         Name = Mang->makeNameProper(ACPV->getSymbol());
@@ -260,7 +266,6 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
     if (Subtarget->isTargetDarwin())
       O << "\t" << CurrentFnName;
     O << "\n";
-    InCPMode = false;
   } else {
     EmitAlignment(FnAlign, F);
   }
@@ -283,14 +288,13 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
        I != E; ++I) {
     // Print a label for the basic block.
-    if (I != MF.begin()) {
+    if (I != MF.begin())
       EmitBasicBlockStart(I);
-    }
+
+    // Print the assembly for the instruction.
     for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
-         II != E; ++II) {
-      // Print the assembly for the instruction.
+         II != E; ++II)
       printMachineInstruction(II);
-    }
   }
 
   if (MAI->hasDotTypeDotSizeDirective())
@@ -306,25 +310,25 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
                                  const char *Modifier) {
   const MachineOperand &MO = MI->getOperand(OpNum);
   switch (MO.getType()) {
+  default:
+    assert(0 && "<unknown operand type>");
   case MachineOperand::MO_Register: {
     unsigned Reg = MO.getReg();
-    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
-      if (Modifier && strcmp(Modifier, "dregpair") == 0) {
-        unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0
-        unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1
-        O << '{'
-          << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi)
-          << '}';
-      } else if (Modifier && strcmp(Modifier, "lane") == 0) {
-        unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
-        unsigned DReg = TRI->getMatchingSuperReg(Reg, RegNum & 1 ? 2 : 1,
-                                                 &ARM::DPR_VFP2RegClass);
-        O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']';
-      } else {
-        O << getRegisterName(Reg);
-      }
-    } else
-      llvm_unreachable("not implemented");
+    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+    if (Modifier && strcmp(Modifier, "dregpair") == 0) {
+      unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0
+      unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1
+      O << '{'
+        << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi)
+        << '}';
+    } else if (Modifier && strcmp(Modifier, "lane") == 0) {
+      unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
+      unsigned DReg = TRI->getMatchingSuperReg(Reg, RegNum & 1 ? 2 : 1,
+                                               &ARM::DPR_VFP2RegClass);
+      O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']';
+    } else {
+      O << getRegisterName(Reg);
+    }
     break;
   }
   case MachineOperand::MO_Immediate: {
@@ -372,8 +376,6 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
     O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
       << '_' << MO.getIndex();
     break;
-  default:
-    O << "<unknown operand type>"; abort (); break;
   }
 }
 
@@ -1027,22 +1029,19 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 void ARMAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
   ++EmittedInsts;
 
-  int Opc = MI->getOpcode();
-  switch (Opc) {
-  case ARM::CONSTPOOL_ENTRY:
-    if (!InCPMode && AFI->isThumbFunction()) {
-      EmitAlignment(2);
-      InCPMode = true;
-    }
-    break;
-  default: {
-    if (InCPMode && AFI->isThumbFunction())
-      InCPMode = false;
-  }}
-
   // Call the autogenerated instruction printer routines.
   processDebugLoc(MI, true);
-  printInstruction(MI);
+  
+  if (EnableMCInst) {
+    printInstructionThroughMCStreamer(MI);
+  } else {
+    int Opc = MI->getOpcode();
+    if (Opc == ARM::CONSTPOOL_ENTRY)
+      EmitAlignment(2);
+    
+    printInstruction(MI);
+  }
+  
   if (VerboseAsm && !MI->getDebugLoc().isUnknown())
     EmitComments(*MI);
   O << '\n';
@@ -1256,34 +1255,40 @@ void ARMAsmPrinter::PrintGlobalVariable(const GlobalVariable* GVar) {
 }
 
 
-bool ARMAsmPrinter::doFinalization(Module &M) {
+void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
   if (Subtarget->isTargetDarwin()) {
     // All darwin targets use mach-o.
     TargetLoweringObjectFileMachO &TLOFMacho =
       static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
+    MachineModuleInfoMachO &MMIMacho =
+      MMI->getObjFileInfo<MachineModuleInfoMachO>();
 
     O << '\n';
 
     // Output non-lazy-pointers for external and common global variables.
-    if (!GVNonLazyPtrs.empty()) {
+    MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetGVStubList();
+    
+    if (!Stubs.empty()) {
       // Switch with ".non_lazy_symbol_pointer" directive.
       OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
       EmitAlignment(2);
-      for (StringMap<std::string>::iterator I = GVNonLazyPtrs.begin(),
-           E = GVNonLazyPtrs.end(); I != E; ++I) {
-        O << I->second << ":\n";
-        O << "\t.indirect_symbol " << I->getKeyData() << "\n";
-        O << "\t.long\t0\n";
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        Stubs[i].first->print(O, MAI);
+        O << ":\n\t.indirect_symbol ";
+        Stubs[i].second->print(O, MAI);
+        O << "\n\t.long\t0\n";
       }
     }
 
-    if (!HiddenGVNonLazyPtrs.empty()) {
+    Stubs = MMIMacho.GetHiddenGVStubList();
+    if (!Stubs.empty()) {
       OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
       EmitAlignment(2);
-      for (StringMap<std::string>::iterator I = HiddenGVNonLazyPtrs.begin(),
-             E = HiddenGVNonLazyPtrs.end(); I != E; ++I) {
-        O << I->second << ":\n";
-        O << "\t.long " << I->getKeyData() << "\n";
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        Stubs[i].first->print(O, MAI);
+        O << ":\n\t.long ";
+        Stubs[i].second->print(O, MAI);
+        O << "\n";
       }
     }
 
@@ -1292,14 +1297,179 @@ bool ARMAsmPrinter::doFinalization(Module &M) {
     // implementation of multiple entry points).  If this doesn't occur, the
     // linker can safely perform dead code stripping.  Since LLVM never
     // generates code that does this, it is always safe to set.
-    O << "\t.subsections_via_symbols\n";
+    OutStreamer.EmitAssemblerFlag(MCStreamer::SubsectionsViaSymbols);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+
+void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) {
+  ARMMCInstLower MCInstLowering(OutContext, *Mang, *this);
+  switch (MI->getOpcode()) {
+  case ARM::t2MOVi32imm:
+    assert(0 && "Should be lowered by thumb2it pass");
+  default: break;
+  case TargetInstrInfo::DBG_LABEL:
+  case TargetInstrInfo::EH_LABEL:
+  case TargetInstrInfo::GC_LABEL:
+    printLabel(MI);
+    return;
+  case TargetInstrInfo::KILL:
+    return;
+  case TargetInstrInfo::INLINEASM:
+    O << '\t';
+    printInlineAsm(MI);
+    return;
+  case TargetInstrInfo::IMPLICIT_DEF:
+    printImplicitDef(MI);
+    return;
+  case ARM::PICADD: { // FIXME: Remove asm string from td file.
+    // This is a pseudo op for a label + instruction sequence, which looks like:
+    // LPC0:
+    //     add r0, pc, r0
+    // This adds the address of LPC0 to r0.
+    
+    // Emit the label.
+    // FIXME: MOVE TO SHARED PLACE.
+    unsigned Id = (unsigned)MI->getOperand(2).getImm();
+    const char *Prefix = MAI->getPrivateGlobalPrefix();
+    MCSymbol *Label =OutContext.GetOrCreateSymbol(Twine(Prefix)+"PC"+Twine(Id));
+    OutStreamer.EmitLabel(Label);
+    
+    
+    // Form and emit the add.
+    MCInst AddInst;
+    AddInst.setOpcode(ARM::ADDrr);
+    AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    AddInst.addOperand(MCOperand::CreateReg(ARM::PC));
+    AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+    printMCInst(&AddInst);
+    return;
+  }
+  case ARM::CONSTPOOL_ENTRY: { // FIXME: Remove asm string from td file.
+    /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool
+    /// in the function.  The first operand is the ID# for this instruction, the
+    /// second is the index into the MachineConstantPool that this is, the third
+    /// is the size in bytes of this constant pool entry.
+    unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
+    unsigned CPIdx   = (unsigned)MI->getOperand(1).getIndex();
+
+    EmitAlignment(2);
+
+    const char *Prefix = MAI->getPrivateGlobalPrefix();
+    MCSymbol *Label = OutContext.GetOrCreateSymbol(Twine(Prefix)+"CPI"+
+                                                   Twine(getFunctionNumber())+
+                                                   "_"+ Twine(LabelId));
+    OutStreamer.EmitLabel(Label);
+
+    const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
+    if (MCPE.isMachineConstantPoolEntry())
+      EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+    else
+      EmitGlobalConstant(MCPE.Val.ConstVal);
+    
+    return;
   }
+  case ARM::MOVi2pieces: { // FIXME: Remove asmstring from td file.
+    // This is a hack that lowers as a two instruction sequence.
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned ImmVal = (unsigned)MI->getOperand(1).getImm();
+
+    unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal);
+    unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
+    
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVi);
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));
+      TmpInst.addOperand(MCOperand::CreateImm(SOImmValV1));
+      
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
+
+      TmpInst.addOperand(MCOperand::CreateReg(0));          // cc_out
+      printMCInst(&TmpInst);
+      O << '\n';
+    }
 
-  return AsmPrinter::doFinalization(M);
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::ORRri);
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));     // dstreg
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));     // inreg
+      TmpInst.addOperand(MCOperand::CreateImm(SOImmValV2)); // so_imm
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
+      
+      TmpInst.addOperand(MCOperand::CreateReg(0));          // cc_out
+      printMCInst(&TmpInst);
+    }
+    return; 
+  }
+  case ARM::MOVi32imm: { // FIXME: Remove asmstring from td file.
+    // This is a hack that lowers as a two instruction sequence.
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned ImmVal = (unsigned)MI->getOperand(1).getImm();
+    
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVi16);
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));         // dstreg
+      TmpInst.addOperand(MCOperand::CreateImm(ImmVal & 65535)); // lower16(imm)
+      
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
+      
+      printMCInst(&TmpInst);
+      O << '\n';
+    }
+    
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVTi16);
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));         // dstreg
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));         // srcreg
+      TmpInst.addOperand(MCOperand::CreateImm(ImmVal >> 16));   // upper16(imm)
+      
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
+      
+      printMCInst(&TmpInst);
+    }
+    
+    return;
+  }
+  }
+      
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  
+  printMCInst(&TmpInst);
+}
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+static MCInstPrinter *createARMMCInstPrinter(const Target &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             raw_ostream &O) {
+  if (SyntaxVariant == 0)  // only variant 0 has a printer
+    return new ARMInstPrinter(O, MAI, false);  // third argument is verboseAsm
+  return 0;  // any other syntax variant: no printer is created
+}
 
 // Force static initialization.
 extern "C" void LLVMInitializeARMAsmPrinter() {
   RegisterAsmPrinter<ARMAsmPrinter> X(TheARMTarget);
   RegisterAsmPrinter<ARMAsmPrinter> Y(TheThumbTarget);
+
+  TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter);
 }
+
diff --git a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp
new file mode 100644
index 000000000000..f422798e315c
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp
@@ -0,0 +1,358 @@
+//===-- ARMInstPrinter.cpp - Convert ARM MCInst to assembly syntax --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "ARM.h" // FIXME: FACTOR ENUMS BETTER.
+#include "ARMInstPrinter.h"
+#include "ARMAddressingModes.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Include the auto-generated portion of the assembly writer.
+#define MachineInstr MCInst
+#define ARMAsmPrinter ARMInstPrinter  // FIXME: REMOVE.
+#define NO_ASM_WRITER_BOILERPLATE
+#include "ARMGenAsmWriter.inc"
+#undef MachineInstr
+#undef ARMAsmPrinter
+
+void ARMInstPrinter::printInst(const MCInst *MI) { printInstruction(MI); }
+
+void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  const char *Modifier) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    if (Modifier && strcmp(Modifier, "dregpair") == 0) {
+      // FIXME: Breaks e.g. ARM/vmul.ll.
+      assert(0);
+      /*
+      unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0
+      unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1
+      O << '{'
+      << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi)
+      << '}';*/
+    } else if (Modifier && strcmp(Modifier, "lane") == 0) {
+      assert(0);
+      /*
+      unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
+      unsigned DReg = TRI->getMatchingSuperReg(Reg, RegNum & 1 ? 2 : 1,
+                                               &ARM::DPR_VFP2RegClass);
+      O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']';
+       */
+    } else {
+      O << getRegisterName(Reg);
+    }
+  } else if (Op.isImm()) {
+    assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+    O << '#' << Op.getImm();
+  } else {
+    assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    Op.getExpr()->print(O, &MAI);
+  }
+}
+
+static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm,
+                       const MCAsmInfo *MAI) {
+  // Break it up into two parts that make up a shifter immediate.
+  V = ARM_AM::getSOImmVal(V);
+  assert(V != -1 && "Not a valid so_imm value!");
+  
+  unsigned Imm = ARM_AM::getSOImmValImm(V);
+  unsigned Rot = ARM_AM::getSOImmValRot(V);
+  
+  // Print low-level immediate formation info, per
+  // A5.1.3: "Data-processing operands - Immediate".
+  if (Rot) {
+    O << "#" << Imm << ", " << Rot;
+    // Pretty printed version.
+    if (VerboseAsm)
+      O << ' ' << MAI->getCommentString()
+      << ' ' << (int)ARM_AM::rotr32(Imm, Rot);
+  } else {
+    O << "#" << Imm;
+  }
+}
+
+
+/// printSOImmOperand - SOImm is 4-bit rotate amount in bits 8-11 with 8-bit
+/// immediate in bits 0-7.
+void ARMInstPrinter::printSOImmOperand(const MCInst *MI, unsigned OpNum) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isImm() && "Not a valid so_imm value!");
+  printSOImm(O, MO.getImm(), VerboseAsm, &MAI);
+}
+
+/// printSOImm2PartOperand - SOImm is broken into two pieces using a 'mov'
+/// followed by an 'orr' to materialize.
+void ARMInstPrinter::printSOImm2PartOperand(const MCInst *MI, unsigned OpNum) {
+  // FIXME: REMOVE this method.
+  abort();
+}
+
+// so_reg is a 3-operand unit corresponding to register forms of the A5.1
+// "Addressing Mode 1 - Data-processing operands" forms.  This includes:
+//    REG 0   0           - e.g. R5
+//    REG REG 0,SH_OPC    - e.g. R5, ROR R3
+//    REG 0   IMM,SH_OPC  - e.g. R5, LSL #3
+void ARMInstPrinter::printSORegOperand(const MCInst *MI, unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  const MCOperand &MO3 = MI->getOperand(OpNum+2);
+  
+  O << getRegisterName(MO1.getReg());
+  
+  // Print the shift opc.
+  O << ", "
+    << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm()))
+    << ' ';
+  
+  if (MO2.getReg()) {
+    O << getRegisterName(MO2.getReg());
+    assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+  } else {
+    O << "#" << ARM_AM::getSORegOffset(MO3.getImm());
+  }
+}
+
+
+void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  const MCOperand &MO3 = MI->getOperand(Op+2);
+  
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op);
+    return;
+  }
+  
+  O << "[" << getRegisterName(MO1.getReg());
+  
+  if (!MO2.getReg()) {
+    if (ARM_AM::getAM2Offset(MO3.getImm()))  // Don't print +0.
+      O << ", #"
+      << (char)ARM_AM::getAM2Op(MO3.getImm())
+      << ARM_AM::getAM2Offset(MO3.getImm());
+    O << "]";
+    return;
+  }
+  
+  O << ", "
+  << (char)ARM_AM::getAM2Op(MO3.getImm())
+  << getRegisterName(MO2.getReg());
+  
+  if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm()))
+    O << ", "
+    << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm()))
+    << " #" << ShImm;
+  O << "]";
+}  
+
+void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  
+  if (!MO1.getReg()) {
+    unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm());
+    assert(ImmOffs && "Malformed indexed load / store!");
+    O << '#' << (char)ARM_AM::getAM2Op(MO2.getImm()) << ImmOffs;
+    return;
+  }
+  
+  O << (char)ARM_AM::getAM2Op(MO2.getImm()) << getRegisterName(MO1.getReg());
+  
+  if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm()))
+    O << ", "
+    << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm()))
+    << " #" << ShImm;
+}
+
+void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  const MCOperand &MO3 = MI->getOperand(OpNum+2);
+  
+  O << '[' << getRegisterName(MO1.getReg());
+  
+  if (MO2.getReg()) {
+    O << ", " << (char)ARM_AM::getAM3Op(MO3.getImm())
+      << getRegisterName(MO2.getReg()) << ']';
+    return;
+  }
+  
+  if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()))
+    O << ", #"
+    << (char)ARM_AM::getAM3Op(MO3.getImm())
+    << ImmOffs;
+  O << ']';
+}
+
+void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  
+  if (MO1.getReg()) {
+    O << (char)ARM_AM::getAM3Op(MO2.getImm())
+    << getRegisterName(MO1.getReg());
+    return;
+  }
+  
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm());
+  assert(ImmOffs && "Malformed indexed load / store!");
+  O << "#"
+  << (char)ARM_AM::getAM3Op(MO2.getImm())
+  << ImmOffs;
+}
+
+
+void ARMInstPrinter::printAddrMode4Operand(const MCInst *MI, unsigned OpNum,
+                                           const char *Modifier) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);    // base register operand
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);  // packed AM4 immediate
+  ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
+  if (Modifier && strcmp(Modifier, "submode") == 0) {
+    if (MO1.getReg() == ARM::SP) {  // SP base: print the alternate names
+      // FIXME
+      bool isLDM = (MI->getOpcode() == ARM::LDM ||
+                    MI->getOpcode() == ARM::LDM_RET ||
+                    MI->getOpcode() == ARM::t2LDM ||
+                    MI->getOpcode() == ARM::t2LDM_RET);
+      O << ARM_AM::getAMSubModeAltStr(Mode, isLDM);
+    } else
+      O << ARM_AM::getAMSubModeStr(Mode);
+  } else if (Modifier && strcmp(Modifier, "wide") == 0) {
+    // Mode was already decoded above; only the ia submode gets a ".w" suffix.
+    if (Mode == ARM_AM::ia)
+      O << ".w";
+  } else {
+    printOperand(MI, OpNum);  // no modifier: print the base register itself
+    if (ARM_AM::getAM4WBFlag(MO2.getImm()))
+      O << "!";
+  }
+}
+
+void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
+                                           const char *Modifier) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, OpNum);
+    return;
+  }
+  
+  if (Modifier && strcmp(Modifier, "submode") == 0) {
+    ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MO2.getImm());
+    if (MO1.getReg() == ARM::SP) {
+      bool isFLDM = (MI->getOpcode() == ARM::FLDMD ||
+                     MI->getOpcode() == ARM::FLDMS);
+      O << ARM_AM::getAMSubModeAltStr(Mode, isFLDM);
+    } else
+      O << ARM_AM::getAMSubModeStr(Mode);
+    return;
+  } else if (Modifier && strcmp(Modifier, "base") == 0) {
+    // Used for FSTM{D|S} and FLDM{D|S} operations.
+    O << getRegisterName(MO1.getReg());
+    if (ARM_AM::getAM5WBFlag(MO2.getImm()))
+      O << "!";
+    return;
+  }
+  
+  O << "[" << getRegisterName(MO1.getReg());
+  
+  if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) {
+    O << ", #"
+      << (char)ARM_AM::getAM5Op(MO2.getImm())
+      << ImmOffs*4;
+  }
+  O << "]";
+}
+
+void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  const MCOperand &MO3 = MI->getOperand(OpNum+2);
+  
+  // FIXME: No support yet for specifying alignment.
+  O << '[' << getRegisterName(MO1.getReg()) << ']';
+  
+  if (ARM_AM::getAM6WBFlag(MO3.getImm())) {
+    if (MO2.getReg() == 0)
+      O << '!';
+    else
+      O << ", " << getRegisterName(MO2.getReg());
+  }
+}
+
+void ARMInstPrinter::printAddrModePCOperand(const MCInst *MI, unsigned OpNum,
+                                            const char *Modifier) {
+  assert(0 && "FIXME: Implement printAddrModePCOperand");
+}
+
+void ARMInstPrinter::printBitfieldInvMaskImmOperand (const MCInst *MI,
+                                                     unsigned OpNum) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!");
+  uint32_t v = ~MO.getImm();  // complement of the stored immediate
+  int32_t lsb = CountTrailingZeros_32(v);  // index of lowest set bit of v
+  int32_t width = (32 - CountLeadingZeros_32 (v)) - lsb;  // lsb..highest set
+  O << '#' << lsb << ", #" << width;  // printed as "#lsb, #width"
+}
+
+void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum) {
+  O << "{";
+  // Always skip the first operand; it's the optional (and implicit) writeback.
+  for (unsigned i = OpNum+1, e = MI->getNumOperands(); i != e; ++i) {
+    if (i != OpNum+1) O << ", ";
+    O << getRegisterName(MI->getOperand(i).getReg());
+  }
+  O << "}";
+}
+
+void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum) {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
+  if (CC != ARMCC::AL)
+    O << ARMCondCodeToString(CC);
+}
+
+void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum){
+  if (MI->getOperand(OpNum).getReg()) {
+    assert(MI->getOperand(OpNum).getReg() == ARM::CPSR &&
+           "Expect ARM CPSR register!");
+    O << 's';
+  }
+}
+
+
+
+void ARMInstPrinter::printCPInstOperand(const MCInst *MI, unsigned OpNum,
+                                        const char *Modifier) {
+  // FIXME: remove this.
+  abort();
+}
+
+void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum) {
+  O << MI->getOperand(OpNum).getImm();
+}
+
+
+void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum) {
+  // FIXME: remove this.
+  abort();
+}
diff --git a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h
new file mode 100644
index 000000000000..492513768b27
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h
@@ -0,0 +1,89 @@
+//===-- ARMInstPrinter.h - Convert ARM MCInst to assembly syntax ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMINSTPRINTER_H
+#define ARMINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+  class MCOperand;
+  
+class ARMInstPrinter : public MCInstPrinter {
+  bool VerboseAsm;
+public:
+  ARMInstPrinter(raw_ostream &O, const MCAsmInfo &MAI, bool verboseAsm)
+    : MCInstPrinter(O, MAI), VerboseAsm(verboseAsm) {}
+
+  virtual void printInst(const MCInst *MI);
+  
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI);
+  static const char *getRegisterName(unsigned RegNo);
+
+
+  void printOperand(const MCInst *MI, unsigned OpNo,
+                    const char *Modifier = 0);
+    
+  void printSOImmOperand(const MCInst *MI, unsigned OpNum);
+  void printSOImm2PartOperand(const MCInst *MI, unsigned OpNum);
+  
+  void printSORegOperand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode2Operand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode3Operand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode4Operand(const MCInst *MI, unsigned OpNum,
+                             const char *Modifier = 0);
+  void printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
+                             const char *Modifier = 0);
+  void printAddrMode6Operand(const MCInst *MI, unsigned OpNum);
+  void printAddrModePCOperand(const MCInst *MI, unsigned OpNum,
+                              const char *Modifier = 0);
+    
+  void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum);
+  
+  void printThumbITMask(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeRI5Operand(const MCInst *MI, unsigned OpNum,
+                                    unsigned Scale) {}
+  void printThumbAddrModeS1Operand(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeS2Operand(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeS4Operand(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeSPOperand(const MCInst *MI, unsigned OpNum) {}
+  
+  void printT2SOOperand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeImm12Operand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeImm8OffsetOperand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum) {}
+  
+  void printPredicateOperand(const MCInst *MI, unsigned OpNum);
+  void printSBitModifierOperand(const MCInst *MI, unsigned OpNum);
+  void printRegisterList(const MCInst *MI, unsigned OpNum);
+  void printCPInstOperand(const MCInst *MI, unsigned OpNum,
+                          const char *Modifier);
+  void printJTBlockOperand(const MCInst *MI, unsigned OpNum) {}
+  void printJT2BlockOperand(const MCInst *MI, unsigned OpNum) {}
+  void printTBAddrMode(const MCInst *MI, unsigned OpNum) {}
+  void printNoHashImmediate(const MCInst *MI, unsigned OpNum);
+
+  void printPCLabel(const MCInst *MI, unsigned OpNum);  
+  // FIXME: Implement.
+  void PrintSpecial(const MCInst *MI, const char *Kind) {}
+};
+  
+}
+
+#endif
diff --git a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp
new file mode 100644
index 000000000000..757164e682af
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp
@@ -0,0 +1,166 @@
+//===-- ARMMCInstLower.cpp - Convert ARM MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower ARM MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCInstLower.h"
+//#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+//#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/ADT/SmallString.h"
+using namespace llvm;
+
+
+#if 0
+const ARMSubtarget &ARMMCInstLower::getSubtarget() const {
+  return AsmPrinter.getSubtarget();
+}
+
+MachineModuleInfoMachO &ARMMCInstLower::getMachOMMI() const {
+  assert(getSubtarget().isTargetDarwin() &&"Can only get MachO info on darwin");
+  return AsmPrinter.MMI->getObjFileInfo<MachineModuleInfoMachO>(); 
+}
+#endif
+
+MCSymbol *ARMMCInstLower::
+GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  const GlobalValue *GV = MO.getGlobal();
+  
+  SmallString<128> Name;
+  Mang.getNameWithPrefix(Name, GV, false);
+  
+  // FIXME: HANDLE PLT references how??
+  switch (MO.getTargetFlags()) {
+  default: assert(0 && "Unknown target flag on GV operand");
+  case 0: break;
+  }
+  
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *ARMMCInstLower::
+GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  SmallString<128> Name;
+  Name += Printer.MAI->getGlobalPrefix();
+  Name += MO.getSymbolName();
+  
+  // FIXME: HANDLE PLT references how??
+  switch (MO.getTargetFlags()) {
+  default: assert(0 && "Unknown target flag on GV operand");
+  case 0: break;
+  }
+  
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+
+
+MCSymbol *ARMMCInstLower::
+GetJumpTableSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "JTI"
+    << Printer.getFunctionNumber() << '_' << MO.getIndex();
+  
+#if 0
+  switch (MO.getTargetFlags()) {
+    default: llvm_unreachable("Unknown target flag on GV operand");
+  }
+#endif
+  
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *ARMMCInstLower::
+GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "CPI"
+    << Printer.getFunctionNumber() << '_' << MO.getIndex();
+  
+#if 0
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  }
+#endif
+  
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+  
+MCOperand ARMMCInstLower::
+LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+  
+#if 0
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  }
+#endif
+  
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+                                   Ctx);
+  return MCOperand::CreateExpr(Expr);
+}
+
+
+void ARMMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+  
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      assert(0 && "unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                       Printer.GetMBBSymbol(MO.getMBB()->getNumber()), Ctx));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+      break;
+    }
+    
+    OutMI.addOperand(MCOp);
+  }
+  
+}
diff --git a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.h b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.h
new file mode 100644
index 000000000000..383d30d5de56
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.h
@@ -0,0 +1,56 @@
+//===-- ARMMCInstLower.h - Lower MachineInstr to MCInst -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM_MCINSTLOWER_H
+#define ARM_MCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class AsmPrinter;
+  class MCAsmInfo;
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MCSymbol;
+  class MachineInstr;
+  class MachineModuleInfoMachO;
+  class MachineOperand;
+  class Mangler;
+  //class ARMSubtarget;
+  
+/// ARMMCInstLower - This class is used to lower a MachineInstr into an MCInst.
+class VISIBILITY_HIDDEN ARMMCInstLower {
+  MCContext &Ctx;
+  Mangler &Mang;
+  AsmPrinter &Printer;
+
+  //const ARMSubtarget &getSubtarget() const;
+public:
+  ARMMCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer)
+    : Ctx(ctx), Mang(mang), Printer(printer) {}
+  
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  //MCSymbol *GetPICBaseSymbol() const;
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+  
+/*
+private:
+  MachineModuleInfoMachO &getMachOMMI() const;
+ */
+};
+
+}
+
+#endif
diff --git a/lib/Target/ARM/AsmPrinter/CMakeLists.txt b/lib/Target/ARM/AsmPrinter/CMakeLists.txt
index a67fc8471a63..4e299f86ecb6 100644
--- a/lib/Target/ARM/AsmPrinter/CMakeLists.txt
+++ b/lib/Target/ARM/AsmPrinter/CMakeLists.txt
@@ -2,5 +2,7 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
 
 add_llvm_library(LLVMARMAsmPrinter
   ARMAsmPrinter.cpp
+  ARMInstPrinter.cpp
+  ARMMCInstLower.cpp
   )
-add_dependencies(LLVMARMAsmPrinter ARMCodeGenTable_gen)
-\ No newline at end of file
+add_dependencies(LLVMARMAsmPrinter ARMCodeGenTable_gen)
diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt
index a961a576f40d..e7770b2292e1 100644
--- a/lib/Target/ARM/README-Thumb.txt
+++ b/lib/Target/ARM/README-Thumb.txt
@@ -196,14 +196,6 @@ This is especially bad when dynamic alloca is used. The all fixed size stack
 objects are referenced off the frame pointer with negative offsets. See
 oggenc for an example.
 
-//===---------------------------------------------------------------------===//
-
-We are reserving R3 as a scratch register under thumb mode. So if it is live in
-to the function, we save / restore R3 to / from R12. Until register scavenging
-is done, we should save R3 to a high callee saved reg at emitPrologue time
-(when hasFP is true or stack size is large) and restore R3 from that register
-instead. This allows us to at least get rid of the save to r12 everytime it is
-used.
 
 //===---------------------------------------------------------------------===//
 
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index 3c896da4c0ca..6207177b9969 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -32,7 +32,6 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
@@ -394,31 +393,48 @@ rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
   return 0;
 }
 
-/// saveScavengerRegister - Save the register so it can be used by the
+/// saveScavengerRegister - Spill the register so it can be used by the
 /// register scavenger. Return true.
-bool Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
-                                               MachineBasicBlock::iterator I,
-                                               const TargetRegisterClass *RC,
-                                               unsigned Reg) const {
+bool
+Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          MachineBasicBlock::iterator &UseMI,
+                                          const TargetRegisterClass *RC,
+                                          unsigned Reg) const {
   // Thumb1 can't use the emergency spill slot on the stack because
   // ldr/str immediate offsets must be positive, and if we're referencing
   // off the frame pointer (if, for example, there are alloca() calls in
   // the function, the offset will be negative. Use R12 instead since that's
   // a call clobbered register that we know won't be used in Thumb1 mode.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  BuildMI(MBB, I, DL, TII.get(ARM::tMOVtgpr2gpr)).
+    addReg(ARM::R12, RegState::Define).addReg(Reg, RegState::Kill);
+
+  // UseMI is where we would like to restore the register. If an instruction
+  // before then references R12, however, we must restore just ahead of that
+  // instruction instead and update UseMI to match.
+  bool done = false;
+  for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) {
+    // If this instruction affects R12, adjust our restore point.
+    for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = II->getOperand(i);
+      if (!MO.isReg() || MO.isUndef() || !MO.getReg() ||
+          TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+        continue;
+      if (MO.getReg() == ARM::R12) {
+        UseMI = II;
+        done = true;
+        break;
+      }
+    }
+  }
+  // Restore the register from R12
+  BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVgpr2tgpr)).
+    addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill);
 
-  TII.copyRegToReg(MBB, I, ARM::R12, Reg, ARM::GPRRegisterClass, RC);
   return true;
 }
 
-/// restoreScavengerRegister - restore a registers saved by
-// saveScavengerRegister().
-void Thumb1RegisterInfo::restoreScavengerRegister(MachineBasicBlock &MBB,
-                                               MachineBasicBlock::iterator I,
-                                               const TargetRegisterClass *RC,
-                                               unsigned Reg) const {
-  TII.copyRegToReg(MBB, I, Reg, ARM::R12, RC, ARM::GPRRegisterClass);
-}
-
 unsigned
 Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                         int SPAdj, int *Value,
@@ -828,7 +844,6 @@ void Thumb1RegisterInfo::emitEpilogue(MachineFunction &MF,
 
   if (VARegSaveSize) {
     // Epilogue for vararg functions: pop LR to R3 and branch off it.
-    // FIXME: Verify this is still ok when R3 is no longer being reserved.
     AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
       .addReg(0) // No write back.
       .addReg(ARM::R3, RegState::Define);
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h
index bb7a6199d10d..570a5bc8c2ec 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.h
+++ b/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -57,12 +57,9 @@ public:
 
   bool saveScavengerRegister(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I,
+                             MachineBasicBlock::iterator &UseMI,
                              const TargetRegisterClass *RC,
                              unsigned Reg) const;
-  void restoreScavengerRegister(MachineBasicBlock &MBB,
-                                MachineBasicBlock::iterator I,
-                                const TargetRegisterClass *RC,
-                                unsigned Reg) const;
   unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
                                int SPAdj, int *Value = NULL,
                                RegScavenger *RS = NULL) const;
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 98b5cbdfb98f..427c0bb22b26 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -107,8 +107,12 @@ bool Thumb2ITBlockPass::InsertITBlocks(MachineBasicBlock &MBB) {
     // Finalize IT mask.
     ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
     unsigned Mask = 0, Pos = 3;
-    while (MBBI != E && Pos) {
+    // Branches, including tricky ones like LDM_RET, must be the last
+    // instruction in an IT block, so check the one we just put in the block.
+    while (MBBI != E && Pos &&
+           (!MI->getDesc().isBranch() && !MI->getDesc().isReturn())) {
       MachineInstr *NMI = &*MBBI;
+      MI = NMI;
       DebugLoc ndl = NMI->getDebugLoc();
       unsigned NPredReg = 0;
       ARMCC::CondCodes NCC = getPredicate(NMI, NPredReg);
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp
index 6c4c15dfe354..f217e0e28f75 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp
@@ -32,7 +32,6 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 using namespace llvm;