about summary refs log tree commit diff
path: root/lib/Target/Hexagon/HexagonISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/Hexagon/HexagonISelLowering.cpp')
-rw-r--r--  lib/Target/Hexagon/HexagonISelLowering.cpp  1937
1 files changed, 906 insertions, 1031 deletions
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 0e0da2ddc400..604d84994b6c 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -103,427 +104,52 @@ static cl::opt<int> MaxStoresPerMemsetOptSizeCL("max-store-memset-Os",
cl::Hidden, cl::ZeroOrMore, cl::init(4),
cl::desc("Max #stores to inline memset"));
+static cl::opt<bool> AlignLoads("hexagon-align-loads",
+ cl::Hidden, cl::init(false),
+ cl::desc("Rewrite unaligned loads as a pair of aligned loads"));
+
namespace {
class HexagonCCState : public CCState {
- unsigned NumNamedVarArgParams;
+ unsigned NumNamedVarArgParams = 0;
public:
- HexagonCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+ HexagonCCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF,
SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
- int NumNamedVarArgParams)
- : CCState(CC, isVarArg, MF, locs, C),
- NumNamedVarArgParams(NumNamedVarArgParams) {}
-
+ unsigned NumNamedArgs)
+ : CCState(CC, IsVarArg, MF, locs, C),
+ NumNamedVarArgParams(NumNamedArgs) {}
unsigned getNumNamedVarArgParams() const { return NumNamedVarArgParams; }
};
- enum StridedLoadKind {
- Even = 0,
- Odd,
- NoPattern
- };
-
} // end anonymous namespace
-// Implement calling convention for Hexagon.
-
-static const MVT LegalV64[] = { MVT::v64i8, MVT::v32i16, MVT::v16i32 };
-static const MVT LegalW64[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
-static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
-static const MVT LegalW128[] = { MVT::v256i8, MVT::v128i16, MVT::v64i32 };
-
-static bool
-CC_Hexagon(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-CC_Hexagon32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-CC_Hexagon64(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-CC_HexagonVector(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-RetCC_Hexagon(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- HexagonCCState &HState = static_cast<HexagonCCState &>(State);
-
- if (ValNo < HState.getNumNamedVarArgParams()) {
- // Deal with named arguments.
- return CC_Hexagon(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State);
- }
-
- // Deal with un-named arguments.
- unsigned Offset;
- if (ArgFlags.isByVal()) {
- // If pass-by-value, the size allocated on stack is decided
- // by ArgFlags.getByValSize(), not by the size of LocVT.
- Offset = State.AllocateStack(ArgFlags.getByValSize(),
- ArgFlags.getByValAlign());
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
- LocVT = MVT::i32;
- ValVT = MVT::i32;
- if (ArgFlags.isSExt())
- LocInfo = CCValAssign::SExt;
- else if (ArgFlags.isZExt())
- LocInfo = CCValAssign::ZExt;
- else
- LocInfo = CCValAssign::AExt;
- }
- if (LocVT == MVT::i32 || LocVT == MVT::f32) {
- Offset = State.AllocateStack(4, 4);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::i64 || LocVT == MVT::f64) {
- Offset = State.AllocateStack(8, 8);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v2i64 || LocVT == MVT::v4i32 || LocVT == MVT::v8i16 ||
- LocVT == MVT::v16i8) {
- Offset = State.AllocateStack(16, 16);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v4i64 || LocVT == MVT::v8i32 || LocVT == MVT::v16i16 ||
- LocVT == MVT::v32i8) {
- Offset = State.AllocateStack(32, 32);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v16i32 || LocVT == MVT::v32i16 ||
- LocVT == MVT::v64i8 || LocVT == MVT::v512i1) {
- Offset = State.AllocateStack(64, 64);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v32i32 || LocVT == MVT::v64i16 ||
- LocVT == MVT::v128i8 || LocVT == MVT::v1024i1) {
- Offset = State.AllocateStack(128, 128);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v64i32 || LocVT == MVT::v128i16 ||
- LocVT == MVT::v256i8) {
- Offset = State.AllocateStack(256, 256);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
-
- llvm_unreachable(nullptr);
-}
-
-static bool CC_Hexagon (unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) {
- if (ArgFlags.isByVal()) {
- // Passed on stack.
- unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(),
- ArgFlags.getByValAlign());
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
-
- if (LocVT == MVT::i1) {
- LocVT = MVT::i32;
- } else if (LocVT == MVT::i8 || LocVT == MVT::i16) {
- LocVT = MVT::i32;
- ValVT = MVT::i32;
- if (ArgFlags.isSExt())
- LocInfo = CCValAssign::SExt;
- else if (ArgFlags.isZExt())
- LocInfo = CCValAssign::ZExt;
- else
- LocInfo = CCValAssign::AExt;
- } else if (LocVT == MVT::v4i8 || LocVT == MVT::v2i16) {
- LocVT = MVT::i32;
- LocInfo = CCValAssign::BCvt;
- } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) {
- LocVT = MVT::i64;
- LocInfo = CCValAssign::BCvt;
- }
-
- if (LocVT == MVT::i32 || LocVT == MVT::f32) {
- if (!CC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
-
- if (LocVT == MVT::i64 || LocVT == MVT::f64) {
- if (!CC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
-
- if (LocVT == MVT::v8i32 || LocVT == MVT::v16i16 || LocVT == MVT::v32i8) {
- unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(), 32);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
-
- auto &HST = State.getMachineFunction().getSubtarget<HexagonSubtarget>();
- if (HST.isHVXVectorType(LocVT)) {
- if (!CC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
-
- return true; // CC didn't match.
-}
+// Implement calling convention for Hexagon.
-static bool CC_Hexagon32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- static const MCPhysReg RegList[] = {
- Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
- Hexagon::R5
+static bool CC_SkipOdd(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ static const MCPhysReg ArgRegs[] = {
+ Hexagon::R0, Hexagon::R1, Hexagon::R2,
+ Hexagon::R3, Hexagon::R4, Hexagon::R5
};
- if (unsigned Reg = State.AllocateReg(RegList)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
+ const unsigned NumArgRegs = array_lengthof(ArgRegs);
+ unsigned RegNum = State.getFirstUnallocated(ArgRegs);
- unsigned Offset = State.AllocateStack(4, 4);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
-}
-
-static bool CC_Hexagon64(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- if (unsigned Reg = State.AllocateReg(Hexagon::D0)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
-
- static const MCPhysReg RegList1[] = {
- Hexagon::D1, Hexagon::D2
- };
- static const MCPhysReg RegList2[] = {
- Hexagon::R1, Hexagon::R3
- };
- if (unsigned Reg = State.AllocateReg(RegList1, RegList2)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
+ // RegNum is an index into ArgRegs: skip a register if RegNum is odd.
+ if (RegNum != NumArgRegs && RegNum % 2 == 1)
+ State.AllocateReg(ArgRegs[RegNum]);
- unsigned Offset = State.AllocateStack(8, 8, Hexagon::D2);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ // Always return false here, as this function only makes sure that the first
+ // unallocated register has an even register number and does not actually
+ // allocate a register for the current argument.
return false;
}
-static bool CC_HexagonVector(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- static const MCPhysReg VecLstS[] = {
- Hexagon::V0, Hexagon::V1, Hexagon::V2, Hexagon::V3, Hexagon::V4,
- Hexagon::V5, Hexagon::V6, Hexagon::V7, Hexagon::V8, Hexagon::V9,
- Hexagon::V10, Hexagon::V11, Hexagon::V12, Hexagon::V13, Hexagon::V14,
- Hexagon::V15
- };
- static const MCPhysReg VecLstD[] = {
- Hexagon::W0, Hexagon::W1, Hexagon::W2, Hexagon::W3, Hexagon::W4,
- Hexagon::W5, Hexagon::W6, Hexagon::W7
- };
- auto &MF = State.getMachineFunction();
- auto &HST = MF.getSubtarget<HexagonSubtarget>();
-
- if (HST.useHVX64BOps() &&
- (LocVT == MVT::v16i32 || LocVT == MVT::v32i16 ||
- LocVT == MVT::v64i8 || LocVT == MVT::v512i1)) {
- if (unsigned Reg = State.AllocateReg(VecLstS)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- unsigned Offset = State.AllocateStack(64, 64);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (HST.useHVX64BOps() && (LocVT == MVT::v32i32 ||
- LocVT == MVT::v64i16 || LocVT == MVT::v128i8)) {
- if (unsigned Reg = State.AllocateReg(VecLstD)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- unsigned Offset = State.AllocateStack(128, 128);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- // 128B Mode
- if (HST.useHVX128BOps() && (LocVT == MVT::v64i32 ||
- LocVT == MVT::v128i16 || LocVT == MVT::v256i8)) {
- if (unsigned Reg = State.AllocateReg(VecLstD)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- unsigned Offset = State.AllocateStack(256, 256);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (HST.useHVX128BOps() &&
- (LocVT == MVT::v32i32 || LocVT == MVT::v64i16 ||
- LocVT == MVT::v128i8 || LocVT == MVT::v1024i1)) {
- if (unsigned Reg = State.AllocateReg(VecLstS)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- unsigned Offset = State.AllocateStack(128, 128);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- return true;
-}
-
-static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- auto &MF = State.getMachineFunction();
- auto &HST = MF.getSubtarget<HexagonSubtarget>();
-
- if (LocVT == MVT::i1) {
- // Return values of type MVT::i1 still need to be assigned to R0, but
- // the value type needs to remain i1. LowerCallResult will deal with it,
- // but it needs to recognize i1 as the value type.
- LocVT = MVT::i32;
- } else if (LocVT == MVT::i8 || LocVT == MVT::i16) {
- LocVT = MVT::i32;
- ValVT = MVT::i32;
- if (ArgFlags.isSExt())
- LocInfo = CCValAssign::SExt;
- else if (ArgFlags.isZExt())
- LocInfo = CCValAssign::ZExt;
- else
- LocInfo = CCValAssign::AExt;
- } else if (LocVT == MVT::v4i8 || LocVT == MVT::v2i16) {
- LocVT = MVT::i32;
- LocInfo = CCValAssign::BCvt;
- } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) {
- LocVT = MVT::i64;
- LocInfo = CCValAssign::BCvt;
- } else if (LocVT == MVT::v64i8 || LocVT == MVT::v32i16 ||
- LocVT == MVT::v16i32 || LocVT == MVT::v512i1) {
- LocVT = MVT::v16i32;
- ValVT = MVT::v16i32;
- LocInfo = CCValAssign::Full;
- } else if (LocVT == MVT::v128i8 || LocVT == MVT::v64i16 ||
- LocVT == MVT::v32i32 ||
- (LocVT == MVT::v1024i1 && HST.useHVX128BOps())) {
- LocVT = MVT::v32i32;
- ValVT = MVT::v32i32;
- LocInfo = CCValAssign::Full;
- } else if (LocVT == MVT::v256i8 || LocVT == MVT::v128i16 ||
- LocVT == MVT::v64i32) {
- LocVT = MVT::v64i32;
- ValVT = MVT::v64i32;
- LocInfo = CCValAssign::Full;
- }
- if (LocVT == MVT::i32 || LocVT == MVT::f32) {
- if (!RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
-
- if (LocVT == MVT::i64 || LocVT == MVT::f64) {
- if (!RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
- if (LocVT == MVT::v16i32 || LocVT == MVT::v32i32 || LocVT == MVT::v64i32) {
- if (!RetCC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
- return true; // CC didn't match.
-}
-
-static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- if (LocVT == MVT::i32 || LocVT == MVT::f32) {
- // Note that use of registers beyond R1 is not ABI compliant. However there
- // are (experimental) IR passes which generate internal functions that
- // return structs using these additional registers.
- static const uint16_t RegList[] = { Hexagon::R0, Hexagon::R1,
- Hexagon::R2, Hexagon::R3,
- Hexagon::R4, Hexagon::R5 };
- if (unsigned Reg = State.AllocateReg(RegList)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- }
-
- return true;
-}
-
-static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- if (LocVT == MVT::i64 || LocVT == MVT::f64) {
- if (unsigned Reg = State.AllocateReg(Hexagon::D0)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- }
-
- return true;
-}
+#include "HexagonGenCallingConv.inc"
-static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- auto &MF = State.getMachineFunction();
- auto &HST = MF.getSubtarget<HexagonSubtarget>();
-
- if (LocVT == MVT::v16i32) {
- if (unsigned Reg = State.AllocateReg(Hexagon::V0)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- } else if (LocVT == MVT::v32i32) {
- unsigned Req = HST.useHVX128BOps() ? Hexagon::V0 : Hexagon::W0;
- if (unsigned Reg = State.AllocateReg(Req)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- } else if (LocVT == MVT::v64i32) {
- if (unsigned Reg = State.AllocateReg(Hexagon::W0)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- }
-
- return true;
-}
void HexagonTargetLowering::promoteLdStType(MVT VT, MVT PromotedLdStVT) {
if (VT != PromotedLdStVT) {
@@ -558,11 +184,14 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
bool
HexagonTargetLowering::CanLowerReturn(
- CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+ CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+ CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+
+ if (MF.getSubtarget<HexagonSubtarget>().useHVXOps())
+ return CCInfo.CheckReturn(Outs, RetCC_Hexagon_HVX);
return CCInfo.CheckReturn(Outs, RetCC_Hexagon);
}
@@ -571,7 +200,7 @@ HexagonTargetLowering::CanLowerReturn(
// the value is stored in memory pointed by a pointer passed by caller.
SDValue
HexagonTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg,
+ bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
@@ -579,11 +208,14 @@ HexagonTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<CCValAssign, 16> RVLocs;
// CCState - Info about the registers and stack slot.
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
// Analyze return values of ISD::RET
- CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon);
+ if (Subtarget.useHVXOps())
+ CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon_HVX);
+ else
+ CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon);
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -624,17 +256,20 @@ bool HexagonTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
/// being lowered. Returns a SDNode with the same number of values as the
/// ISD::CALL.
SDValue HexagonTargetLowering::LowerCallResult(
- SDValue Chain, SDValue Glue, CallingConv::ID CallConv, bool isVarArg,
+ SDValue Chain, SDValue Glue, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
const SmallVectorImpl<SDValue> &OutVals, SDValue Callee) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
- CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon);
+ if (Subtarget.useHVXOps())
+ CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon_HVX);
+ else
+ CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -683,67 +318,57 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
- bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
bool DoesNotReturn = CLI.DoesNotReturn;
- bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+ bool IsStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
auto PtrVT = getPointerTy(MF.getDataLayout());
- // Check for varargs.
- unsigned NumNamedVarArgParams = -1U;
- if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee)) {
- const GlobalValue *GV = GAN->getGlobal();
- Callee = DAG.getTargetGlobalAddress(GV, dl, MVT::i32);
- if (const Function* F = dyn_cast<Function>(GV)) {
- // If a function has zero args and is a vararg function, that's
- // disallowed so it must be an undeclared function. Do not assume
- // varargs if the callee is undefined.
- if (F->isVarArg() && F->getFunctionType()->getNumParams() != 0)
- NumNamedVarArgParams = F->getFunctionType()->getNumParams();
- }
- }
+ unsigned NumParams = CLI.CS.getInstruction()
+ ? CLI.CS.getFunctionType()->getNumParams()
+ : 0;
+ if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, MVT::i32);
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- HexagonCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext(), NumNamedVarArgParams);
+ HexagonCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext(),
+ NumParams);
- if (IsVarArg)
- CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_VarArg);
+ if (Subtarget.useHVXOps())
+ CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_HVX);
else
CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon);
auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
if (Attr.getValueAsString() == "true")
- IsTailCall = false;
+ CLI.IsTailCall = false;
- if (IsTailCall) {
+ if (CLI.IsTailCall) {
bool StructAttrFlag = MF.getFunction().hasStructRetAttr();
- IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
- IsVarArg, IsStructRet,
- StructAttrFlag,
- Outs, OutVals, Ins, DAG);
+ CLI.IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+ IsVarArg, IsStructRet, StructAttrFlag, Outs,
+ OutVals, Ins, DAG);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (VA.isMemLoc()) {
- IsTailCall = false;
+ CLI.IsTailCall = false;
break;
}
}
- DEBUG(dbgs() << (IsTailCall ? "Eligible for Tail Call\n"
- : "Argument must be passed on stack. "
- "Not eligible for Tail Call\n"));
+ LLVM_DEBUG(dbgs() << (CLI.IsTailCall ? "Eligible for Tail Call\n"
+ : "Argument must be passed on stack. "
+ "Not eligible for Tail Call\n"));
}
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
- auto &HRI = *Subtarget.getRegisterInfo();
+ const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
SDValue StackPtr =
DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), PtrVT);
@@ -789,7 +414,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
VA.getLocVT().getStoreSizeInBits() >> 3);
if (Flags.isByVal()) {
// The argument is a struct passed by value. According to LLVM, "Arg"
- // is is pointer.
+ // is a pointer.
MemOpChains.push_back(CreateCopyOfByValArgument(Arg, MemAddr, Chain,
Flags, DAG, dl));
} else {
@@ -807,14 +432,10 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
}
- if (NeedsArgAlign && Subtarget.hasV60TOps()) {
- DEBUG(dbgs() << "Function needs byte stack align due to call args\n");
- // V6 vectors passed by value have 64 or 128 byte alignment depending
- // on whether we are 64 byte vector mode or 128 byte.
- bool UseHVX128B = Subtarget.useHVX128BOps();
- assert(Subtarget.useHVXOps());
- const unsigned ObjAlign = UseHVX128B ? 128 : 64;
- LargestAlignSeen = std::max(LargestAlignSeen, ObjAlign);
+ if (NeedsArgAlign && Subtarget.hasV60Ops()) {
+ LLVM_DEBUG(dbgs() << "Function needs byte stack align due to call args\n");
+ unsigned VecAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass);
+ LargestAlignSeen = std::max(LargestAlignSeen, VecAlign);
MFI.ensureMaxAlignment(LargestAlignSeen);
}
// Transform all store nodes into one single node because all store
@@ -823,7 +444,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
SDValue Glue;
- if (!IsTailCall) {
+ if (!CLI.IsTailCall) {
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
Glue = Chain.getValue(1);
}
@@ -832,7 +453,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// chain and flag operands which copy the outgoing args into registers.
// The Glue is necessary since all emitted instructions must be
// stuck together.
- if (!IsTailCall) {
+ if (!CLI.IsTailCall) {
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, Glue);
@@ -891,7 +512,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Glue.getNode())
Ops.push_back(Glue);
- if (IsTailCall) {
+ if (CLI.IsTailCall) {
MFI.setHasTailCall();
return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, Ops);
}
@@ -916,66 +537,36 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InVals, OutVals, Callee);
}
-static bool getIndexedAddressParts(SDNode *Ptr, EVT VT,
- SDValue &Base, SDValue &Offset,
- bool &IsInc, SelectionDAG &DAG) {
- if (Ptr->getOpcode() != ISD::ADD)
- return false;
-
- auto &HST = static_cast<const HexagonSubtarget&>(DAG.getSubtarget());
-
- bool ValidHVX128BType =
- HST.useHVX128BOps() && (VT == MVT::v32i32 ||
- VT == MVT::v64i16 || VT == MVT::v128i8);
- bool ValidHVXType =
- HST.useHVX64BOps() && (VT == MVT::v16i32 ||
- VT == MVT::v32i16 || VT == MVT::v64i8);
-
- if (ValidHVX128BType || ValidHVXType || VT == MVT::i64 || VT == MVT::i32 ||
- VT == MVT::i16 || VT == MVT::i8) {
- IsInc = (Ptr->getOpcode() == ISD::ADD);
- Base = Ptr->getOperand(0);
- Offset = Ptr->getOperand(1);
- // Ensure that Offset is a constant.
- return isa<ConstantSDNode>(Offset);
- }
-
- return false;
-}
-
-/// getPostIndexedAddressParts - returns true by value, base pointer and
-/// offset pointer and addressing mode by reference if this node can be
-/// combined with a load / store to form a post-indexed load / store.
+/// Returns true by value, base pointer and offset pointer and addressing
+/// mode by reference if this node can be combined with a load / store to
+/// form a post-indexed load / store.
bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
- SDValue &Base,
- SDValue &Offset,
- ISD::MemIndexedMode &AM,
- SelectionDAG &DAG) const
-{
- EVT VT;
-
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
- VT = LD->getMemoryVT();
- } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
- VT = ST->getMemoryVT();
- if (ST->getValue().getValueType() == MVT::i64 && ST->isTruncatingStore())
- return false;
- } else {
+ SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const {
+ LSBaseSDNode *LSN = dyn_cast<LSBaseSDNode>(N);
+ if (!LSN)
+ return false;
+ EVT VT = LSN->getMemoryVT();
+ if (!VT.isSimple())
+ return false;
+ bool IsLegalType = VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
+ VT == MVT::i64 || VT == MVT::f32 || VT == MVT::f64 ||
+ VT == MVT::v2i16 || VT == MVT::v2i32 || VT == MVT::v4i8 ||
+ VT == MVT::v4i16 || VT == MVT::v8i8 ||
+ Subtarget.isHVXVectorType(VT.getSimpleVT());
+ if (!IsLegalType)
return false;
- }
- bool IsInc = false;
- bool isLegal = getIndexedAddressParts(Op, VT, Base, Offset, IsInc, DAG);
- if (isLegal) {
- auto &HII = *Subtarget.getInstrInfo();
- int32_t OffsetVal = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
- if (HII.isValidAutoIncImm(VT, OffsetVal)) {
- AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
- return true;
- }
- }
+ if (Op->getOpcode() != ISD::ADD)
+ return false;
+ Base = Op->getOperand(0);
+ Offset = Op->getOperand(1);
+ if (!isa<ConstantSDNode>(Offset.getNode()))
+ return false;
+ AM = ISD::POST_INC;
- return false;
+ int32_t V = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
+ return Subtarget.getInstrInfo()->isValidAutoIncImm(VT, V);
}
SDValue
@@ -1080,7 +671,7 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
if (A == 0)
A = HFI.getStackAlignment();
- DEBUG({
+ LLVM_DEBUG({
dbgs () << __func__ << " Align: " << A << " Size: ";
Size.getNode()->dump(&DAG);
dbgs() << "\n";
@@ -1095,20 +686,22 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
}
SDValue HexagonTargetLowering::LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
- MachineRegisterInfo &RegInfo = MF.getRegInfo();
- auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ HexagonCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext(),
+ MF.getFunction().getFunctionType()->getNumParams());
- CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon);
+ if (Subtarget.useHVXOps())
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon_HVX);
+ else
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon);
// For LLVM, in the case when returning a struct by value (>8byte),
// the first argument is a pointer that points to the location on caller's
@@ -1117,110 +710,62 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
// equal to) 8 bytes. If not, no address will be passed into callee and
// callee return the result direclty through R0/R1.
- SmallVector<SDValue, 8> MemOps;
+ auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
ISD::ArgFlagsTy Flags = Ins[i].Flags;
- unsigned ObjSize;
- unsigned StackLocation;
- int FI;
-
- if ( (VA.isRegLoc() && !Flags.isByVal())
- || (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() > 8)) {
- // Arguments passed in registers
- // 1. int, long long, ptr args that get allocated in register.
- // 2. Large struct that gets an register to put its address in.
- EVT RegVT = VA.getLocVT();
- if (RegVT == MVT::i8 || RegVT == MVT::i16 ||
- RegVT == MVT::i32 || RegVT == MVT::f32) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- if (VA.getLocInfo() == CCValAssign::BCvt)
- RegVT = VA.getValVT();
- SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
- // Treat values of type MVT::i1 specially: they are passed in
- // registers of type i32, but they need to remain as values of
- // type i1 for consistency of the argument lowering.
- if (VA.getValVT() == MVT::i1) {
- // Generate a copy into a predicate register and use the value
- // of the register as the "InVal".
- unsigned PReg =
- RegInfo.createVirtualRegister(&Hexagon::PredRegsRegClass);
- SDNode *T = DAG.getMachineNode(Hexagon::C2_tfrrp, dl, MVT::i1,
- Copy.getValue(0));
- Copy = DAG.getCopyToReg(Copy.getValue(1), dl, PReg, SDValue(T, 0));
- Copy = DAG.getCopyFromReg(Copy, dl, PReg, MVT::i1);
- }
- InVals.push_back(Copy);
- Chain = Copy.getValue(1);
- } else if (RegVT == MVT::i64 || RegVT == MVT::f64) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- if (VA.getLocInfo() == CCValAssign::BCvt)
- RegVT = VA.getValVT();
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
-
- // Single Vector
- } else if ((RegVT == MVT::v16i32 ||
- RegVT == MVT::v32i16 || RegVT == MVT::v64i8)) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxVRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
- } else if (Subtarget.useHVX128BOps() &&
- ((RegVT == MVT::v32i32 ||
- RegVT == MVT::v64i16 || RegVT == MVT::v128i8))) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxVRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
-
- // Double Vector
- } else if ((RegVT == MVT::v32i32 ||
- RegVT == MVT::v64i16 || RegVT == MVT::v128i8)) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxWRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
- } else if (Subtarget.useHVX128BOps() &&
- ((RegVT == MVT::v64i32 ||
- RegVT == MVT::v128i16 || RegVT == MVT::v256i8))) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxWRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
- } else if (RegVT == MVT::v512i1 || RegVT == MVT::v1024i1) {
- assert(0 && "need to support VecPred regs");
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxQRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+ bool ByVal = Flags.isByVal();
+
+ // Arguments passed in registers:
+ // 1. 32- and 64-bit values and HVX vectors are passed directly,
+ // 2. Large structs are passed via an address, and the address is
+ // passed in a register.
+ if (VA.isRegLoc() && ByVal && Flags.getByValSize() <= 8)
+ llvm_unreachable("ByValSize must be bigger than 8 bytes");
+
+ bool InReg = VA.isRegLoc() &&
+ (!ByVal || (ByVal && Flags.getByValSize() > 8));
+
+ if (InReg) {
+ MVT RegVT = VA.getLocVT();
+ if (VA.getLocInfo() == CCValAssign::BCvt)
+ RegVT = VA.getValVT();
+
+ const TargetRegisterClass *RC = getRegClassFor(RegVT);
+ unsigned VReg = MRI.createVirtualRegister(RC);
+ SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+
+ // Treat values of type MVT::i1 specially: they are passed in
+ // registers of type i32, but they need to remain as values of
+ // type i1 for consistency of the argument lowering.
+ if (VA.getValVT() == MVT::i1) {
+ assert(RegVT.getSizeInBits() <= 32);
+ SDValue T = DAG.getNode(ISD::AND, dl, RegVT,
+ Copy, DAG.getConstant(1, dl, RegVT));
+ Copy = DAG.getSetCC(dl, MVT::i1, T, DAG.getConstant(0, dl, RegVT),
+ ISD::SETNE);
} else {
- assert (0);
+#ifndef NDEBUG
+ unsigned RegSize = RegVT.getSizeInBits();
+ assert(RegSize == 32 || RegSize == 64 ||
+ Subtarget.isHVXVectorType(RegVT));
+#endif
}
- } else if (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() <= 8) {
- assert (0 && "ByValSize must be bigger than 8 bytes");
+ InVals.push_back(Copy);
+ MRI.addLiveIn(VA.getLocReg(), VReg);
} else {
- // Sanity check.
- assert(VA.isMemLoc());
-
- if (Flags.isByVal()) {
- // If it's a byval parameter, then we need to compute the
- // "real" size, not the size of the pointer.
- ObjSize = Flags.getByValSize();
- } else {
- ObjSize = VA.getLocVT().getStoreSizeInBits() >> 3;
- }
+ assert(VA.isMemLoc() && "Argument should be passed in memory");
- StackLocation = HEXAGON_LRFP_SIZE + VA.getLocMemOffset();
- // Create the frame index object for this incoming parameter...
- FI = MFI.CreateFixedObject(ObjSize, StackLocation, true);
+ // If it's a byval parameter, then we need to compute the
+ // "real" size, not the size of the pointer.
+ unsigned ObjSize = Flags.isByVal()
+ ? Flags.getByValSize()
+ : VA.getLocVT().getStoreSizeInBits() / 8;
- // Create the SelectionDAG nodes cordl, responding to a load
- // from this parameter.
+ // Create the frame index object for this incoming parameter.
+ int Offset = HEXAGON_LRFP_SIZE + VA.getLocMemOffset();
+ int FI = MFI.CreateFixedObject(ObjSize, Offset, true);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
if (Flags.isByVal()) {
@@ -1229,22 +774,19 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
// location.
InVals.push_back(FIN);
} else {
- InVals.push_back(
- DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
+ SDValue L = DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(MF, FI, 0));
+ InVals.push_back(L);
}
}
}
- if (!MemOps.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
- if (isVarArg) {
+ if (IsVarArg) {
// This will point to the next argument passed via stack.
- int FrameIndex = MFI.CreateFixedObject(Hexagon_PointerSize,
- HEXAGON_LRFP_SIZE +
- CCInfo.getNextStackOffset(),
- true);
- FuncInfo.setVarArgsFrameIndex(FrameIndex);
+ int Offset = HEXAGON_LRFP_SIZE + CCInfo.getNextStackOffset();
+ int FI = MFI.CreateFixedObject(Hexagon_PointerSize, Offset, true);
+ HMFI.setVarArgsFrameIndex(FI);
}
return Chain;
@@ -1262,66 +804,62 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(SV));
}
-static bool isSExtFree(SDValue N) {
- // A sign-extend of a truncate of a sign-extend is free.
- if (N.getOpcode() == ISD::TRUNCATE &&
- N.getOperand(0).getOpcode() == ISD::AssertSext)
- return true;
- // We have sign-extended loads.
- if (N.getOpcode() == ISD::LOAD)
- return true;
- return false;
-}
-
SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
-
+ const SDLoc &dl(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(ty(LHS)))
- return LowerHvxSetCC(Op, DAG);
-
- SDValue Cmp = Op.getOperand(2);
- ISD::CondCode CC = cast<CondCodeSDNode>(Cmp)->get();
-
- EVT VT = Op.getValueType();
- EVT LHSVT = LHS.getValueType();
- EVT RHSVT = RHS.getValueType();
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ MVT ResTy = ty(Op);
+ MVT OpTy = ty(LHS);
- if (LHSVT == MVT::v2i16) {
- assert(ISD::isSignedIntSetCC(CC) || ISD::isUnsignedIntSetCC(CC));
- unsigned ExtOpc = ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND
- : ISD::ZERO_EXTEND;
- SDValue LX = DAG.getNode(ExtOpc, dl, MVT::v2i32, LHS);
- SDValue RX = DAG.getNode(ExtOpc, dl, MVT::v2i32, RHS);
- SDValue SC = DAG.getNode(ISD::SETCC, dl, MVT::v2i1, LX, RX, Cmp);
- return SC;
+ if (OpTy == MVT::v2i16 || OpTy == MVT::v4i8) {
+ MVT ElemTy = OpTy.getVectorElementType();
+ assert(ElemTy.isScalarInteger());
+ MVT WideTy = MVT::getVectorVT(MVT::getIntegerVT(2*ElemTy.getSizeInBits()),
+ OpTy.getVectorNumElements());
+ return DAG.getSetCC(dl, ResTy,
+ DAG.getSExtOrTrunc(LHS, SDLoc(LHS), WideTy),
+ DAG.getSExtOrTrunc(RHS, SDLoc(RHS), WideTy), CC);
}
// Treat all other vector types as legal.
- if (VT.isVector())
+ if (ResTy.isVector())
return Op;
- // Equals and not equals should use sign-extend, not zero-extend, since
- // we can represent small negative values in the compare instructions.
+ // Comparisons of short integers should use sign-extend, not zero-extend,
+ // since we can represent small negative values in the compare instructions.
// The LLVM default is to use zero-extend arbitrarily in these cases.
- if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
- (RHSVT == MVT::i8 || RHSVT == MVT::i16) &&
- (LHSVT == MVT::i8 || LHSVT == MVT::i16)) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
- if (C && C->getAPIntValue().isNegative()) {
- LHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, LHS);
- RHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, RHS);
- return DAG.getNode(ISD::SETCC, dl, Op.getValueType(),
- LHS, RHS, Op.getOperand(2));
- }
- if (isSExtFree(LHS) || isSExtFree(RHS)) {
- LHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, LHS);
- RHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, RHS);
- return DAG.getNode(ISD::SETCC, dl, Op.getValueType(),
- LHS, RHS, Op.getOperand(2));
+ auto isSExtFree = [this](SDValue N) {
+ switch (N.getOpcode()) {
+ case ISD::TRUNCATE: {
+ // A sign-extend of a truncate of a sign-extend is free.
+ SDValue Op = N.getOperand(0);
+ if (Op.getOpcode() != ISD::AssertSext)
+ return false;
+ EVT OrigTy = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ unsigned ThisBW = ty(N).getSizeInBits();
+ unsigned OrigBW = OrigTy.getSizeInBits();
+ // The type that was sign-extended to get the AssertSext must be
+ // narrower than the type of N (so that N has still the same value
+ // as the original).
+ return ThisBW >= OrigBW;
+ }
+ case ISD::LOAD:
+ // We have sign-extended loads.
+ return true;
}
+ return false;
+ };
+
+ if (OpTy == MVT::i8 || OpTy == MVT::i16) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
+ bool IsNegative = C && C->getAPIntValue().isNegative();
+ if (IsNegative || isSExtFree(LHS) || isSExtFree(RHS))
+ return DAG.getSetCC(dl, ResTy,
+ DAG.getSExtOrTrunc(LHS, SDLoc(LHS), MVT::i32),
+ DAG.getSExtOrTrunc(RHS, SDLoc(RHS), MVT::i32), CC);
}
+
return SDValue();
}
@@ -1393,8 +931,7 @@ HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
else if (isVTi1Type)
T = DAG.getTargetConstantPool(CVal, ValTy, Align, Offset, TF);
else
- T = DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Align, Offset,
- TF);
+ T = DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Align, Offset, TF);
assert(cast<ConstantPoolSDNode>(T)->getTargetFlags() == TF &&
"Inconsistent target flag encountered");
@@ -1480,7 +1017,7 @@ HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const {
if (RM == Reloc::Static) {
SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
const GlobalObject *GO = GV->getBaseObject();
- if (GO && HLOF.isGlobalInSmallSection(GO, HTM))
+ if (GO && Subtarget.useSmallData() && HLOF.isGlobalInSmallSection(GO, HTM))
return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, GA);
return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, GA);
}
@@ -1688,13 +1225,15 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
const HexagonSubtarget &ST)
: TargetLowering(TM), HTM(static_cast<const HexagonTargetMachine&>(TM)),
Subtarget(ST) {
- bool IsV4 = !Subtarget.hasV5TOps();
+ bool IsV4 = !Subtarget.hasV5Ops();
auto &HRI = *Subtarget.getRegisterInfo();
setPrefLoopAlignment(4);
setPrefFunctionAlignment(4);
setMinFunctionAlignment(2);
setStackPointerRegisterToSaveRestore(HRI.getStackRegister());
+ setBooleanContents(TargetLoweringBase::UndefinedBooleanContent);
+ setBooleanVectorContents(TargetLoweringBase::UndefinedBooleanContent);
setMaxAtomicSizeInBitsSupported(64);
setMinCmpXchgSizeInBits(32);
@@ -1728,45 +1267,11 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v4i16, &Hexagon::DoubleRegsRegClass);
addRegisterClass(MVT::v2i32, &Hexagon::DoubleRegsRegClass);
- if (Subtarget.hasV5TOps()) {
+ if (Subtarget.hasV5Ops()) {
addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
}
- if (Subtarget.hasV60TOps()) {
- if (Subtarget.useHVX64BOps()) {
- addRegisterClass(MVT::v64i8, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v32i16, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v16i32, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v128i8, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v64i16, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v32i32, &Hexagon::HvxWRRegClass);
- // These "short" boolean vector types should be legal because
- // they will appear as results of vector compares. If they were
- // not legal, type legalization would try to make them legal
- // and that would require using operations that do not use or
- // produce such types. That, in turn, would imply using custom
- // nodes, which would be unoptimizable by the DAG combiner.
- // The idea is to rely on target-independent operations as much
- // as possible.
- addRegisterClass(MVT::v16i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v512i1, &Hexagon::HvxQRRegClass);
- } else if (Subtarget.useHVX128BOps()) {
- addRegisterClass(MVT::v128i8, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v64i16, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v32i32, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v256i8, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v128i16, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v64i32, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v128i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v1024i1, &Hexagon::HvxQRRegClass);
- }
- }
-
//
// Handling of scalar operations.
//
@@ -1801,13 +1306,16 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
// Hexagon needs to optimize cases with negative constants.
- setOperationAction(ISD::SETCC, MVT::i8, Custom);
- setOperationAction(ISD::SETCC, MVT::i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v2i16, Custom);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
@@ -1819,35 +1327,21 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setMinimumJumpTableEntries(std::numeric_limits<int>::max());
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- // Hexagon has instructions for add/sub with carry. The problem with
- // modeling these instructions is that they produce 2 results: Rdd and Px.
- // To model the update of Px, we will have to use Defs[p0..p3] which will
- // cause any predicate live range to spill. So, we pretend we dont't have
- // these instructions.
- setOperationAction(ISD::ADDE, MVT::i8, Expand);
- setOperationAction(ISD::ADDE, MVT::i16, Expand);
- setOperationAction(ISD::ADDE, MVT::i32, Expand);
- setOperationAction(ISD::ADDE, MVT::i64, Expand);
- setOperationAction(ISD::SUBE, MVT::i8, Expand);
- setOperationAction(ISD::SUBE, MVT::i16, Expand);
- setOperationAction(ISD::SUBE, MVT::i32, Expand);
- setOperationAction(ISD::SUBE, MVT::i64, Expand);
- setOperationAction(ISD::ADDC, MVT::i8, Expand);
- setOperationAction(ISD::ADDC, MVT::i16, Expand);
- setOperationAction(ISD::ADDC, MVT::i32, Expand);
- setOperationAction(ISD::ADDC, MVT::i64, Expand);
- setOperationAction(ISD::SUBC, MVT::i8, Expand);
- setOperationAction(ISD::SUBC, MVT::i16, Expand);
- setOperationAction(ISD::SUBC, MVT::i32, Expand);
- setOperationAction(ISD::SUBC, MVT::i64, Expand);
-
- // Only add and sub that detect overflow are the saturating ones.
+ setOperationAction(ISD::ABS, MVT::i32, Legal);
+ setOperationAction(ISD::ABS, MVT::i64, Legal);
+
+ // Hexagon has A4_addp_c and A4_subp_c that take and generate a carry bit,
+ // but they only operate on i64.
for (MVT VT : MVT::integer_valuetypes()) {
- setOperationAction(ISD::UADDO, VT, Expand);
- setOperationAction(ISD::SADDO, VT, Expand);
- setOperationAction(ISD::USUBO, VT, Expand);
- setOperationAction(ISD::SSUBO, VT, Expand);
+ setOperationAction(ISD::UADDO, VT, Expand);
+ setOperationAction(ISD::USUBO, VT, Expand);
+ setOperationAction(ISD::SADDO, VT, Expand);
+ setOperationAction(ISD::SSUBO, VT, Expand);
+ setOperationAction(ISD::ADDCARRY, VT, Expand);
+ setOperationAction(ISD::SUBCARRY, VT, Expand);
}
+ setOperationAction(ISD::ADDCARRY, MVT::i64, Custom);
+ setOperationAction(ISD::SUBCARRY, MVT::i64, Custom);
setOperationAction(ISD::CTLZ, MVT::i8, Promote);
setOperationAction(ISD::CTLZ, MVT::i16, Promote);
@@ -1865,22 +1359,21 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
setOperationAction(ISD::BSWAP, MVT::i32, Legal);
setOperationAction(ISD::BSWAP, MVT::i64, Legal);
- setOperationAction(ISD::MUL, MVT::i64, Legal);
for (unsigned IntExpOp :
- { ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM,
- ISD::SDIVREM, ISD::UDIVREM, ISD::ROTL, ISD::ROTR,
- ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
- ISD::SMUL_LOHI, ISD::UMUL_LOHI }) {
- setOperationAction(IntExpOp, MVT::i32, Expand);
- setOperationAction(IntExpOp, MVT::i64, Expand);
+ {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM,
+ ISD::SDIVREM, ISD::UDIVREM, ISD::ROTL, ISD::ROTR,
+ ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
+ ISD::SMUL_LOHI, ISD::UMUL_LOHI}) {
+ for (MVT VT : MVT::integer_valuetypes())
+ setOperationAction(IntExpOp, VT, Expand);
}
for (unsigned FPExpOp :
{ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FSINCOS,
ISD::FPOW, ISD::FCOPYSIGN}) {
- setOperationAction(FPExpOp, MVT::f32, Expand);
- setOperationAction(FPExpOp, MVT::f64, Expand);
+ for (MVT VT : MVT::fp_valuetypes())
+ setOperationAction(FPExpOp, VT, Expand);
}
// No extending loads from i32.
@@ -1920,10 +1413,9 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
// either "custom" or "legal" for specific cases.
static const unsigned VectExpOps[] = {
// Integer arithmetic:
- ISD::ADD, ISD::SUB, ISD::MUL, ISD::SDIV, ISD::UDIV,
- ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM, ISD::ADDC,
- ISD::SUBC, ISD::SADDO, ISD::UADDO, ISD::SSUBO, ISD::USUBO,
- ISD::SMUL_LOHI, ISD::UMUL_LOHI,
+ ISD::ADD, ISD::SUB, ISD::MUL, ISD::SDIV, ISD::UDIV,
+ ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM, ISD::SADDO,
+ ISD::UADDO, ISD::SSUBO, ISD::USUBO, ISD::SMUL_LOHI, ISD::UMUL_LOHI,
// Logical/bit:
ISD::AND, ISD::OR, ISD::XOR, ISD::ROTL, ISD::ROTR,
ISD::CTPOP, ISD::CTLZ, ISD::CTTZ,
@@ -1970,16 +1462,16 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
// Extending loads from (native) vectors of i8 into (native) vectors of i16
// are legal.
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, MVT::v2i8, Legal);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, MVT::v2i8, Legal);
setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, MVT::v2i8, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
// Types natively supported:
- for (MVT NativeVT : {MVT::v32i1, MVT::v64i1, MVT::v4i8, MVT::v8i8, MVT::v2i16,
- MVT::v4i16, MVT::v1i32, MVT::v2i32, MVT::v1i64}) {
+ for (MVT NativeVT : {MVT::v8i1, MVT::v4i1, MVT::v2i1, MVT::v4i8,
+ MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v2i32}) {
setOperationAction(ISD::BUILD_VECTOR, NativeVT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, NativeVT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, NativeVT, Custom);
@@ -1995,19 +1487,34 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::XOR, NativeVT, Legal);
}
+ // Custom lower unaligned loads.
+ for (MVT VecVT : {MVT::i32, MVT::v4i8, MVT::i64, MVT::v8i8,
+ MVT::v2i16, MVT::v4i16, MVT::v2i32}) {
+ setOperationAction(ISD::LOAD, VecVT, Custom);
+ }
+
+ for (MVT VT : {MVT::v2i16, MVT::v4i8, MVT::v2i32, MVT::v4i16, MVT::v2i32}) {
+ setCondCodeAction(ISD::SETLT, VT, Expand);
+ setCondCodeAction(ISD::SETLE, VT, Expand);
+ setCondCodeAction(ISD::SETULT, VT, Expand);
+ setCondCodeAction(ISD::SETULE, VT, Expand);
+ }
+
+ // Custom-lower bitcasts from i8 to v8i1.
+ setOperationAction(ISD::BITCAST, MVT::i8, Custom);
setOperationAction(ISD::SETCC, MVT::v2i16, Custom);
setOperationAction(ISD::VSELECT, MVT::v2i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
- auto setPromoteTo = [this] (unsigned Opc, MVT FromTy, MVT ToTy) {
- setOperationAction(Opc, FromTy, Promote);
- AddPromotedToType(Opc, FromTy, ToTy);
- };
-
// Subtarget-specific operation actions.
//
- if (Subtarget.hasV5TOps()) {
+ if (Subtarget.hasV60Ops()) {
+ setOperationAction(ISD::ROTL, MVT::i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::i64, Custom);
+ }
+ if (Subtarget.hasV5Ops()) {
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FADD, MVT::f64, Expand);
setOperationAction(ISD::FSUB, MVT::f64, Expand);
@@ -2061,71 +1568,14 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
// Handling of indexed loads/stores: default is "expand".
//
- for (MVT VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
+ for (MVT VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f32, MVT::f64,
+ MVT::v2i16, MVT::v2i32, MVT::v4i8, MVT::v4i16, MVT::v8i8}) {
setIndexedLoadAction(ISD::POST_INC, VT, Legal);
setIndexedStoreAction(ISD::POST_INC, VT, Legal);
}
- if (Subtarget.useHVXOps()) {
- bool Use64b = Subtarget.useHVX64BOps();
- ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128;
- ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128;
- MVT ByteV = Use64b ? MVT::v64i8 : MVT::v128i8;
- MVT ByteW = Use64b ? MVT::v128i8 : MVT::v256i8;
-
- setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal);
- setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, ByteW, Legal);
- setOperationAction(ISD::AND, ByteV, Legal);
- setOperationAction(ISD::OR, ByteV, Legal);
- setOperationAction(ISD::XOR, ByteV, Legal);
-
- for (MVT T : LegalV) {
- setIndexedLoadAction(ISD::POST_INC, T, Legal);
- setIndexedStoreAction(ISD::POST_INC, T, Legal);
-
- setOperationAction(ISD::ADD, T, Legal);
- setOperationAction(ISD::SUB, T, Legal);
- if (T != ByteV) {
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal);
- setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal);
- }
-
- setOperationAction(ISD::MUL, T, Custom);
- setOperationAction(ISD::SETCC, T, Custom);
- setOperationAction(ISD::BUILD_VECTOR, T, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, T, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, T, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
- if (T != ByteV)
- setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom);
- }
-
- for (MVT T : LegalV) {
- if (T == ByteV)
- continue;
- // Promote all shuffles and concats to operate on vectors of bytes.
- setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV);
- setPromoteTo(ISD::CONCAT_VECTORS, T, ByteV);
- setPromoteTo(ISD::AND, T, ByteV);
- setPromoteTo(ISD::OR, T, ByteV);
- setPromoteTo(ISD::XOR, T, ByteV);
- }
-
- for (MVT T : LegalW) {
- // Custom-lower BUILD_VECTOR for vector pairs. The standard (target-
- // independent) handling of it would convert it to a load, which is
- // not always the optimal choice.
- setOperationAction(ISD::BUILD_VECTOR, T, Custom);
-
- if (T == ByteW)
- continue;
- // Promote all shuffles and concats to operate on vectors of bytes.
- setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW);
- setPromoteTo(ISD::CONCAT_VECTORS, T, ByteW);
- }
- }
+ if (Subtarget.useHVXOps())
+ initializeHVXLowering();
computeRegisterProperties(&HRI);
@@ -2195,7 +1645,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3");
}
- if (Subtarget.hasV5TOps()) {
+ if (Subtarget.hasV5Ops()) {
if (FastMath)
setLibcallName(RTLIB::SQRT_F32, "__hexagon_fast2_sqrtf");
else
@@ -2242,6 +1692,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((HexagonISD::NodeType)Opcode) {
+ case HexagonISD::ADDC: return "HexagonISD::ADDC";
+ case HexagonISD::SUBC: return "HexagonISD::SUBC";
case HexagonISD::ALLOCA: return "HexagonISD::ALLOCA";
case HexagonISD::AT_GOT: return "HexagonISD::AT_GOT";
case HexagonISD::AT_PCREL: return "HexagonISD::AT_PCREL";
@@ -2255,16 +1707,12 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::CP: return "HexagonISD::CP";
case HexagonISD::DCFETCH: return "HexagonISD::DCFETCH";
case HexagonISD::EH_RETURN: return "HexagonISD::EH_RETURN";
+ case HexagonISD::TSTBIT: return "HexagonISD::TSTBIT";
case HexagonISD::EXTRACTU: return "HexagonISD::EXTRACTU";
- case HexagonISD::EXTRACTURP: return "HexagonISD::EXTRACTURP";
case HexagonISD::INSERT: return "HexagonISD::INSERT";
- case HexagonISD::INSERTRP: return "HexagonISD::INSERTRP";
case HexagonISD::JT: return "HexagonISD::JT";
case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG";
case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN";
- case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE";
- case HexagonISD::VPACKE: return "HexagonISD::VPACKE";
- case HexagonISD::VPACKO: return "HexagonISD::VPACKO";
case HexagonISD::VASL: return "HexagonISD::VASL";
case HexagonISD::VASR: return "HexagonISD::VASR";
case HexagonISD::VLSR: return "HexagonISD::VLSR";
@@ -2274,11 +1722,97 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::VROR: return "HexagonISD::VROR";
case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE";
case HexagonISD::VZERO: return "HexagonISD::VZERO";
+ case HexagonISD::VSPLATW: return "HexagonISD::VSPLATW";
+ case HexagonISD::D2P: return "HexagonISD::D2P";
+ case HexagonISD::P2D: return "HexagonISD::P2D";
+ case HexagonISD::V2Q: return "HexagonISD::V2Q";
+ case HexagonISD::Q2V: return "HexagonISD::Q2V";
+ case HexagonISD::QCAT: return "HexagonISD::QCAT";
+ case HexagonISD::QTRUE: return "HexagonISD::QTRUE";
+ case HexagonISD::QFALSE: return "HexagonISD::QFALSE";
+ case HexagonISD::TYPECAST: return "HexagonISD::TYPECAST";
+ case HexagonISD::VALIGN: return "HexagonISD::VALIGN";
+ case HexagonISD::VALIGNADDR: return "HexagonISD::VALIGNADDR";
case HexagonISD::OP_END: break;
}
return nullptr;
}
+// Bit-reverse Load Intrinsic: Check if the instruction is a bit reverse load
+// intrinsic.
+static bool isBrevLdIntrinsic(const Value *Inst) {
+ unsigned ID = cast<IntrinsicInst>(Inst)->getIntrinsicID();
+ return (ID == Intrinsic::hexagon_L2_loadrd_pbr ||
+ ID == Intrinsic::hexagon_L2_loadri_pbr ||
+ ID == Intrinsic::hexagon_L2_loadrh_pbr ||
+ ID == Intrinsic::hexagon_L2_loadruh_pbr ||
+ ID == Intrinsic::hexagon_L2_loadrb_pbr ||
+ ID == Intrinsic::hexagon_L2_loadrub_pbr);
+}
+
+// Bit-reverse Load Intrinsic :Crawl up and figure out the object from previous
+// instruction. So far we only handle bitcast, extract value and bit reverse
+// load intrinsic instructions. Should we handle CGEP ?
+static Value *getBrevLdObject(Value *V) {
+ if (Operator::getOpcode(V) == Instruction::ExtractValue ||
+ Operator::getOpcode(V) == Instruction::BitCast)
+ V = cast<Operator>(V)->getOperand(0);
+ else if (isa<IntrinsicInst>(V) && isBrevLdIntrinsic(V))
+ V = cast<Instruction>(V)->getOperand(0);
+ return V;
+}
+
+// Bit-reverse Load Intrinsic: For a PHI Node return either an incoming edge or
+// a back edge. If the back edge comes from the intrinsic itself, the incoming
+// edge is returned.
+static Value *returnEdge(const PHINode *PN, Value *IntrBaseVal) {
+ const BasicBlock *Parent = PN->getParent();
+ int Idx = -1;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) {
+ BasicBlock *Blk = PN->getIncomingBlock(i);
+ // Determine if the back edge is originated from intrinsic.
+ if (Blk == Parent) {
+ Value *BackEdgeVal = PN->getIncomingValue(i);
+ Value *BaseVal;
+ // Loop over till we return the same Value or we hit the IntrBaseVal.
+ do {
+ BaseVal = BackEdgeVal;
+ BackEdgeVal = getBrevLdObject(BackEdgeVal);
+ } while ((BaseVal != BackEdgeVal) && (IntrBaseVal != BackEdgeVal));
+ // If the getBrevLdObject returns IntrBaseVal, we should return the
+ // incoming edge.
+ if (IntrBaseVal == BackEdgeVal)
+ continue;
+ Idx = i;
+ break;
+ } else // Set the node to incoming edge.
+ Idx = i;
+ }
+ assert(Idx >= 0 && "Unexpected index to incoming argument in PHI");
+ return PN->getIncomingValue(Idx);
+}
+
+// Bit-reverse Load Intrinsic: Figure out the underlying object the base
+// pointer points to, for the bit-reverse load intrinsic. Setting this to
+// memoperand might help alias analysis to figure out the dependencies.
+static Value *getUnderLyingObjectForBrevLdIntr(Value *V) {
+ Value *IntrBaseVal = V;
+ Value *BaseVal;
+ // Loop over till we return the same Value, implies we either figure out
+ // the object or we hit a PHI
+ do {
+ BaseVal = V;
+ V = getBrevLdObject(V);
+ } while (BaseVal != V);
+
+ // Identify the object from PHINode.
+ if (const PHINode *PN = dyn_cast<PHINode>(V))
+ return returnEdge(PN, IntrBaseVal);
+ // For non PHI nodes, the object is the last value returned by getBrevLdObject
+ else
+ return V;
+}
+
/// Given an intrinsic, checks if on the target the intrinsic will need to map
/// to a MemIntrinsicNode (touches memory). If this is the case, it returns
/// true and store the intrinsic information into the IntrinsicInfo that was
@@ -2288,6 +1822,32 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
+ case Intrinsic::hexagon_L2_loadrd_pbr:
+ case Intrinsic::hexagon_L2_loadri_pbr:
+ case Intrinsic::hexagon_L2_loadrh_pbr:
+ case Intrinsic::hexagon_L2_loadruh_pbr:
+ case Intrinsic::hexagon_L2_loadrb_pbr:
+ case Intrinsic::hexagon_L2_loadrub_pbr: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ auto &Cont = I.getCalledFunction()->getParent()->getContext();
+ // The intrinsic function call is of the form { ElTy, i8* }
+ // @llvm.hexagon.L2.loadXX.pbr(i8*, i32). The pointer and memory access type
+ // should be derived from ElTy.
+ PointerType *PtrTy = I.getCalledFunction()
+ ->getReturnType()
+ ->getContainedType(0)
+ ->getPointerTo();
+ Info.memVT = MVT::getVT(PtrTy->getElementType());
+ llvm::Value *BasePtrVal = I.getOperand(0);
+ Info.ptrVal = getUnderLyingObjectForBrevLdIntr(BasePtrVal);
+ // The offset value comes through Modifier register. For now, assume the
+ // offset is 0.
+ Info.offset = 0;
+ Info.align = DL.getABITypeAlignment(Info.memVT.getTypeForEVT(Cont));
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ }
case Intrinsic::hexagon_V6_vgathermw:
case Intrinsic::hexagon_V6_vgathermw_128B:
case Intrinsic::hexagon_V6_vgathermh:
@@ -2319,17 +1879,13 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
bool HexagonTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
- EVT MTy1 = EVT::getEVT(Ty1);
- EVT MTy2 = EVT::getEVT(Ty2);
- if (!MTy1.isSimple() || !MTy2.isSimple())
- return false;
- return (MTy1.getSimpleVT() == MVT::i64) && (MTy2.getSimpleVT() == MVT::i32);
+ return isTruncateFree(EVT::getEVT(Ty1), EVT::getEVT(Ty2));
}
bool HexagonTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isSimple() || !VT2.isSimple())
return false;
- return (VT1.getSimpleVT() == MVT::i64) && (VT2.getSimpleVT() == MVT::i32);
+ return VT1.getSimpleVT() == MVT::i64 && VT2.getSimpleVT() == MVT::i32;
}
bool HexagonTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
@@ -2372,126 +1928,199 @@ HexagonTargetLowering::getPreferredVectorAction(EVT VT) const {
return TargetLoweringBase::TypeSplitVector;
}
+std::pair<SDValue, int>
+HexagonTargetLowering::getBaseAndOffset(SDValue Addr) const {
+ if (Addr.getOpcode() == ISD::ADD) {
+ SDValue Op1 = Addr.getOperand(1);
+ if (auto *CN = dyn_cast<const ConstantSDNode>(Op1.getNode()))
+ return { Addr.getOperand(0), CN->getSExtValue() };
+ }
+ return { Addr, 0 };
+}
+
// Lower a vector shuffle (V1, V2, V3). V1 and V2 are the two vectors
// to select data from, V3 is the permutation.
SDValue
HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
const {
- const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- SDLoc dl(Op);
- EVT VT = Op.getValueType();
+ const auto *SVN = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> AM = SVN->getMask();
+ assert(AM.size() <= 8 && "Unexpected shuffle mask");
+ unsigned VecLen = AM.size();
- if (V2.isUndef())
- V2 = V1;
-
- if (SVN->isSplat()) {
- int Lane = SVN->getSplatIndex();
- if (Lane == -1) Lane = 0;
-
- // Test if V1 is a SCALAR_TO_VECTOR.
- if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
- return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0));
-
- // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
- // (and probably will turn into a SCALAR_TO_VECTOR once legalization
- // reaches it).
- if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
- !isa<ConstantSDNode>(V1.getOperand(0))) {
- bool IsScalarToVector = true;
- for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) {
- if (!V1.getOperand(i).isUndef()) {
- IsScalarToVector = false;
- break;
- }
- }
- if (IsScalarToVector)
- return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0));
+ MVT VecTy = ty(Op);
+ assert(!Subtarget.isHVXVectorType(VecTy, true) &&
+ "HVX shuffles should be legal");
+ assert(VecTy.getSizeInBits() <= 64 && "Unexpected vector length");
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ const SDLoc &dl(Op);
+
+ // If the inputs are not the same as the output, bail. This is not an
+ // error situation, but complicates the handling and the default expansion
+ // (into BUILD_VECTOR) should be adequate.
+ if (ty(Op0) != VecTy || ty(Op1) != VecTy)
+ return SDValue();
+
+ // Normalize the mask so that the first non-negative index comes from
+ // the first operand.
+ SmallVector<int,8> Mask(AM.begin(), AM.end());
+ unsigned F = llvm::find_if(AM, [](int M) { return M >= 0; }) - AM.data();
+ if (F == AM.size())
+ return DAG.getUNDEF(VecTy);
+ if (AM[F] >= int(VecLen)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(Op0, Op1);
+ }
+
+ // Express the shuffle mask in terms of bytes.
+ SmallVector<int,8> ByteMask;
+ unsigned ElemBytes = VecTy.getVectorElementType().getSizeInBits() / 8;
+ for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+ int M = Mask[i];
+ if (M < 0) {
+ for (unsigned j = 0; j != ElemBytes; ++j)
+ ByteMask.push_back(-1);
+ } else {
+ for (unsigned j = 0; j != ElemBytes; ++j)
+ ByteMask.push_back(M*ElemBytes + j);
}
- return DAG.getNode(HexagonISD::VSPLAT, dl, VT,
- DAG.getConstant(Lane, dl, MVT::i32));
}
+ assert(ByteMask.size() <= 8);
+
+ // All non-undef (non-negative) indexes are well within [0..127], so they
+ // fit in a single byte. Build two 64-bit words:
+ // - MaskIdx where each byte is the corresponding index (for non-negative
+ // indexes), and 0xFF for negative indexes, and
+ // - MaskUnd that has 0xFF for each negative index.
+ uint64_t MaskIdx = 0;
+ uint64_t MaskUnd = 0;
+ for (unsigned i = 0, e = ByteMask.size(); i != e; ++i) {
+ unsigned S = 8*i;
+ uint64_t M = ByteMask[i] & 0xFF;
+ if (M == 0xFF)
+ MaskUnd |= M << S;
+ MaskIdx |= M << S;
+ }
+
+ if (ByteMask.size() == 4) {
+ // Identity.
+ if (MaskIdx == (0x03020100 | MaskUnd))
+ return Op0;
+ // Byte swap.
+ if (MaskIdx == (0x00010203 | MaskUnd)) {
+ SDValue T0 = DAG.getBitcast(MVT::i32, Op0);
+ SDValue T1 = DAG.getNode(ISD::BSWAP, dl, MVT::i32, T0);
+ return DAG.getBitcast(VecTy, T1);
+ }
- // FIXME: We need to support more general vector shuffles. See
- // below the comment from the ARM backend that deals in the general
- // case with the vector shuffles. For now, let expand handle these.
- return SDValue();
+ // Byte packs.
+ SDValue Concat10 = DAG.getNode(HexagonISD::COMBINE, dl,
+ typeJoin({ty(Op1), ty(Op0)}), {Op1, Op0});
+ if (MaskIdx == (0x06040200 | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunehb, dl, VecTy, {Concat10}, DAG);
+ if (MaskIdx == (0x07050301 | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunohb, dl, VecTy, {Concat10}, DAG);
+
+ SDValue Concat01 = DAG.getNode(HexagonISD::COMBINE, dl,
+ typeJoin({ty(Op0), ty(Op1)}), {Op0, Op1});
+ if (MaskIdx == (0x02000604 | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunehb, dl, VecTy, {Concat01}, DAG);
+ if (MaskIdx == (0x03010705 | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunohb, dl, VecTy, {Concat01}, DAG);
+ }
+
+ if (ByteMask.size() == 8) {
+ // Identity.
+ if (MaskIdx == (0x0706050403020100ull | MaskUnd))
+ return Op0;
+ // Byte swap.
+ if (MaskIdx == (0x0001020304050607ull | MaskUnd)) {
+ SDValue T0 = DAG.getBitcast(MVT::i64, Op0);
+ SDValue T1 = DAG.getNode(ISD::BSWAP, dl, MVT::i64, T0);
+ return DAG.getBitcast(VecTy, T1);
+ }
- // If the shuffle is not directly supported and it has 4 elements, use
- // the PerfectShuffle-generated table to synthesize it from other shuffles.
-}
+ // Halfword picks.
+ if (MaskIdx == (0x0d0c050409080100ull | MaskUnd))
+ return getInstr(Hexagon::S2_shuffeh, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0f0e07060b0a0302ull | MaskUnd))
+ return getInstr(Hexagon::S2_shuffoh, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0d0c090805040100ull | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunewh, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0f0e0b0a07060302ull | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunowh, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0706030205040100ull | MaskUnd)) {
+ VectorPair P = opSplit(Op0, dl, DAG);
+ return getInstr(Hexagon::S2_packhl, dl, VecTy, {P.second, P.first}, DAG);
+ }
-// If BUILD_VECTOR has same base element repeated several times,
-// report true.
-static bool isCommonSplatElement(BuildVectorSDNode *BVN) {
- unsigned NElts = BVN->getNumOperands();
- SDValue V0 = BVN->getOperand(0);
+ // Byte packs.
+ if (MaskIdx == (0x0e060c040a020800ull | MaskUnd))
+ return getInstr(Hexagon::S2_shuffeb, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0f070d050b030901ull | MaskUnd))
+ return getInstr(Hexagon::S2_shuffob, dl, VecTy, {Op1, Op0}, DAG);
+ }
- for (unsigned i = 1, e = NElts; i != e; ++i) {
- if (BVN->getOperand(i) != V0)
- return false;
+ return SDValue();
+}
+
+// Create a Hexagon-specific node for shifting a vector by an integer.
+SDValue
+HexagonTargetLowering::getVectorShiftByInt(SDValue Op, SelectionDAG &DAG)
+ const {
+ if (auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode())) {
+ if (SDValue S = BVN->getSplatValue()) {
+ unsigned NewOpc;
+ switch (Op.getOpcode()) {
+ case ISD::SHL:
+ NewOpc = HexagonISD::VASL;
+ break;
+ case ISD::SRA:
+ NewOpc = HexagonISD::VASR;
+ break;
+ case ISD::SRL:
+ NewOpc = HexagonISD::VLSR;
+ break;
+ default:
+ llvm_unreachable("Unexpected shift opcode");
+ }
+ return DAG.getNode(NewOpc, SDLoc(Op), ty(Op), Op.getOperand(0), S);
+ }
}
- return true;
+
+ return SDValue();
}
-// Lower a vector shift. Try to convert
-// <VT> = SHL/SRA/SRL <VT> by <VT> to Hexagon specific
-// <VT> = SHL/SRA/SRL <VT> by <IT/i32>.
SDValue
HexagonTargetLowering::LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const {
- BuildVectorSDNode *BVN = nullptr;
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- SDValue V3;
- SDLoc dl(Op);
- EVT VT = Op.getValueType();
+ return getVectorShiftByInt(Op, DAG);
+}
- if ((BVN = dyn_cast<BuildVectorSDNode>(V1.getNode())) &&
- isCommonSplatElement(BVN))
- V3 = V2;
- else if ((BVN = dyn_cast<BuildVectorSDNode>(V2.getNode())) &&
- isCommonSplatElement(BVN))
- V3 = V1;
- else
- return SDValue();
+SDValue
+HexagonTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
+ if (isa<ConstantSDNode>(Op.getOperand(1).getNode()))
+ return Op;
+ return SDValue();
+}
- SDValue CommonSplat = BVN->getOperand(0);
- SDValue Result;
+SDValue
+HexagonTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
+ MVT ResTy = ty(Op);
+ SDValue InpV = Op.getOperand(0);
+ MVT InpTy = ty(InpV);
+ assert(ResTy.getSizeInBits() == InpTy.getSizeInBits());
+ const SDLoc &dl(Op);
- if (VT.getSimpleVT() == MVT::v4i16) {
- switch (Op.getOpcode()) {
- case ISD::SRA:
- Result = DAG.getNode(HexagonISD::VASR, dl, VT, V3, CommonSplat);
- break;
- case ISD::SHL:
- Result = DAG.getNode(HexagonISD::VASL, dl, VT, V3, CommonSplat);
- break;
- case ISD::SRL:
- Result = DAG.getNode(HexagonISD::VLSR, dl, VT, V3, CommonSplat);
- break;
- default:
- return SDValue();
- }
- } else if (VT.getSimpleVT() == MVT::v2i32) {
- switch (Op.getOpcode()) {
- case ISD::SRA:
- Result = DAG.getNode(HexagonISD::VASR, dl, VT, V3, CommonSplat);
- break;
- case ISD::SHL:
- Result = DAG.getNode(HexagonISD::VASL, dl, VT, V3, CommonSplat);
- break;
- case ISD::SRL:
- Result = DAG.getNode(HexagonISD::VLSR, dl, VT, V3, CommonSplat);
- break;
- default:
- return SDValue();
- }
- } else {
- return SDValue();
+ // Handle conversion from i8 to v8i1.
+ if (ResTy == MVT::v8i1) {
+ SDValue Sc = DAG.getBitcast(tyScalar(InpTy), InpV);
+ SDValue Ext = DAG.getZExtOrTrunc(Sc, dl, MVT::i32);
+ return getInstr(Hexagon::C2_tfrrp, dl, ResTy, Ext, DAG);
}
- return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ return SDValue();
}
bool
@@ -2509,9 +2138,10 @@ HexagonTargetLowering::getBuildVectorConstInts(ArrayRef<SDValue> Values,
Consts[i] = ConstantInt::get(IntTy, 0);
continue;
}
+ // Make sure to always cast to IntTy.
if (auto *CN = dyn_cast<ConstantSDNode>(V.getNode())) {
const ConstantInt *CI = CN->getConstantIntValue();
- Consts[i] = const_cast<ConstantInt*>(CI);
+ Consts[i] = ConstantInt::get(IntTy, CI->getValue().getSExtValue());
} else if (auto *CN = dyn_cast<ConstantFPSDNode>(V.getNode())) {
const ConstantFP *CF = CN->getConstantFPValue();
APInt A = CF->getValueAPF().bitcastToAPInt();
@@ -2550,8 +2180,8 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
Consts[1]->getZExtValue() << 16;
return DAG.getBitcast(MVT::v2i16, DAG.getConstant(V, dl, MVT::i32));
}
- SDValue N = getNode(Hexagon::A2_combine_ll, dl, MVT::i32,
- {Elem[1], Elem[0]}, DAG);
+ SDValue N = getInstr(Hexagon::A2_combine_ll, dl, MVT::i32,
+ {Elem[1], Elem[0]}, DAG);
return DAG.getBitcast(MVT::v2i16, N);
}
@@ -2596,7 +2226,7 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
SDValue B0 = DAG.getNode(ISD::OR, dl, MVT::i32, {Vs[0], T0});
SDValue B1 = DAG.getNode(ISD::OR, dl, MVT::i32, {Vs[2], T1});
- SDValue R = getNode(Hexagon::A2_combine_ll, dl, MVT::i32, {B1, B0}, DAG);
+ SDValue R = getInstr(Hexagon::A2_combine_ll, dl, MVT::i32, {B1, B0}, DAG);
return DAG.getBitcast(MVT::v4i8, R);
}
@@ -2651,7 +2281,7 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,
uint64_t Mask = (ElemTy == MVT::i8) ? 0xFFull
: (ElemTy == MVT::i16) ? 0xFFFFull : 0xFFFFFFFFull;
for (unsigned i = 0; i != Num; ++i)
- Val = (Val << W) | (Consts[i]->getZExtValue() & Mask);
+ Val = (Val << W) | (Consts[Num-1-i]->getZExtValue() & Mask);
SDValue V0 = DAG.getConstant(Val, dl, MVT::i64);
return DAG.getBitcast(VecTy, V0);
}
@@ -2677,8 +2307,56 @@ HexagonTargetLowering::extractVector(SDValue VecV, SDValue IdxV,
unsigned VecWidth = VecTy.getSizeInBits();
unsigned ValWidth = ValTy.getSizeInBits();
unsigned ElemWidth = VecTy.getVectorElementType().getSizeInBits();
- assert(VecWidth == 32 || VecWidth == 64);
assert((VecWidth % ElemWidth) == 0);
+ auto *IdxN = dyn_cast<ConstantSDNode>(IdxV);
+
+ // Special case for v{8,4,2}i1 (the only boolean vectors legal in Hexagon
+ // without any coprocessors).
+ if (ElemWidth == 1) {
+ assert(VecWidth == VecTy.getVectorNumElements() && "Sanity failure");
+ assert(VecWidth == 8 || VecWidth == 4 || VecWidth == 2);
+ // Check if this is an extract of the lowest bit.
+ if (IdxN) {
+ // Extracting the lowest bit is a no-op, but it changes the type,
+ // so it must be kept as an operation to avoid errors related to
+ // type mismatches.
+ if (IdxN->isNullValue() && ValTy.getSizeInBits() == 1)
+ return DAG.getNode(HexagonISD::TYPECAST, dl, MVT::i1, VecV);
+ }
+
+ // If the value extracted is a single bit, use tstbit.
+ if (ValWidth == 1) {
+ SDValue A0 = getInstr(Hexagon::C2_tfrpr, dl, MVT::i32, {VecV}, DAG);
+ SDValue M0 = DAG.getConstant(8 / VecWidth, dl, MVT::i32);
+ SDValue I0 = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV, M0);
+ return DAG.getNode(HexagonISD::TSTBIT, dl, MVT::i1, A0, I0);
+ }
+
+ // Each bool vector (v2i1, v4i1, v8i1) always occupies 8 bits in
+ // a predicate register. The elements of the vector are repeated
+ // in the register (if necessary) so that the total number is 8.
+ // The extracted subvector will need to be expanded in such a way.
+ unsigned Scale = VecWidth / ValWidth;
+
+ // Generate (p2d VecV) >> 8*Idx to move the interesting bytes to
+ // position 0.
+ assert(ty(IdxV) == MVT::i32);
+ SDValue S0 = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
+ DAG.getConstant(8*Scale, dl, MVT::i32));
+ SDValue T0 = DAG.getNode(HexagonISD::P2D, dl, MVT::i64, VecV);
+ SDValue T1 = DAG.getNode(ISD::SRL, dl, MVT::i64, T0, S0);
+ while (Scale > 1) {
+ // The longest possible subvector is at most 32 bits, so it is always
+ // contained in the low subregister.
+ T1 = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, T1);
+ T1 = expandPredicate(T1, dl, DAG);
+ Scale /= 2;
+ }
+
+ return DAG.getNode(HexagonISD::D2P, dl, ResTy, T1);
+ }
+
+ assert(VecWidth == 32 || VecWidth == 64);
// Cast everything to scalar integer types.
MVT ScalarTy = tyScalar(VecTy);
@@ -2687,8 +2365,8 @@ HexagonTargetLowering::extractVector(SDValue VecV, SDValue IdxV,
SDValue WidthV = DAG.getConstant(ValWidth, dl, MVT::i32);
SDValue ExtV;
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(IdxV)) {
- unsigned Off = C->getZExtValue() * ElemWidth;
+ if (IdxN) {
+ unsigned Off = IdxN->getZExtValue() * ElemWidth;
if (VecWidth == 64 && ValWidth == 32) {
assert(Off == 0 || Off == 32);
unsigned SubIdx = Off == 0 ? Hexagon::isub_lo : Hexagon::isub_hi;
@@ -2707,11 +2385,8 @@ HexagonTargetLowering::extractVector(SDValue VecV, SDValue IdxV,
IdxV = DAG.getZExtOrTrunc(IdxV, dl, MVT::i32);
SDValue OffV = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
DAG.getConstant(ElemWidth, dl, MVT::i32));
- // EXTRACTURP takes width/offset in a 64-bit pair.
- SDValue CombV = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
- {WidthV, OffV});
- ExtV = DAG.getNode(HexagonISD::EXTRACTURP, dl, ScalarTy,
- {VecV, CombV});
+ ExtV = DAG.getNode(HexagonISD::EXTRACTU, dl, ScalarTy,
+ {VecV, WidthV, OffV});
}
// Cast ExtV to the requested result type.
@@ -2725,6 +2400,33 @@ HexagonTargetLowering::insertVector(SDValue VecV, SDValue ValV, SDValue IdxV,
const SDLoc &dl, MVT ValTy,
SelectionDAG &DAG) const {
MVT VecTy = ty(VecV);
+ if (VecTy.getVectorElementType() == MVT::i1) {
+ MVT ValTy = ty(ValV);
+ assert(ValTy.getVectorElementType() == MVT::i1);
+ SDValue ValR = DAG.getNode(HexagonISD::P2D, dl, MVT::i64, ValV);
+ unsigned VecLen = VecTy.getVectorNumElements();
+ unsigned Scale = VecLen / ValTy.getVectorNumElements();
+ assert(Scale > 1);
+
+ for (unsigned R = Scale; R > 1; R /= 2) {
+ ValR = contractPredicate(ValR, dl, DAG);
+ ValR = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
+ DAG.getUNDEF(MVT::i32), ValR);
+ }
+ // The longest possible subvector is at most 32 bits, so it is always
+ // contained in the low subregister.
+ ValR = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, ValR);
+
+ unsigned ValBytes = 64 / Scale;
+ SDValue Width = DAG.getConstant(ValBytes*8, dl, MVT::i32);
+ SDValue Idx = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
+ DAG.getConstant(8, dl, MVT::i32));
+ SDValue VecR = DAG.getNode(HexagonISD::P2D, dl, MVT::i64, VecV);
+ SDValue Ins = DAG.getNode(HexagonISD::INSERT, dl, MVT::i32,
+ {VecR, ValR, Width, Idx});
+ return DAG.getNode(HexagonISD::D2P, dl, VecTy, Ins);
+ }
+
unsigned VecWidth = VecTy.getSizeInBits();
unsigned ValWidth = ValTy.getSizeInBits();
assert(VecWidth == 32 || VecWidth == 64);
@@ -2752,17 +2454,32 @@ HexagonTargetLowering::insertVector(SDValue VecV, SDValue ValV, SDValue IdxV,
if (ty(IdxV) != MVT::i32)
IdxV = DAG.getZExtOrTrunc(IdxV, dl, MVT::i32);
SDValue OffV = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV, WidthV);
- // INSERTRP takes width/offset in a 64-bit pair.
- SDValue CombV = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
- {WidthV, OffV});
- InsV = DAG.getNode(HexagonISD::INSERTRP, dl, ScalarTy,
- {VecV, ValV, CombV});
+ InsV = DAG.getNode(HexagonISD::INSERT, dl, ScalarTy,
+ {VecV, ValV, WidthV, OffV});
}
return DAG.getNode(ISD::BITCAST, dl, VecTy, InsV);
}
SDValue
+HexagonTargetLowering::expandPredicate(SDValue Vec32, const SDLoc &dl,
+ SelectionDAG &DAG) const {
+ assert(ty(Vec32).getSizeInBits() == 32);
+ if (isUndef(Vec32))
+ return DAG.getUNDEF(MVT::i64);
+ return getInstr(Hexagon::S2_vsxtbh, dl, MVT::i64, {Vec32}, DAG);
+}
+
+SDValue
+HexagonTargetLowering::contractPredicate(SDValue Vec64, const SDLoc &dl,
+ SelectionDAG &DAG) const {
+ assert(ty(Vec64).getSizeInBits() == 64);
+ if (isUndef(Vec64))
+ return DAG.getUNDEF(MVT::i32);
+ return getInstr(Hexagon::S2_vtrunehb, dl, MVT::i32, {Vec64}, DAG);
+}
+
+SDValue
HexagonTargetLowering::getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG)
const {
if (Ty.isVector()) {
@@ -2784,18 +2501,34 @@ SDValue
HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
MVT VecTy = ty(Op);
unsigned BW = VecTy.getSizeInBits();
-
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy, true))
- return LowerHvxBuildVector(Op, DAG);
-
- if (BW == 32 || BW == 64) {
- const SDLoc &dl(Op);
- SmallVector<SDValue,8> Ops;
- for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i)
- Ops.push_back(Op.getOperand(i));
- if (BW == 32)
- return buildVector32(Ops, dl, VecTy, DAG);
+ const SDLoc &dl(Op);
+ SmallVector<SDValue,8> Ops;
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i)
+ Ops.push_back(Op.getOperand(i));
+
+ if (BW == 32)
+ return buildVector32(Ops, dl, VecTy, DAG);
+ if (BW == 64)
return buildVector64(Ops, dl, VecTy, DAG);
+
+ if (VecTy == MVT::v8i1 || VecTy == MVT::v4i1 || VecTy == MVT::v2i1) {
+ // For each i1 element in the resulting predicate register, put 1
+ // shifted by the index of the element into a general-purpose register,
+ // then or them together and transfer it back into a predicate register.
+ SDValue Rs[8];
+ SDValue Z = getZero(dl, MVT::i32, DAG);
+ // Always produce 8 bits, repeat inputs if necessary.
+ unsigned Rep = 8 / VecTy.getVectorNumElements();
+ for (unsigned i = 0; i != 8; ++i) {
+ SDValue S = DAG.getConstant(1ull << i, dl, MVT::i32);
+ Rs[i] = DAG.getSelect(dl, MVT::i32, Ops[i/Rep], S, Z);
+ }
+ for (ArrayRef<SDValue> A(Rs); A.size() != 1; A = A.drop_back(A.size()/2)) {
+ for (unsigned i = 0, e = A.size()/2; i != e; ++i)
+ Rs[i] = DAG.getNode(ISD::OR, dl, MVT::i32, Rs[2*i], Rs[2*i+1]);
+ }
+ // Move the value directly to a predicate register.
+ return getInstr(Hexagon::C2_tfrrp, dl, VecTy, {Rs[0]}, DAG);
}
return SDValue();
@@ -2805,14 +2538,64 @@ SDValue
HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {
MVT VecTy = ty(Op);
- assert(!Subtarget.useHVXOps() || !Subtarget.isHVXVectorType(VecTy));
-
+ const SDLoc &dl(Op);
if (VecTy.getSizeInBits() == 64) {
assert(Op.getNumOperands() == 2);
- return DAG.getNode(HexagonISD::COMBINE, SDLoc(Op), VecTy, Op.getOperand(1),
+ return DAG.getNode(HexagonISD::COMBINE, dl, VecTy, Op.getOperand(1),
Op.getOperand(0));
}
+ MVT ElemTy = VecTy.getVectorElementType();
+ if (ElemTy == MVT::i1) {
+ assert(VecTy == MVT::v2i1 || VecTy == MVT::v4i1 || VecTy == MVT::v8i1);
+ MVT OpTy = ty(Op.getOperand(0));
+ // Scale is how many times the operands need to be contracted to match
+ // the representation in the target register.
+ unsigned Scale = VecTy.getVectorNumElements() / OpTy.getVectorNumElements();
+ assert(Scale == Op.getNumOperands() && Scale > 1);
+
+ // First, convert all bool vectors to integers, then generate pairwise
+ // inserts to form values of doubled length. Up until there are only
+ // two values left to concatenate, all of these values will fit in a
+ // 32-bit integer, so keep them as i32 to use 32-bit inserts.
+ SmallVector<SDValue,4> Words[2];
+ unsigned IdxW = 0;
+
+ for (SDValue P : Op.getNode()->op_values()) {
+ SDValue W = DAG.getNode(HexagonISD::P2D, dl, MVT::i64, P);
+ for (unsigned R = Scale; R > 1; R /= 2) {
+ W = contractPredicate(W, dl, DAG);
+ W = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
+ DAG.getUNDEF(MVT::i32), W);
+ }
+ W = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, W);
+ Words[IdxW].push_back(W);
+ }
+
+ while (Scale > 2) {
+ SDValue WidthV = DAG.getConstant(64 / Scale, dl, MVT::i32);
+ Words[IdxW ^ 1].clear();
+
+ for (unsigned i = 0, e = Words[IdxW].size(); i != e; i += 2) {
+ SDValue W0 = Words[IdxW][i], W1 = Words[IdxW][i+1];
+ // Insert W1 into W0 right next to the significant bits of W0.
+ SDValue T = DAG.getNode(HexagonISD::INSERT, dl, MVT::i32,
+ {W0, W1, WidthV, WidthV});
+ Words[IdxW ^ 1].push_back(T);
+ }
+ IdxW ^= 1;
+ Scale /= 2;
+ }
+
+ // Another sanity check. At this point there should only be two words
+ // left, and Scale should be 2.
+ assert(Scale == 2 && Words[IdxW].size() == 2);
+
+ SDValue WW = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
+ Words[IdxW][1], Words[IdxW][0]);
+ return DAG.getNode(HexagonISD::D2P, dl, VecTy, WW);
+ }
+
return SDValue();
}
@@ -2820,10 +2603,6 @@ SDValue
HexagonTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDValue Vec = Op.getOperand(0);
- MVT VecTy = ty(Vec);
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
- return LowerHvxExtractElement(Op, DAG);
-
MVT ElemTy = ty(Vec).getVectorElementType();
return extractVector(Vec, Op.getOperand(1), SDLoc(Op), ElemTy, ty(Op), DAG);
}
@@ -2831,31 +2610,20 @@ HexagonTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue
HexagonTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
- SDValue Vec = Op.getOperand(0);
- MVT VecTy = ty(Vec);
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
- return LowerHvxExtractSubvector(Op, DAG);
-
- return extractVector(Vec, Op.getOperand(1), SDLoc(Op), ty(Op), ty(Op), DAG);
+ return extractVector(Op.getOperand(0), Op.getOperand(1), SDLoc(Op),
+ ty(Op), ty(Op), DAG);
}
SDValue
HexagonTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
- MVT VecTy = ty(Op);
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
- return LowerHvxInsertElement(Op, DAG);
-
return insertVector(Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
- SDLoc(Op), VecTy.getVectorElementType(), DAG);
+ SDLoc(Op), ty(Op).getVectorElementType(), DAG);
}
SDValue
HexagonTargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(ty(Op)))
- return LowerHvxInsertSubvector(Op, DAG);
-
SDValue ValV = Op.getOperand(1);
return insertVector(Op.getOperand(0), ValV, Op.getOperand(2),
SDLoc(Op), ty(ValV), DAG);
@@ -2875,6 +2643,109 @@ HexagonTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
}
SDValue
+HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG)
+ const {
+ LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
+ unsigned HaveAlign = LN->getAlignment();
+ MVT LoadTy = ty(Op);
+ unsigned NeedAlign = Subtarget.getTypeAlignment(LoadTy);
+ if (HaveAlign >= NeedAlign)
+ return Op;
+
+ const SDLoc &dl(Op);
+ const DataLayout &DL = DAG.getDataLayout();
+ LLVMContext &Ctx = *DAG.getContext();
+ unsigned AS = LN->getAddressSpace();
+
+ // If load aligning is disabled, or if the load can be broken up into two
+ // smaller legal loads, use the default (target-independent) expansion.
+ bool DoDefault = false;
+ // Handle it in the default way if this is an indexed load.
+ if (!LN->isUnindexed())
+ DoDefault = true;
+
+ if (!AlignLoads) {
+ if (allowsMemoryAccess(Ctx, DL, LN->getMemoryVT(), AS, HaveAlign))
+ return Op;
+ DoDefault = true;
+ }
+ if (!DoDefault && 2*HaveAlign == NeedAlign) {
+ // The PartTy is the equivalent of "getLoadableTypeOfSize(HaveAlign)".
+ MVT PartTy = HaveAlign <= 8 ? MVT::getIntegerVT(8*HaveAlign)
+ : MVT::getVectorVT(MVT::i8, HaveAlign);
+ DoDefault = allowsMemoryAccess(Ctx, DL, PartTy, AS, HaveAlign);
+ }
+ if (DoDefault) {
+ std::pair<SDValue, SDValue> P = expandUnalignedLoad(LN, DAG);
+ return DAG.getMergeValues({P.first, P.second}, dl);
+ }
+
+ // The code below generates two loads, both aligned as NeedAlign, and
+ // with the distance of NeedAlign between them. For that to cover the
+ // bits that need to be loaded (and without overlapping), the size of
+ // the loads should be equal to NeedAlign. This is true for all loadable
+ // types, but add an assertion in case something changes in the future.
+ assert(LoadTy.getSizeInBits() == 8*NeedAlign);
+
+ unsigned LoadLen = NeedAlign;
+ SDValue Base = LN->getBasePtr();
+ SDValue Chain = LN->getChain();
+ auto BO = getBaseAndOffset(Base);
+ unsigned BaseOpc = BO.first.getOpcode();
+ if (BaseOpc == HexagonISD::VALIGNADDR && BO.second % LoadLen == 0)
+ return Op;
+
+ if (BO.second % LoadLen != 0) {
+ BO.first = DAG.getNode(ISD::ADD, dl, MVT::i32, BO.first,
+ DAG.getConstant(BO.second % LoadLen, dl, MVT::i32));
+ BO.second -= BO.second % LoadLen;
+ }
+ SDValue BaseNoOff = (BaseOpc != HexagonISD::VALIGNADDR)
+ ? DAG.getNode(HexagonISD::VALIGNADDR, dl, MVT::i32, BO.first,
+ DAG.getConstant(NeedAlign, dl, MVT::i32))
+ : BO.first;
+ SDValue Base0 = DAG.getMemBasePlusOffset(BaseNoOff, BO.second, dl);
+ SDValue Base1 = DAG.getMemBasePlusOffset(BaseNoOff, BO.second+LoadLen, dl);
+
+ MachineMemOperand *WideMMO = nullptr;
+ if (MachineMemOperand *MMO = LN->getMemOperand()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ WideMMO = MF.getMachineMemOperand(MMO->getPointerInfo(), MMO->getFlags(),
+ 2*LoadLen, LoadLen, MMO->getAAInfo(), MMO->getRanges(),
+ MMO->getSyncScopeID(), MMO->getOrdering(),
+ MMO->getFailureOrdering());
+ }
+
+ SDValue Load0 = DAG.getLoad(LoadTy, dl, Chain, Base0, WideMMO);
+ SDValue Load1 = DAG.getLoad(LoadTy, dl, Chain, Base1, WideMMO);
+
+ SDValue Aligned = DAG.getNode(HexagonISD::VALIGN, dl, LoadTy,
+ {Load1, Load0, BaseNoOff.getOperand(0)});
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load0.getValue(1), Load1.getValue(1));
+ SDValue M = DAG.getMergeValues({Aligned, NewChain}, dl);
+ return M;
+}
+
+SDValue
+HexagonTargetLowering::LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const {
+ const SDLoc &dl(Op);
+ unsigned Opc = Op.getOpcode();
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1), C = Op.getOperand(2);
+
+ if (Opc == ISD::ADDCARRY)
+ return DAG.getNode(HexagonISD::ADDC, dl, Op.getNode()->getVTList(),
+ { X, Y, C });
+
+ EVT CarryTy = C.getValueType();
+ SDValue SubC = DAG.getNode(HexagonISD::SUBC, dl, Op.getNode()->getVTList(),
+ { X, Y, DAG.getLogicalNOT(dl, C, CarryTy) });
+ SDValue Out[] = { SubC.getValue(0),
+ DAG.getLogicalNOT(dl, SubC.getValue(1), CarryTy) };
+ return DAG.getMergeValues(Out, dl);
+}
+
+SDValue
HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
@@ -2904,6 +2775,17 @@ HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue
HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
+
+ // Handle INLINEASM first.
+ if (Opc == ISD::INLINEASM)
+ return LowerINLINEASM(Op, DAG);
+
+ if (isHvxOperation(Op)) {
+ // If HVX lowering returns nothing, try the default lowering.
+ if (SDValue V = LowerHvxOperation(Op, DAG))
+ return V;
+ }
+
switch (Opc) {
default:
#ifndef NDEBUG
@@ -2919,13 +2801,17 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::BITCAST: return LowerBITCAST(Op, DAG);
+ case ISD::LOAD: return LowerUnalignedLoad(Op, DAG);
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY: return LowerAddSubCarry(Op, DAG);
case ISD::SRA:
case ISD::SHL:
case ISD::SRL: return LowerVECTOR_SHIFT(Op, DAG);
+ case ISD::ROTL: return LowerROTL(Op, DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
- // Frame & Return address. Currently unimplemented.
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
@@ -2939,17 +2825,35 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
- case ISD::INLINEASM: return LowerINLINEASM(Op, DAG);
case ISD::PREFETCH: return LowerPREFETCH(Op, DAG);
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG);
- case ISD::MUL:
- if (Subtarget.useHVXOps())
- return LowerHvxMul(Op, DAG);
break;
}
+
return SDValue();
}
+void
+HexagonTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ const SDLoc &dl(N);
+ switch (N->getOpcode()) {
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::SHL:
+ return;
+ case ISD::BITCAST:
+ // Handle a bitcast from v8i1 to i8.
+ if (N->getValueType(0) == MVT::i8) {
+ SDValue P = getInstr(Hexagon::C2_tfrpr, dl, MVT::i32,
+ N->getOperand(0), DAG);
+ Results.push_back(P);
+ }
+ break;
+ }
+}
+
/// Returns relocation base for the given PIC jumptable.
SDValue
HexagonTargetLowering::getPICJumpTableRelocBase(SDValue Table,
@@ -3023,7 +2927,7 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(
case 512:
return {0u, &Hexagon::HvxVRRegClass};
case 1024:
- if (Subtarget.hasV60TOps() && Subtarget.useHVX128BOps())
+ if (Subtarget.hasV60Ops() && Subtarget.useHVX128BOps())
return {0u, &Hexagon::HvxVRRegClass};
return {0u, &Hexagon::HvxWRRegClass};
case 2048:
@@ -3042,7 +2946,7 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
- return Subtarget.hasV5TOps();
+ return Subtarget.hasV5Ops();
}
/// isLegalAddressingMode - Return true if the addressing mode represented by
@@ -3104,9 +3008,9 @@ bool HexagonTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
bool HexagonTargetLowering::IsEligibleForTailCallOptimization(
SDValue Callee,
CallingConv::ID CalleeCC,
- bool isVarArg,
- bool isCalleeStructRet,
- bool isCallerStructRet,
+ bool IsVarArg,
+ bool IsCalleeStructRet,
+ bool IsCallerStructRet,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -3137,12 +3041,12 @@ bool HexagonTargetLowering::IsEligibleForTailCallOptimization(
}
// Do not tail call optimize vararg calls.
- if (isVarArg)
+ if (IsVarArg)
return false;
// Also avoid tail call optimization if either caller or callee uses struct
// return semantics.
- if (isCalleeStructRet || isCallerStructRet)
+ if (IsCalleeStructRet || IsCallerStructRet)
return false;
// In addition to the cases above, we also disable Tail Call Optimization if
@@ -3185,54 +3089,25 @@ bool HexagonTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned AS, unsigned Align, bool *Fast) const {
if (Fast)
*Fast = false;
-
- switch (VT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::v64i8:
- case MVT::v128i8:
- case MVT::v256i8:
- case MVT::v32i16:
- case MVT::v64i16:
- case MVT::v128i16:
- case MVT::v16i32:
- case MVT::v32i32:
- case MVT::v64i32:
- return true;
- }
- return false;
+ return Subtarget.isHVXVectorType(VT.getSimpleVT());
}
std::pair<const TargetRegisterClass*, uint8_t>
HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const {
- const TargetRegisterClass *RRC = nullptr;
+ if (Subtarget.isHVXVectorType(VT, true)) {
+ unsigned BitWidth = VT.getSizeInBits();
+ unsigned VecWidth = Subtarget.getVectorLength() * 8;
- uint8_t Cost = 1;
- switch (VT.SimpleTy) {
- default:
- return TargetLowering::findRepresentativeClass(TRI, VT);
- case MVT::v64i8:
- case MVT::v32i16:
- case MVT::v16i32:
- RRC = &Hexagon::HvxVRRegClass;
- break;
- case MVT::v128i8:
- case MVT::v64i16:
- case MVT::v32i32:
- if (Subtarget.hasV60TOps() && Subtarget.useHVXOps() &&
- Subtarget.useHVX128BOps())
- RRC = &Hexagon::HvxVRRegClass;
- else
- RRC = &Hexagon::HvxWRRegClass;
- break;
- case MVT::v256i8:
- case MVT::v128i16:
- case MVT::v64i32:
- RRC = &Hexagon::HvxWRRegClass;
- break;
+ if (VT.getVectorElementType() == MVT::i1)
+ return std::make_pair(&Hexagon::HvxQRRegClass, 1);
+ if (BitWidth == VecWidth)
+ return std::make_pair(&Hexagon::HvxVRRegClass, 1);
+ assert(BitWidth == 2 * VecWidth);
+ return std::make_pair(&Hexagon::HvxWRRegClass, 1);
}
- return std::make_pair(RRC, Cost);
+
+ return TargetLowering::findRepresentativeClass(TRI, VT);
}
Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,