40 files changed, 1540 insertions, 225 deletions
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 63c11e237d6c..1bd5d7a6c1d7 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -3279,10 +3279,10 @@ public:
   /// Return a new OMPTraitInfo object owned by this context.
   OMPTraitInfo &getNewOMPTraitInfo();
 
-  /// Whether a C++ static variable may be externalized.
+  /// Whether a C++ static variable or CUDA/HIP kernel may be externalized.
   bool mayExternalizeStaticVar(const Decl *D) const;
 
-  /// Whether a C++ static variable should be externalized.
+  /// Whether a C++ static variable or CUDA/HIP kernel should be externalized.
   bool shouldExternalizeStaticVar(const Decl *D) const;
 
   StringRef getCUIDHash() const;
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 5fa2d46de89b..e4b3827b8714 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -8547,21 +8547,18 @@ static TypedefDecl *CreateVoidPtrBuiltinVaListDecl(const ASTContext *Context) {
 
 static TypedefDecl *
 CreateAArch64ABIBuiltinVaListDecl(const ASTContext *Context) {
+  // struct __va_list
   RecordDecl *VaListTagDecl = Context->buildImplicitRecord("__va_list");
-  // namespace std { struct __va_list {
-  // Note that we create the namespace even in C. This is intentional so that
-  // the type is consistent between C and C++, which is important in cases where
-  // the types need to match between translation units (e.g. with
-  // -fsanitize=cfi-icall). Ideally we wouldn't have created this namespace at
-  // all, but it's now part of the ABI (e.g. in mangled names), so we can't
-  // change it.
-  auto *NS = NamespaceDecl::Create(
-      const_cast<ASTContext &>(*Context), Context->getTranslationUnitDecl(),
-      /*Inline*/ false, SourceLocation(), SourceLocation(),
-      &Context->Idents.get("std"),
-      /*PrevDecl*/ nullptr);
-  NS->setImplicit();
-  VaListTagDecl->setDeclContext(NS);
+  if (Context->getLangOpts().CPlusPlus) {
+    // namespace std { struct __va_list {
+    auto *NS = NamespaceDecl::Create(
+        const_cast<ASTContext &>(*Context), Context->getTranslationUnitDecl(),
+        /*Inline*/ false, SourceLocation(), SourceLocation(),
+        &Context->Idents.get("std"),
+        /*PrevDecl*/ nullptr);
+    NS->setImplicit();
+    VaListTagDecl->setDeclContext(NS);
+  }
 
   VaListTagDecl->startDefinition();
 
@@ -12266,14 +12263,18 @@ bool ASTContext::mayExternalizeStaticVar(const Decl *D) const {
                              (D->hasAttr<CUDAConstantAttr>() &&
                               !D->getAttr<CUDAConstantAttr>()->isImplicit());
   // CUDA/HIP: static managed variables need to be externalized since it is
-  // a declaration in IR, therefore cannot have internal linkage.
-  return IsStaticVar &&
-         (D->hasAttr<HIPManagedAttr>() || IsExplicitDeviceVar);
+  // a declaration in IR, therefore cannot have internal linkage. Kernels in
+  // anonymous name space needs to be externalized to avoid duplicate symbols.
+  return (IsStaticVar &&
+          (D->hasAttr<HIPManagedAttr>() || IsExplicitDeviceVar)) ||
+         (D->hasAttr<CUDAGlobalAttr>() &&
+          basicGVALinkageForFunction(*this, cast<FunctionDecl>(D)) ==
+              GVA_Internal);
 }
 
 bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const {
   return mayExternalizeStaticVar(D) &&
-         (D->hasAttr<HIPManagedAttr>() ||
+         (D->hasAttr<HIPManagedAttr>() || D->hasAttr<CUDAGlobalAttr>() ||
           CUDADeviceVarODRUsedByHost.count(cast<VarDecl>(D)));
 }
 
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 2e734e2b28cd..68d4d1271cdb 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -40,65 +40,10 @@ using namespace clang;
 
 namespace {
 
-/// Retrieve the declaration context that should be used when mangling the given
-/// declaration.
-static const DeclContext *getEffectiveDeclContext(const Decl *D) {
-  // The ABI assumes that lambda closure types that occur within
-  // default arguments live in the context of the function. However, due to
-  // the way in which Clang parses and creates function declarations, this is
-  // not the case: the lambda closure type ends up living in the context
-  // where the function itself resides, because the function declaration itself
-  // had not yet been created. Fix the context here.
-  if (const CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(D)) {
-    if (RD->isLambda())
-      if (ParmVarDecl *ContextParam
-            = dyn_cast_or_null<ParmVarDecl>(RD->getLambdaContextDecl()))
-        return ContextParam->getDeclContext();
-  }
-
-  // Perform the same check for block literals.
-  if (const BlockDecl *BD = dyn_cast<BlockDecl>(D)) {
-    if (ParmVarDecl *ContextParam
-          = dyn_cast_or_null<ParmVarDecl>(BD->getBlockManglingContextDecl()))
-      return ContextParam->getDeclContext();
-  }
-
-  const DeclContext *DC = D->getDeclContext();
-  if (isa<CapturedDecl>(DC) || isa<OMPDeclareReductionDecl>(DC) ||
-      isa<OMPDeclareMapperDecl>(DC)) {
-    return getEffectiveDeclContext(cast<Decl>(DC));
-  }
-
-  if (const auto *VD = dyn_cast<VarDecl>(D))
-    if (VD->isExternC())
-      return VD->getASTContext().getTranslationUnitDecl();
-
-  if (const auto *FD = dyn_cast<FunctionDecl>(D))
-    if (FD->isExternC())
-      return FD->getASTContext().getTranslationUnitDecl();
-
-  return DC->getRedeclContext();
-}
-
-static const DeclContext *getEffectiveParentContext(const DeclContext *DC) {
-  return getEffectiveDeclContext(cast<Decl>(DC));
-}
-
 static bool isLocalContainerContext(const DeclContext *DC) {
   return isa<FunctionDecl>(DC) || isa<ObjCMethodDecl>(DC) || isa<BlockDecl>(DC);
 }
 
-static const RecordDecl *GetLocalClassDecl(const Decl *D) {
-  const DeclContext *DC = getEffectiveDeclContext(D);
-  while (!DC->isNamespace() && !DC->isTranslationUnit()) {
-    if (isLocalContainerContext(DC))
-      return dyn_cast<RecordDecl>(D);
-    D = cast<Decl>(DC);
-    DC = getEffectiveDeclContext(D);
-  }
-  return nullptr;
-}
-
 static const FunctionDecl *getStructor(const FunctionDecl *fn) {
   if (const FunctionTemplateDecl *ftd = fn->getPrimaryTemplate())
     return ftd->getTemplatedDecl();
@@ -126,6 +71,7 @@ class ItaniumMangleContextImpl : public ItaniumMangleContext {
   llvm::DenseMap<DiscriminatorKeyTy, unsigned> Discriminator;
   llvm::DenseMap<const NamedDecl*, unsigned> Uniquifier;
   const DiscriminatorOverrideTy DiscriminatorOverride = nullptr;
+  NamespaceDecl *StdNamespace = nullptr;
 
   bool NeedsUniqueInternalLinkageNames = false;
 
@@ -249,6 +195,16 @@ public:
     return DiscriminatorOverride;
   }
 
+  NamespaceDecl *getStdNamespace();
+
+  const DeclContext *getEffectiveDeclContext(const Decl *D);
+  const DeclContext *getEffectiveParentContext(const DeclContext *DC) {
+    return getEffectiveDeclContext(cast<Decl>(DC));
+  }
+
+  bool isInternalLinkageDecl(const NamedDecl *ND);
+  const DeclContext *IgnoreLinkageSpecDecls(const DeclContext *DC);
+
   /// @}
 };
 
@@ -427,6 +383,15 @@ class CXXNameMangler {
 
   ASTContext &getASTContext() const { return Context.getASTContext(); }
 
+  bool isStd(const NamespaceDecl *NS);
+  bool isStdNamespace(const DeclContext *DC);
+
+  const RecordDecl *GetLocalClassDecl(const Decl *D);
+  const DeclContext *IgnoreLinkageSpecDecls(const DeclContext *DC);
+  bool isSpecializedAs(QualType S, llvm::StringRef Name, QualType A);
+  bool isStdCharSpecialization(const ClassTemplateSpecializationDecl *SD,
+                               llvm::StringRef Name, bool HasAllocator);
+
 public:
   CXXNameMangler(ItaniumMangleContextImpl &C, raw_ostream &Out_,
                  const NamedDecl *D = nullptr, bool NullOut_ = false)
@@ -628,7 +593,71 @@ private:
 
 }
 
-static bool isInternalLinkageDecl(const NamedDecl *ND) {
+NamespaceDecl *ItaniumMangleContextImpl::getStdNamespace() {
+  if (!StdNamespace) {
+    StdNamespace = NamespaceDecl::Create(
+        getASTContext(), getASTContext().getTranslationUnitDecl(),
+        /*Inline*/ false, SourceLocation(), SourceLocation(),
+        &getASTContext().Idents.get("std"),
+        /*PrevDecl*/ nullptr);
+    StdNamespace->setImplicit();
+  }
+  return StdNamespace;
+}
+
+/// Retrieve the declaration context that should be used when mangling the given
+/// declaration.
+const DeclContext *
+ItaniumMangleContextImpl::getEffectiveDeclContext(const Decl *D) {
+  // The ABI assumes that lambda closure types that occur within
+  // default arguments live in the context of the function. However, due to
+  // the way in which Clang parses and creates function declarations, this is
+  // not the case: the lambda closure type ends up living in the context
+  // where the function itself resides, because the function declaration itself
+  // had not yet been created. Fix the context here.
+  if (const CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(D)) {
+    if (RD->isLambda())
+      if (ParmVarDecl *ContextParam =
+              dyn_cast_or_null<ParmVarDecl>(RD->getLambdaContextDecl()))
+        return ContextParam->getDeclContext();
+  }
+
+  // Perform the same check for block literals.
+  if (const BlockDecl *BD = dyn_cast<BlockDecl>(D)) {
+    if (ParmVarDecl *ContextParam =
+            dyn_cast_or_null<ParmVarDecl>(BD->getBlockManglingContextDecl()))
+      return ContextParam->getDeclContext();
+  }
+
+  // On ARM and AArch64, the va_list tag is always mangled as if in the std
+  // namespace. We do not represent va_list as actually being in the std
+  // namespace in C because this would result in incorrect debug info in C,
+  // among other things. It is important for both languages to have the same
+  // mangling in order for -fsanitize=cfi-icall to work.
+  if (D == getASTContext().getVaListTagDecl()) {
+    const llvm::Triple &T = getASTContext().getTargetInfo().getTriple();
+    if (T.isARM() || T.isThumb() || T.isAArch64())
+      return getStdNamespace();
+  }
+
+  const DeclContext *DC = D->getDeclContext();
+  if (isa<CapturedDecl>(DC) || isa<OMPDeclareReductionDecl>(DC) ||
+      isa<OMPDeclareMapperDecl>(DC)) {
+    return getEffectiveDeclContext(cast<Decl>(DC));
+  }
+
+  if (const auto *VD = dyn_cast<VarDecl>(D))
+    if (VD->isExternC())
+      return getASTContext().getTranslationUnitDecl();
+
+  if (const auto *FD = dyn_cast<FunctionDecl>(D))
+    if (FD->isExternC())
+      return getASTContext().getTranslationUnitDecl();
+
+  return DC->getRedeclContext();
+}
+
+bool ItaniumMangleContextImpl::isInternalLinkageDecl(const NamedDecl *ND) {
   if (ND && ND->getFormalLinkage() == InternalLinkage &&
       !ND->isExternallyVisible() &&
       getEffectiveDeclContext(ND)->isFileContext() &&
@@ -862,18 +891,9 @@ void CXXNameMangler::mangleFunctionEncodingBareType(const FunctionDecl *FD) {
                          MangleReturnType, FD);
 }
 
-static const DeclContext *IgnoreLinkageSpecDecls(const DeclContext *DC) {
-  while (isa<LinkageSpecDecl>(DC)) {
-    DC = getEffectiveParentContext(DC);
-  }
-
-  return DC;
-}
-
 /// Return whether a given namespace is the 'std' namespace.
-static bool isStd(const NamespaceDecl *NS) {
-  if (!IgnoreLinkageSpecDecls(getEffectiveParentContext(NS))
-                                ->isTranslationUnit())
+bool CXXNameMangler::isStd(const NamespaceDecl *NS) {
+  if (!Context.getEffectiveParentContext(NS)->isTranslationUnit())
     return false;
 
   const IdentifierInfo *II = NS->getOriginalNamespace()->getIdentifier();
@@ -882,7 +902,7 @@ static bool isStd(const NamespaceDecl *NS) {
 
 // isStdNamespace - Return whether a given decl context is a toplevel 'std'
 // namespace.
-static bool isStdNamespace(const DeclContext *DC) {
+bool CXXNameMangler::isStdNamespace(const DeclContext *DC) {
   if (!DC->isNamespace())
     return false;
 
@@ -956,6 +976,17 @@ void CXXNameMangler::mangleName(GlobalDecl GD) {
   }
 }
 
+const RecordDecl *CXXNameMangler::GetLocalClassDecl(const Decl *D) {
+  const DeclContext *DC = Context.getEffectiveDeclContext(D);
+  while (!DC->isNamespace() && !DC->isTranslationUnit()) {
+    if (isLocalContainerContext(DC))
+      return dyn_cast<RecordDecl>(D);
+    D = cast<Decl>(DC);
+    DC = Context.getEffectiveDeclContext(D);
+  }
+  return nullptr;
+}
+
 void CXXNameMangler::mangleNameWithAbiTags(GlobalDecl GD,
                                            const AbiTagList *AdditionalAbiTags) {
   const NamedDecl *ND = cast<NamedDecl>(GD.getDecl());
@@ -964,7 +995,7 @@ void CXXNameMangler::mangleNameWithAbiTags(GlobalDecl GD,
   //         ::= [<module-name>] <unscoped-template-name> <template-args>
   //         ::= <local-name>
   //
-  const DeclContext *DC = getEffectiveDeclContext(ND);
+  const DeclContext *DC = Context.getEffectiveDeclContext(ND);
 
   // If this is an extern variable declared locally, the relevant DeclContext
   // is that of the containing namespace, or the translation unit.
@@ -972,13 +1003,13 @@ void CXXNameMangler::mangleNameWithAbiTags(GlobalDecl GD,
   // a proper semantic declaration context!
   if (isLocalContainerContext(DC) && ND->hasLinkage() && !isLambda(ND))
     while (!DC->isNamespace() && !DC->isTranslationUnit())
-      DC = getEffectiveParentContext(DC);
+      DC = Context.getEffectiveParentContext(DC);
   else if (GetLocalClassDecl(ND)) {
     mangleLocalName(GD, AdditionalAbiTags);
     return;
   }
 
-  DC = IgnoreLinkageSpecDecls(DC);
+  assert(!isa<LinkageSpecDecl>(DC) && "context cannot be LinkageSpecDecl");
 
   if (isLocalContainerContext(DC)) {
     mangleLocalName(GD, AdditionalAbiTags);
@@ -1054,7 +1085,7 @@ void CXXNameMangler::mangleModuleNamePrefix(StringRef Name) {
 void CXXNameMangler::mangleTemplateName(const TemplateDecl *TD,
                                         const TemplateArgument *TemplateArgs,
                                         unsigned NumTemplateArgs) {
-  const DeclContext *DC = IgnoreLinkageSpecDecls(getEffectiveDeclContext(TD));
+  const DeclContext *DC = Context.getEffectiveDeclContext(TD);
 
   if (DC->isTranslationUnit() || isStdNamespace(DC)) {
     mangleUnscopedTemplateName(TD, nullptr);
@@ -1070,7 +1101,7 @@ void CXXNameMangler::mangleUnscopedName(GlobalDecl GD,
   //  <unscoped-name> ::= <unqualified-name>
   //                  ::= St <unqualified-name>   # ::std::
 
-  if (isStdNamespace(IgnoreLinkageSpecDecls(getEffectiveDeclContext(ND))))
+  if (isStdNamespace(Context.getEffectiveDeclContext(ND)))
     Out << "St";
 
   mangleUnqualifiedName(GD, AdditionalAbiTags);
@@ -1430,7 +1461,7 @@ void CXXNameMangler::mangleUnqualifiedName(GlobalDecl GD,
       // 12_GLOBAL__N_1 mangling is quite sufficient there, and this better
       // matches GCC anyway, because GCC does not treat anonymous namespaces as
       // implying internal linkage.
-      if (isInternalLinkageDecl(ND))
+      if (Context.isInternalLinkageDecl(ND))
         Out << 'L';
 
       auto *FD = dyn_cast<FunctionDecl>(ND);
@@ -1745,7 +1776,7 @@ void CXXNameMangler::mangleLocalName(GlobalDecl GD,
   // <discriminator> := _ <non-negative number>
   assert(isa<NamedDecl>(D) || isa<BlockDecl>(D));
   const RecordDecl *RD = GetLocalClassDecl(D);
-  const DeclContext *DC = getEffectiveDeclContext(RD ? RD : D);
+  const DeclContext *DC = Context.getEffectiveDeclContext(RD ? RD : D);
 
   Out << 'Z';
 
@@ -1798,13 +1829,13 @@ void CXXNameMangler::mangleLocalName(GlobalDecl GD,
       if (const NamedDecl *PrefixND = getClosurePrefix(BD))
         mangleClosurePrefix(PrefixND, true /*NoFunction*/);
       else
-        manglePrefix(getEffectiveDeclContext(BD), true /*NoFunction*/);
+        manglePrefix(Context.getEffectiveDeclContext(BD), true /*NoFunction*/);
       assert(!AdditionalAbiTags && "Block cannot have additional abi tags");
       mangleUnqualifiedBlock(BD);
     } else {
       const NamedDecl *ND = cast<NamedDecl>(D);
-      mangleNestedName(GD, getEffectiveDeclContext(ND), AdditionalAbiTags,
-                       true /*NoFunction*/);
+      mangleNestedName(GD, Context.getEffectiveDeclContext(ND),
+                       AdditionalAbiTags, true /*NoFunction*/);
     }
   } else if (const BlockDecl *BD = dyn_cast<BlockDecl>(D)) {
     // Mangle a block in a default parameter; see above explanation for
@@ -1843,7 +1874,7 @@ void CXXNameMangler::mangleBlockForPrefix(const BlockDecl *Block) {
     mangleLocalName(Block, /* AdditionalAbiTags */ nullptr);
     return;
   }
-  const DeclContext *DC = getEffectiveDeclContext(Block);
+  const DeclContext *DC = Context.getEffectiveDeclContext(Block);
   if (isLocalContainerContext(DC)) {
     mangleLocalName(Block, /* AdditionalAbiTags */ nullptr);
     return;
@@ -2030,7 +2061,7 @@ void CXXNameMangler::manglePrefix(const DeclContext *DC, bool NoFunction) {
   //           ::= # empty
   //           ::= <substitution>
 
-  DC = IgnoreLinkageSpecDecls(DC);
+  assert(!isa<LinkageSpecDecl>(DC) && "prefix cannot be LinkageSpecDecl");
 
   if (DC->isTranslationUnit())
     return;
@@ -2053,7 +2084,7 @@ void CXXNameMangler::manglePrefix(const DeclContext *DC, bool NoFunction) {
     mangleClosurePrefix(PrefixND, NoFunction);
     mangleUnqualifiedName(ND, nullptr);
   } else {
-    manglePrefix(getEffectiveDeclContext(ND), NoFunction);
+    manglePrefix(Context.getEffectiveDeclContext(ND), NoFunction);
     mangleUnqualifiedName(ND, nullptr);
   }
 
@@ -2107,7 +2138,7 @@ void CXXNameMangler::mangleTemplatePrefix(GlobalDecl GD,
   if (const auto *TTP = dyn_cast<TemplateTemplateParmDecl>(ND)) {
     mangleTemplateParameter(TTP->getDepth(), TTP->getIndex());
   } else {
-    manglePrefix(getEffectiveDeclContext(ND), NoFunction);
+    manglePrefix(Context.getEffectiveDeclContext(ND), NoFunction);
     if (isa<BuiltinTemplateDecl>(ND) || isa<ConceptDecl>(ND))
       mangleUnqualifiedName(GD, nullptr);
     else
@@ -2152,7 +2183,7 @@ void CXXNameMangler::mangleClosurePrefix(const NamedDecl *ND, bool NoFunction) {
     mangleTemplatePrefix(TD, NoFunction);
     mangleTemplateArgs(asTemplateName(TD), *TemplateArgs);
   } else {
-    manglePrefix(getEffectiveDeclContext(ND), NoFunction);
+    manglePrefix(Context.getEffectiveDeclContext(ND), NoFunction);
     mangleUnqualifiedName(ND, nullptr);
   }
 
@@ -5969,56 +6000,61 @@ bool CXXNameMangler::mangleSubstitution(uintptr_t Ptr) {
   return true;
 }
 
-static bool isCharType(QualType T) {
-  if (T.isNull())
+/// Returns whether S is a template specialization of std::Name with a single
+/// argument of type A.
+bool CXXNameMangler::isSpecializedAs(QualType S, llvm::StringRef Name,
+                                     QualType A) {
+  if (S.isNull())
     return false;
 
-  return T->isSpecificBuiltinType(BuiltinType::Char_S) ||
-    T->isSpecificBuiltinType(BuiltinType::Char_U);
-}
-
-/// Returns whether a given type is a template specialization of a given name
-/// with a single argument of type char.
-static bool isCharSpecialization(QualType T, const char *Name) {
-  if (T.isNull())
-    return false;
-
-  const RecordType *RT = T->getAs<RecordType>();
+  const RecordType *RT = S->getAs<RecordType>();
   if (!RT)
     return false;
 
   const ClassTemplateSpecializationDecl *SD =
     dyn_cast<ClassTemplateSpecializationDecl>(RT->getDecl());
-  if (!SD)
+  if (!SD || !SD->getIdentifier()->isStr(Name))
     return false;
 
-  if (!isStdNamespace(getEffectiveDeclContext(SD)))
+  if (!isStdNamespace(Context.getEffectiveDeclContext(SD)))
     return false;
 
   const TemplateArgumentList &TemplateArgs = SD->getTemplateArgs();
   if (TemplateArgs.size() != 1)
     return false;
 
-  if (!isCharType(TemplateArgs[0].getAsType()))
+  if (TemplateArgs[0].getAsType() != A)
     return false;
 
-  return SD->getIdentifier()->getName() == Name;
+  return true;
 }
 
-template <std::size_t StrLen>
-static bool isStreamCharSpecialization(const ClassTemplateSpecializationDecl*SD,
-                                       const char (&Str)[StrLen]) {
-  if (!SD->getIdentifier()->isStr(Str))
+/// Returns whether SD is a template specialization std::Name<char,
+/// std::char_traits<char> [, std::allocator<char>]>
+/// HasAllocator controls whether the 3rd template argument is needed.
+bool CXXNameMangler::isStdCharSpecialization(
+    const ClassTemplateSpecializationDecl *SD, llvm::StringRef Name,
+    bool HasAllocator) {
+  if (!SD->getIdentifier()->isStr(Name))
     return false;
 
   const TemplateArgumentList &TemplateArgs = SD->getTemplateArgs();
-  if (TemplateArgs.size() != 2)
+  if (TemplateArgs.size() != (HasAllocator ? 3 : 2))
+    return false;
+
+  QualType A = TemplateArgs[0].getAsType();
+  if (A.isNull())
+    return false;
+  // Plain 'char' is named Char_S or Char_U depending on the target ABI.
+  if (!A->isSpecificBuiltinType(BuiltinType::Char_S) &&
+      !A->isSpecificBuiltinType(BuiltinType::Char_U))
     return false;
 
-  if (!isCharType(TemplateArgs[0].getAsType()))
+  if (!isSpecializedAs(TemplateArgs[1].getAsType(), "char_traits", A))
     return false;
 
-  if (!isCharSpecialization(TemplateArgs[1].getAsType(), "char_traits"))
+  if (HasAllocator &&
+      !isSpecializedAs(TemplateArgs[2].getAsType(), "allocator", A))
     return false;
 
   return true;
@@ -6031,10 +6067,11 @@ bool CXXNameMangler::mangleStandardSubstitution(const NamedDecl *ND) {
       Out << "St";
       return true;
     }
+    return false;
   }
 
   if (const ClassTemplateDecl *TD = dyn_cast<ClassTemplateDecl>(ND)) {
-    if (!isStdNamespace(getEffectiveDeclContext(TD)))
+    if (!isStdNamespace(Context.getEffectiveDeclContext(TD)))
       return false;
 
     // <substitution> ::= Sa # ::std::allocator
@@ -6048,56 +6085,45 @@ bool CXXNameMangler::mangleStandardSubstitution(const NamedDecl *ND) {
       Out << "Sb";
       return true;
     }
+    return false;
   }
 
   if (const ClassTemplateSpecializationDecl *SD =
         dyn_cast<ClassTemplateSpecializationDecl>(ND)) {
-    if (!isStdNamespace(getEffectiveDeclContext(SD)))
+    if (!isStdNamespace(Context.getEffectiveDeclContext(SD)))
       return false;
 
     //    <substitution> ::= Ss # ::std::basic_string<char,
     //                            ::std::char_traits<char>,
     //                            ::std::allocator<char> >
-    if (SD->getIdentifier()->isStr("basic_string")) {
-      const TemplateArgumentList &TemplateArgs = SD->getTemplateArgs();
-
-      if (TemplateArgs.size() != 3)
-        return false;
-
-      if (!isCharType(TemplateArgs[0].getAsType()))
-        return false;
-
-      if (!isCharSpecialization(TemplateArgs[1].getAsType(), "char_traits"))
-        return false;
-
-      if (!isCharSpecialization(TemplateArgs[2].getAsType(), "allocator"))
-        return false;
-
+    if (isStdCharSpecialization(SD, "basic_string", /*HasAllocator=*/true)) {
       Out << "Ss";
       return true;
     }
 
     //    <substitution> ::= Si # ::std::basic_istream<char,
     //                            ::std::char_traits<char> >
-    if (isStreamCharSpecialization(SD, "basic_istream")) {
+    if (isStdCharSpecialization(SD, "basic_istream", /*HasAllocator=*/false)) {
       Out << "Si";
       return true;
     }
 
     //    <substitution> ::= So # ::std::basic_ostream<char,
     //                            ::std::char_traits<char> >
-    if (isStreamCharSpecialization(SD, "basic_ostream")) {
+    if (isStdCharSpecialization(SD, "basic_ostream", /*HasAllocator=*/false)) {
       Out << "So";
       return true;
     }
 
     //    <substitution> ::= Sd # ::std::basic_iostream<char,
     //                            ::std::char_traits<char> >
-    if (isStreamCharSpecialization(SD, "basic_iostream")) {
+    if (isStdCharSpecialization(SD, "basic_iostream", /*HasAllocator=*/false)) {
       Out << "Sd";
       return true;
     }
+    return false;
   }
+
   return false;
 }
 
diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp
index bba323f651aa..6e827530f41b 100644
--- a/clang/lib/AST/TypePrinter.cpp
+++ b/clang/lib/AST/TypePrinter.cpp
@@ -1466,8 +1466,7 @@ void TypePrinter::printTemplateId(const TemplateSpecializationType *T,
     if (!Policy.SuppressScope)
       AppendScope(TD->getDeclContext(), OS, TD->getDeclName());
 
-    IdentifierInfo *II = TD->getIdentifier();
-    OS << II->getName();
+    OS << TD->getName();
   } else {
     T->getTemplateName().print(OS, Policy);
   }
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index e3a2f30febe7..188ffb5f2f78 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -69,11 +69,11 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) {
   // From the glibc documentation, on GNU systems, malloc guarantees 16-byte
   // alignment on 64-bit systems and 8-byte alignment on 32-bit systems. See
   // https://www.gnu.org/software/libc/manual/html_node/Malloc-Examples.html.
-  // This alignment guarantee also applies to Windows and Android. On Darwin,
-  // the alignment is 16 bytes on both 64-bit and 32-bit systems.
+  // This alignment guarantee also applies to Windows and Android. On Darwin
+  // and OpenBSD, the alignment is 16 bytes on both 64-bit and 32-bit systems.
   if (T.isGNUEnvironment() || T.isWindowsMSVCEnvironment() || T.isAndroid())
     NewAlign = Triple.isArch64Bit() ? 128 : Triple.isArch32Bit() ? 64 : 0;
-  else if (T.isOSDarwin())
+  else if (T.isOSDarwin() || T.isOSOpenBSD())
     NewAlign = 128;
   else
     NewAlign = 0; // Infer from basic type alignment.
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index c4e3f7f54f4f..414e61f25fb3 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -287,7 +287,7 @@ std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
     SmallString<256> Buffer;
     llvm::raw_svector_ostream Out(Buffer);
     Out << DeviceSideName;
-    CGM.printPostfixForExternalizedStaticVar(Out);
+    CGM.printPostfixForExternalizedDecl(Out, ND);
     DeviceSideName = std::string(Out.str());
   }
   return DeviceSideName;
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index f06d21861740..3a3acf98f309 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -1572,7 +1572,7 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) {
   llvm::Value *allocSize =
     EmitCXXNewAllocSize(*this, E, minElements, numElements,
                         allocSizeWithoutCookie);
-  CharUnits allocAlign = getContext().getPreferredTypeAlignInChars(allocType);
+  CharUnits allocAlign = getContext().getTypeAlignInChars(allocType);
 
   // Emit the allocation call.  If the allocator is a global placement
   // operator, just "inline" it directly.
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 29806b65e984..2777fc22600d 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1367,7 +1367,7 @@ static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD,
   if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
       CGM.getLangOpts().GPURelocatableDeviceCode &&
       CGM.getLangOpts().CUDAIsDevice && !CGM.getLangOpts().CUID.empty())
-    CGM.printPostfixForExternalizedStaticVar(Out);
+    CGM.printPostfixForExternalizedDecl(Out, ND);
   return std::string(Out.str());
 }
 
@@ -1455,7 +1455,7 @@ StringRef CodeGenModule::getMangledName(GlobalDecl GD) {
   // directly between host- and device-compilations, the host- and
   // device-mangling in host compilation could help catching certain ones.
   assert(!isa<FunctionDecl>(ND) || !ND->hasAttr<CUDAGlobalAttr>() ||
-         getLangOpts().CUDAIsDevice ||
+         getContext().shouldExternalizeStaticVar(ND) || getLangOpts().CUDAIsDevice ||
          (getContext().getAuxTargetInfo() &&
           (getContext().getAuxTargetInfo()->getCXXABI() !=
            getContext().getTargetInfo().getCXXABI())) ||
@@ -6645,7 +6645,14 @@ bool CodeGenModule::stopAutoInit() {
   return false;
 }
 
-void CodeGenModule::printPostfixForExternalizedStaticVar(
-    llvm::raw_ostream &OS) const {
-  OS << "__static__" << getContext().getCUIDHash();
+void CodeGenModule::printPostfixForExternalizedDecl(llvm::raw_ostream &OS,
+                                                    const Decl *D) const {
+  StringRef Tag;
+  // ptxas does not allow '.' in symbol names. On the other hand, HIP prefers
+  // postfix beginning with '.' since the symbol name can be demangled.
+  if (LangOpts.HIP)
+    Tag = (isa<VarDecl>(D) ? ".static." : ".intern.");
+  else
+    Tag = (isa<VarDecl>(D) ? "__static__" : "__intern__");
+  OS << Tag << getContext().getCUIDHash();
 }
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index 1fcd5d4d808a..a8a63c8da57f 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -1447,9 +1447,10 @@ public:
                                            TBAAAccessInfo *TBAAInfo = nullptr);
   bool stopAutoInit();
 
-  /// Print the postfix for externalized static variable for single source
-  /// offloading languages CUDA and HIP.
-  void printPostfixForExternalizedStaticVar(llvm::raw_ostream &OS) const;
+  /// Print the postfix for externalized static variable or kernels for single
+  /// source offloading languages CUDA and HIP.
+  void printPostfixForExternalizedDecl(llvm::raw_ostream &OS,
+                                       const Decl *D) const;
 
 private:
   llvm::Constant *GetOrCreateLLVMFunction(
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 43ce33750eba..9638fa2ecfca 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -510,7 +510,7 @@ void RocmInstallationDetector::AddHIPIncludeArgs(const ArgList &DriverArgs,
     return;
   }
 
-  CC1Args.push_back("-internal-isystem");
+  CC1Args.push_back("-idirafter");
   CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath()));
   if (UsesRuntimeWrapper)
     CC1Args.append({"-include", "__clang_hip_runtime_wrapper.h"});
diff --git a/clang/lib/Driver/ToolChains/Ananas.cpp b/clang/lib/Driver/ToolChains/Ananas.cpp
index be1476a7636c..40f9e56b38e9 100644
--- a/clang/lib/Driver/ToolChains/Ananas.cpp
+++ b/clang/lib/Driver/ToolChains/Ananas.cpp
@@ -85,7 +85,8 @@ void ananas::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     assert(Output.isNothing() && "Invalid output.");
   }
 
-  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) {
+  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles,
+                   options::OPT_r)) {
     if (!Args.hasArg(options::OPT_shared)) {
       CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crt0.o")));
     }
@@ -111,12 +112,15 @@ void ananas::Linker::ConstructJob(Compilation &C, const JobAction &JA,
 
   AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA);
 
-  if (ToolChain.ShouldLinkCXXStdlib(Args))
-    ToolChain.AddCXXStdlibLibArgs(Args, CmdArgs);
-  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs))
+  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs,
+                   options::OPT_r)) {
+    if (ToolChain.ShouldLinkCXXStdlib(Args))
+      ToolChain.AddCXXStdlibLibArgs(Args, CmdArgs);
     CmdArgs.push_back("-lc");
+  }
 
-  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) {
+  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles,
+                   options::OPT_r)) {
     if (Args.hasArg(options::OPT_shared) || Args.hasArg(options::OPT_pie))
       CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtendS.o")));
     else
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index dfcef2304040..8f9244cae8db 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -661,6 +661,17 @@ void tools::addOpenMPRuntimeSpecificRPath(const ToolChain &TC,
   }
 }
 
+void tools::addOpenMPRuntimeLibraryPath(const ToolChain &TC,
+                                        const ArgList &Args,
+                                        ArgStringList &CmdArgs) {
+  // Default to clang lib / lib64 folder, i.e. the same location as device
+  // runtime.
+  SmallString<256> DefaultLibPath =
+      llvm::sys::path::parent_path(TC.getDriver().Dir);
+  llvm::sys::path::append(DefaultLibPath, Twine("lib") + CLANG_LIBDIR_SUFFIX);
+  CmdArgs.push_back(Args.MakeArgString("-L" + DefaultLibPath));
+}
+
 void tools::addArchSpecificRPath(const ToolChain &TC, const ArgList &Args,
                                  ArgStringList &CmdArgs) {
   // Enable -frtlib-add-rpath by default for the case of VE.
@@ -720,6 +731,7 @@ bool tools::addOpenMPRuntime(ArgStringList &CmdArgs, const ToolChain &TC,
 
   if (RTKind == Driver::OMPRT_OMP)
     addOpenMPRuntimeSpecificRPath(TC, Args, CmdArgs);
+  addOpenMPRuntimeLibraryPath(TC, Args, CmdArgs);
 
   return true;
 }
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h
index 23012dc247e4..2bba1ee285e6 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.h
+++ b/clang/lib/Driver/ToolChains/CommonArgs.h
@@ -111,6 +111,9 @@ void addOpenMPRuntimeSpecificRPath(const ToolChain &TC,
                                    llvm::opt::ArgStringList &CmdArgs);
 void addArchSpecificRPath(const ToolChain &TC, const llvm::opt::ArgList &Args,
                           llvm::opt::ArgStringList &CmdArgs);
+void addOpenMPRuntimeLibraryPath(const ToolChain &TC,
+                                 const llvm::opt::ArgList &Args,
+                                 llvm::opt::ArgStringList &CmdArgs);
 /// Returns true, if an OpenMP runtime has been added.
 bool addOpenMPRuntime(llvm::opt::ArgStringList &CmdArgs, const ToolChain &TC,
                       const llvm::opt::ArgList &Args,
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index f85c04df4f6c..83cb41159de7 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -301,18 +301,12 @@ Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
 
   Generic_GCC::AddMultiarchPaths(D, SysRoot, OSLibDir, Paths);
 
-  // Similar to the logic for GCC above, if we are currently running Clang
-  // inside of the requested system root, add its parent library path to those
-  // searched.
-  // FIXME: It's not clear whether we should use the driver's installed
-  // directory ('Dir' below) or the ResourceDir.
-  if (StringRef(D.Dir).startswith(SysRoot)) {
-    // Even if OSLibDir != "lib", this is needed for Clang in the build
-    // directory (not installed) to find libc++.
+  // The deprecated -DLLVM_ENABLE_PROJECTS=libcxx configuration installs
+  // libc++.so in D.Dir+"/../lib/". Detect this path.
+  // TODO Remove once LLVM_ENABLE_PROJECTS=libcxx is unsupported.
+  if (StringRef(D.Dir).startswith(SysRoot) &&
+      D.getVFS().exists(D.Dir + "/../lib/libc++.so"))
     addPathIfExists(D, D.Dir + "/../lib", Paths);
-    if (OSLibDir != "lib")
-      addPathIfExists(D, D.Dir + "/../" + OSLibDir, Paths);
-  }
 
   addPathIfExists(D, SysRoot + "/lib", Paths);
   addPathIfExists(D, SysRoot + "/usr/lib", Paths);
diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp
index 24f18b92dd66..cb3898575d3a 100644
--- a/clang/lib/Driver/ToolChains/Solaris.cpp
+++ b/clang/lib/Driver/ToolChains/Solaris.cpp
@@ -82,7 +82,8 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     assert(Output.isNothing() && "Invalid output.");
   }
 
-  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) {
+  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles,
+                   options::OPT_r)) {
     if (!Args.hasArg(options::OPT_shared))
       CmdArgs.push_back(
           Args.MakeArgString(getToolChain().GetFilePath("crt1.o")));
@@ -122,7 +123,8 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   bool NeedsSanitizerDeps = addSanitizerRuntimes(getToolChain(), Args, CmdArgs);
   AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);
 
-  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) {
+  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs,
+                   options::OPT_r)) {
     if (getToolChain().ShouldLinkCXXStdlib(Args))
       getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs);
     if (Args.hasArg(options::OPT_fstack_protector) ||
@@ -149,11 +151,13 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA,
       linkSanitizerRuntimeDeps(getToolChain(), CmdArgs);
   }
 
-  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) {
+  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles,
+                   options::OPT_r)) {
     CmdArgs.push_back(
         Args.MakeArgString(getToolChain().GetFilePath("crtend.o")));
+    CmdArgs.push_back(
+        Args.MakeArgString(getToolChain().GetFilePath("crtn.o")));
   }
-  CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath("crtn.o")));
 
   getToolChain().addProfileRTLibs(Args, CmdArgs);
 
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_s390x.S b/compiler-rt/lib/tsan/rtl/tsan_rtl_s390x.S
index fcff35fbc7e0..2f445e8f1b20 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl_s390x.S
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_s390x.S
@@ -45,3 +45,5 @@ intercept setjmp, _ZN14__interception11real_setjmpE
 intercept _setjmp, _ZN14__interception12real__setjmpE
 intercept sigsetjmp, _ZN14__interception14real_sigsetjmpE
 intercept __sigsetjmp, _ZN14__interception16real___sigsetjmpE
+
+NO_EXEC_STACK_DIRECTIVE
diff --git a/libcxx/include/__ranges/concepts.h b/libcxx/include/__ranges/concepts.h
index 5f1fa834d409..e16343591cda 100644
--- a/libcxx/include/__ranges/concepts.h
+++ b/libcxx/include/__ranges/concepts.h
@@ -68,8 +68,6 @@ namespace ranges {
   template <range _Rp>
   using range_rvalue_reference_t = iter_rvalue_reference_t<iterator_t<_Rp>>;
 
-#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES)
-
   // [range.sized]
   template <class _Tp>
   concept sized_range = range<_Tp> && requires(_Tp& __t) { ranges::size(__t); };
@@ -135,8 +133,6 @@ namespace ranges {
       (is_lvalue_reference_v<_Tp> ||
        (movable<remove_reference_t<_Tp>> && !__is_std_initializer_list<remove_cvref_t<_Tp>>))));
 
-#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES)
-
 } // namespace ranges
 
 #endif // !defined(_LIBCPP_HAS_NO_CONCEPTS)
diff --git a/libcxx/include/__ranges/data.h b/libcxx/include/__ranges/data.h
index f8d92cbc7520..f97ec8033297 100644
--- a/libcxx/include/__ranges/data.h
+++ b/libcxx/include/__ranges/data.h
@@ -24,7 +24,7 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-#if !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES)
+#if !defined(_LIBCPP_HAS_NO_CONCEPTS)
 
 // [range.prim.data]
 
@@ -99,7 +99,7 @@ inline namespace __cpo {
 } // namespace __cpo
 } // namespace ranges
 
-#endif // !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES)
+#endif // !defined(_LIBCPP_HAS_NO_CONCEPTS)
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__ranges/size.h b/libcxx/include/__ranges/size.h
index 2b71c03fb399..e1aaf7eba898 100644
--- a/libcxx/include/__ranges/size.h
+++ b/libcxx/include/__ranges/size.h
@@ -24,7 +24,7 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-#if !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES)
+#if !defined(_LIBCPP_HAS_NO_CONCEPTS)
 
 namespace ranges {
   template<class>
@@ -128,7 +128,7 @@ inline namespace __cpo {
 } // namespace __cpo
 } // namespace ranges
 
-#endif // !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES)
+#endif // !defined(_LIBCPP_HAS_NO_CONCEPTS)
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index add819ff5a80..193956fa9991 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -256,9 +256,11 @@ void AArch64::writePlt(uint8_t *buf, const Symbol &sym,
 bool AArch64::needsThunk(RelExpr expr, RelType type, const InputFile *file,
                          uint64_t branchAddr, const Symbol &s,
                          int64_t a) const {
-  // If s is an undefined weak symbol and does not have a PLT entry then it
-  // will be resolved as a branch to the next instruction.
-  if (s.isUndefWeak() && !s.isInPlt())
+  // If s is an undefined weak symbol and does not have a PLT entry then it will
+  // be resolved as a branch to the next instruction. If it is hidden, its
+  // binding has been converted to local, so we just check isUndefined() here. A
+  // undefined non-weak symbol will have been errored.
+  if (s.isUndefined() && !s.isInPlt())
     return false;
   // ELF for the ARM 64-bit architecture, section Call and Jump relocations
   // only permits range extension thunks for R_AARCH64_CALL26 and
diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp
index b7c2eb74757c..b6b32f0500a4 100644
--- a/lld/ELF/Arch/ARM.cpp
+++ b/lld/ELF/Arch/ARM.cpp
@@ -296,9 +296,11 @@ void ARM::addPltSymbols(InputSection &isec, uint64_t off) const {
 bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
                      uint64_t branchAddr, const Symbol &s,
                      int64_t a) const {
-  // If S is an undefined weak symbol and does not have a PLT entry then it
-  // will be resolved as a branch to the next instruction.
-  if (s.isUndefWeak() && !s.isInPlt())
+  // If s is an undefined weak symbol and does not have a PLT entry then it will
+  // be resolved as a branch to the next instruction. If it is hidden, its
+  // binding has been converted to local, so we just check isUndefined() here. A
+  // undefined non-weak symbol will have been errored.
+  if (s.isUndefined() && !s.isInPlt())
     return false;
   // A state change from ARM to Thumb and vice versa must go through an
   // interworking thunk if the relocation type is not R_ARM_CALL or
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index 4b047f75ad69..490254c995c8 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -733,11 +733,13 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
     if (expr == R_ARM_PCA)
       // Some PC relative ARM (Thumb) relocations align down the place.
       p = p & 0xfffffffc;
-    if (sym.isUndefWeak()) {
+    if (sym.isUndefined()) {
       // On ARM and AArch64 a branch to an undefined weak resolves to the next
       // instruction, otherwise the place. On RISCV, resolve an undefined weak
       // to the same instruction to cause an infinite loop (making the user
       // aware of the issue) while ensuring no overflow.
+      // Note: if the symbol is hidden, its binding has been converted to local,
+      // so we just check isUndefined() here.
       if (config->emachine == EM_ARM)
         dest = getARMUndefinedRelativeWeakVA(type, a, p);
       else if (config->emachine == EM_AARCH64)
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index ca9fdcde791f..cf2013a5f820 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -695,6 +695,7 @@ def plugin_opt_eq : J<"plugin-opt=">;
 def: F<"detect-odr-violations">;
 def: Flag<["-"], "g">;
 def: F<"long-plt">;
+def: FF<"no-add-needed">;
 def: F<"no-copy-dt-needed-entries">;
 def: F<"no-ctors-in-init-array">;
 def: F<"no-keep-memory">;
diff --git a/llvm/include/llvm/Support/AArch64TargetParser.def b/llvm/include/llvm/Support/AArch64TargetParser.def
index a953e9439db4..c45c149c6084 100644
--- a/llvm/include/llvm/Support/AArch64TargetParser.def
+++ b/llvm/include/llvm/Support/AArch64TargetParser.def
@@ -290,6 +290,8 @@ AARCH64_CPU_NAME("a64fx", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
                  (AArch64::AEK_FP16 | AArch64::AEK_SVE))
 AARCH64_CPU_NAME("carmel", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
                  AArch64::AEK_FP16)
+AARCH64_CPU_NAME("ampere1", ARMV8_6A, FK_CRYPTO_NEON_FP_ARMV8, false,
+                 (AArch64::AEK_FP16 | AArch64::AEK_SB | AArch64::AEK_SSBS))
 // Invalid CPU
 AARCH64_CPU_NAME("invalid", INVALID, FK_INVALID, true, AArch64::AEK_INVALID)
 #undef AARCH64_CPU_NAME
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 6d415c9c7f90..847df84afba6 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -3469,8 +3469,13 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
         return false;
       }
 
-      if (!finalizeBasicBlock(*BB, MBB))
+      if (!finalizeBasicBlock(*BB, MBB)) {
+        OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+                                   BB->getTerminator()->getDebugLoc(), BB);
+        R << "unable to translate basic block";
+        reportTranslationError(*MF, *TPC, *ORE, R);
         return false;
+      }
     }
 #ifndef NDEBUG
     WrapperObserver.removeObserver(&Verifier);
diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp
index 6fd2f7e7a718..e6d28141dd06 100644
--- a/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/llvm/lib/MC/ELFObjectWriter.cpp
@@ -549,9 +549,27 @@ void ELFWriter::writeSymbol(SymbolTableWriter &Writer, uint32_t StringIndex,
   uint64_t Size = 0;
 
   const MCExpr *ESize = MSD.Symbol->getSize();
-  if (!ESize && Base)
+  if (!ESize && Base) {
+    // For expressions like .set y, x+1, if y's size is unset, inherit from x.
     ESize = Base->getSize();
 
+    // For `.size x, 2; y = x; .size y, 1; z = y; z1 = z; .symver y, y@v1`, z,
+    // z1, and y@v1's st_size equals y's. However, `Base` is `x` which will give
+    // us 2. Follow the MCSymbolRefExpr assignment chain, which covers most
+    // needs. MCBinaryExpr is not handled.
+    const MCSymbolELF *Sym = &Symbol;
+    while (Sym->isVariable()) {
+      if (auto *Expr =
+              dyn_cast<MCSymbolRefExpr>(Sym->getVariableValue(false))) {
+        Sym = cast<MCSymbolELF>(&Expr->getSymbol());
+        if (!Sym->getSize())
+          continue;
+        ESize = Sym->getSize();
+      }
+      break;
+    }
+  }
+
   if (ESize) {
     int64_t Res;
     if (!ESize->evaluateKnownAbsolute(Res, Layout))
diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index f6003b783245..a82a4d451c8a 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -296,6 +296,12 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
     }
   }
 
+  if (Implementer == "0xc0") { // Ampere Computing
+    return StringSwitch<const char *>(Part)
+        .Case("0xac3", "ampere1")
+        .Default("generic");
+  }
+
   return "generic";
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 70c7b7b3f5dc..80e574b7b886 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -557,6 +557,7 @@ include "AArch64Schedule.td"
 include "AArch64InstrInfo.td"
 include "AArch64SchedPredicates.td"
 include "AArch64SchedPredExynos.td"
+include "AArch64SchedPredAmpere.td"
 include "AArch64Combine.td"
 
 def AArch64InstrInfo : InstrInfo;
@@ -626,6 +627,7 @@ include "AArch64SchedThunderX2T99.td"
 include "AArch64SchedA64FX.td"
 include "AArch64SchedThunderX3T110.td"
 include "AArch64SchedTSV110.td"
+include "AArch64SchedAmpere1.td"
 
 def TuneA35     : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
                                 "Cortex-A35 ARM processors">;
@@ -939,6 +941,16 @@ def TuneTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110",
                                   FeatureFuseAES,
                                   FeaturePostRAScheduler]>;
 
+def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
+                                   "Ampere Computing Ampere-1 processors", [
+                                   FeaturePostRAScheduler,
+                                   FeatureFuseAES,
+                                   FeatureLSLFast,
+                                   FeatureAggressiveFMA,
+                                   FeatureArithmeticBccFusion,
+                                   FeatureCmpBccFusion,
+                                   FeatureFuseAddress,
+                                   FeatureFuseLiterals]>;
 
 def ProcessorFeatures {
   list<SubtargetFeature> A53  = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
@@ -1047,6 +1059,8 @@ def ProcessorFeatures {
   list<SubtargetFeature> TSV110 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
                                    FeatureNEON, FeaturePerfMon, FeatureSPE,
                                    FeatureFullFP16, FeatureFP16FML, FeatureDotProd];
+  list<SubtargetFeature> Ampere1 = [HasV8_6aOps, FeatureNEON, FeaturePerfMon,
+                                    FeatureMTE, FeatureSSBS];
 
   // ETE and TRBE are future architecture extensions. We temporarily enable them
   // by default for users targeting generic AArch64. The extensions do not
@@ -1184,6 +1198,10 @@ def : ProcessorModel<"a64fx", A64FXModel, ProcessorFeatures.A64FX,
 def : ProcessorModel<"carmel", NoSchedModel, ProcessorFeatures.Carmel,
                      [TuneCarmel]>;
 
+// Ampere Computing
+def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1,
+                     [TuneAmpere1]>;
+
 //===----------------------------------------------------------------------===//
 // Assembly parser
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td
new file mode 100644
index 000000000000..32f7299fbf87
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td
@@ -0,0 +1,1136 @@
+//=- AArch64SchedAmpere1.td - Ampere-1 scheduling def -----*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the Ampere Computing Ampere-1 to
+// support instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+// The Ampere-1 core is an out-of-order micro-architecture.  The front
+// end has branch prediction, with a 10-cycle recovery time from a
+// mispredicted branch.  Instructions coming out of the front end are
+// decoded into internal micro-ops (uops).
+
+def Ampere1Model : SchedMachineModel {
+  let IssueWidth            =   4;  // 4-way decode and dispatch
+  let MicroOpBufferSize     = 174;  // micro-op re-order buffer size
+  let LoadLatency           =   4;  // Optimistic load latency
+  let MispredictPenalty     =  10;  // Branch mispredict penalty
+  let LoopMicroOpBufferSize =  32;  // Instruction queue size
+  let CompleteModel = 1;
+
+  list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+                                                    SMEUnsupported.F);
+}
+
+let SchedModel = Ampere1Model in {
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Ampere-1.
+// Ampere-1 has 12 pipelines that 8 independent scheduler (4 integer, 2 FP,
+// and 2 memory) issue into.  The integer and FP schedulers can each issue
+// one uop per cycle, while the memory schedulers can each issue one load
+// and one store address calculation per cycle.
+
+def Ampere1UnitA  : ProcResource<2>;  // integer single-cycle, branch, and flags r/w
+def Ampere1UnitB  : ProcResource<2>;  // integer single-cycle, and complex shifts
+def Ampere1UnitBS : ProcResource<1>;  // integer multi-cycle
+def Ampere1UnitL  : ProcResource<2>;  // load
+def Ampere1UnitS  : ProcResource<2>;  // store address calculation
+def Ampere1UnitX  : ProcResource<1>;  // FP and vector operations, and flag write
+def Ampere1UnitY  : ProcResource<1>;  // FP and vector operations, and crypto
+def Ampere1UnitZ  : ProcResource<1>;  // FP store data and FP-to-integer moves
+
+def Ampere1UnitAB : ProcResGroup<[Ampere1UnitA, Ampere1UnitB]>;
+def Ampere1UnitXY : ProcResGroup<[Ampere1UnitX, Ampere1UnitY]>;
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Ampere-1.
+
+def Ampere1Write_1cyc_1A : SchedWriteRes<[Ampere1UnitA]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_2A : SchedWriteRes<[Ampere1UnitA, Ampere1UnitA]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_1cyc_1B : SchedWriteRes<[Ampere1UnitB]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_1AB : SchedWriteRes<[Ampere1UnitAB]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_1S : SchedWriteRes<[Ampere1UnitS]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_2S : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1Y : SchedWriteRes<[Ampere1UnitY]> {
+  let Latency = 2;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_2cyc_2AB : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitAB]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1B_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitAB]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1B_1A : SchedWriteRes<[Ampere1UnitB, Ampere1UnitA]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1AB_1A : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitA]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1AB_2S : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS,
+                                                             Ampere1UnitS]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_2cyc_1AB_1S_1Z : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS,
+                                                                Ampere1UnitZ]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_2cyc_1B_1S : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 2;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_2cyc_1S_1Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_3cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_3cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_3cyc_1B_1S_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS,
+                                                               Ampere1UnitAB]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_3cyc_1S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ, Ampere1UnitZ]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_3cyc_2S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS,
+                                             Ampere1UnitZ, Ampere1UnitZ]> {
+  let Latency = 2;
+  let NumMicroOps = 4;
+}
+
+def Ampere1Write_4cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+  let Latency = 4;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
+  let Latency = 4;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_1X : SchedWriteRes<[Ampere1UnitX]> {
+  let Latency = 4;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_1Y : SchedWriteRes<[Ampere1UnitY]> {
+  let Latency = 4;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_1Z : SchedWriteRes<[Ampere1UnitZ]> {
+  let Latency = 4;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_4cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 4;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitS, Ampere1UnitZ]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_4cyc_3S_3Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, Ampere1UnitS,
+                                             Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> {
+  let Latency = 4;
+  let NumMicroOps = 6;
+}
+
+def Ampere1Write_5cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_5cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+  let Latency = 5;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_5cyc_1X : SchedWriteRes<[Ampere1UnitX]> {
+  let Latency = 5;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_5cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
+  let Latency = 5;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_5cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_5cyc_1L_1BS : SchedWriteRes<[Ampere1UnitL, Ampere1UnitBS]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_5cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 5;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_5cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_5cyc_4S_4Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS,
+                                             Ampere1UnitS, Ampere1UnitS,
+                                             Ampere1UnitZ, Ampere1UnitZ,
+                                             Ampere1UnitZ, Ampere1UnitZ]> {
+  let Latency = 5;
+  let NumMicroOps = 8;
+}
+
+def Ampere1Write_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+                                                 Ampere1UnitS, Ampere1UnitS,
+                                                 Ampere1UnitZ, Ampere1UnitZ]> {
+  let Latency = 5;
+  let NumMicroOps = 6;
+}
+
+def Ampere1Write_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+                                                 Ampere1UnitS, Ampere1UnitS,
+                                                 Ampere1UnitZ, Ampere1UnitZ]> {
+  let Latency = 6;
+  let NumMicroOps = 6;
+}
+
+def Ampere1Write_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY,
+                                                 Ampere1UnitS, Ampere1UnitS, Ampere1UnitS,
+                                                 Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> {
+  let Latency = 6;
+  let NumMicroOps = 9;
+}
+
+def Ampere1Write_6cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_6cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 6;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_6cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_6cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_6cyc_3L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_6cyc_4L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+                                          Ampere1UnitL, Ampere1UnitL]> {
+  let Latency = 6;
+  let NumMicroOps = 4;
+}
+
+def Ampere1Write_6cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_7cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+  let Latency = 7;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_7cyc_1BS_1XY : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitXY]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_7cyc_1L_1XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitXY]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_7cyc_2L_2XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+                                              Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 7;
+  let NumMicroOps = 4;
+}
+
+def Ampere1Write_7cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+                                                 Ampere1UnitXY, Ampere1UnitXY,
+                                                 Ampere1UnitS, Ampere1UnitS,
+                                                 Ampere1UnitS, Ampere1UnitS,
+                                                 Ampere1UnitZ, Ampere1UnitZ,
+                                                 Ampere1UnitZ, Ampere1UnitZ]> {
+  let Latency = 7;
+  let NumMicroOps = 12;
+}
+
+def Ampere1Write_8cyc_1BS_1A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_8cyc_1BS_2A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA,
+                                                             Ampere1UnitA]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_8cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_8cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+                                           Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+}
+
+def Ampere1Write_8cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
+                                              Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 8;
+  let NumMicroOps = 6;
+}
+
+def Ampere1Write_8cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+                                              Ampere1UnitL, Ampere1UnitL,
+                                              Ampere1UnitXY, Ampere1UnitXY,
+                                              Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 8;
+  let NumMicroOps = 8;
+}
+
+def Ampere1Write_9cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
+                                              Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 9;
+  let NumMicroOps = 6;
+}
+
+def Ampere1Write_9cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+                                              Ampere1UnitL, Ampere1UnitL,
+                                              Ampere1UnitXY, Ampere1UnitXY,
+                                              Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 9;
+  let NumMicroOps = 8;
+}
+
+def Ampere1Write_9cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 9;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_9cyc_2L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+                                              Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 9;
+  let NumMicroOps = 5;
+}
+
+def Ampere1Write_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+                                                 Ampere1UnitXY, Ampere1UnitXY,
+                                                 Ampere1UnitXY, Ampere1UnitXY,
+                                                 Ampere1UnitS, Ampere1UnitS,
+                                                 Ampere1UnitS, Ampere1UnitS,
+                                                 Ampere1UnitZ, Ampere1UnitZ,
+                                                 Ampere1UnitZ, Ampere1UnitZ]> {
+  let Latency = 9;
+  let NumMicroOps = 14;
+}
+
+def Ampere1Write_9cyc_8XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+                                                 Ampere1UnitXY, Ampere1UnitXY,
+                                                 Ampere1UnitXY, Ampere1UnitXY,
+                                                 Ampere1UnitXY, Ampere1UnitXY,
+                                                 Ampere1UnitS, Ampere1UnitS,
+                                                 Ampere1UnitS, Ampere1UnitS,
+                                                 Ampere1UnitZ, Ampere1UnitZ,
+                                                 Ampere1UnitZ, Ampere1UnitZ]> {
+  let Latency = 9;
+  let NumMicroOps = 16;
+}
+
+def Ampere1Write_10cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 10;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_10cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> {
+  let Latency = 10;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_10cyc_1X_1Z : SchedWriteRes<[Ampere1UnitX, Ampere1UnitZ]> {
+  let Latency = 10;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_10cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
+                                               Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 10;
+  let NumMicroOps = 6;
+}
+
+def Ampere1Write_10cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> {
+  let Latency = 10;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_10cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> {
+  let Latency = 10;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_11cyc_1BS_1L : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitL]> {
+  let Latency = 11;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_11cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> {
+  let Latency = 11;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_11cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> {
+  let Latency = 11;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_11cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+                                               Ampere1UnitL, Ampere1UnitL,
+                                               Ampere1UnitXY, Ampere1UnitXY,
+                                               Ampere1UnitXY, Ampere1UnitXY,
+                                               Ampere1UnitXY, Ampere1UnitXY,
+                                               Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 11;
+  let NumMicroOps = 12;
+}
+
+def Ampere1Write_12cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+                                               Ampere1UnitL, Ampere1UnitL,
+                                               Ampere1UnitXY, Ampere1UnitXY,
+                                               Ampere1UnitXY, Ampere1UnitXY,
+                                               Ampere1UnitXY, Ampere1UnitXY,
+                                               Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 12;
+  let NumMicroOps = 12;
+}
+
+def Ampere1Write_12cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 12;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_12cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+                                            Ampere1UnitXY, Ampere1UnitXY]> {
+  let Latency = 12;
+  let NumMicroOps = 4;
+}
+
+def Ampere1Write_18cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+  let Latency = 18;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_19cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 19;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_25cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 25;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_32cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 32;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_34cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+  let Latency = 34;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_34cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 34;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_39cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 39;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_62cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 62;
+  let NumMicroOps = 1;
+}
+
+// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4),
+// which are a single uop, and for extended registers, which have full flexibility
+// across Unit A or B for both uops.
+def Ampere1Write_Arith : SchedWriteVariant<[
+                                SchedVar<RegExtendedPred, [Ampere1Write_2cyc_2AB]>,
+                                SchedVar<AmpereCheapLSL,  [Ampere1Write_1cyc_1AB]>,
+                                SchedVar<NoSchedPred,     [Ampere1Write_2cyc_1B_1AB]>]>;
+
+def Ampere1Write_ArithFlagsetting : SchedWriteVariant<[
+                                SchedVar<RegExtendedPred, [Ampere1Write_2cyc_1AB_1A]>,
+                                SchedVar<AmpereCheapLSL,  [Ampere1Write_1cyc_1A]>,
+                                SchedVar<NoSchedPred,     [Ampere1Write_2cyc_1B_1A]>]>;
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latencies for Ampere-1.
+// This provides a coarse model, which is then specialised below.
+
+def : WriteRes<WriteImm,   [Ampere1UnitAB]>;  // MOVN, MOVZ
+def : WriteRes<WriteI,     [Ampere1UnitAB]>;  // ALU
+def : WriteRes<WriteISReg, [Ampere1UnitB, Ampere1UnitA]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}  // ALU of Shifted-Reg
+def : WriteRes<WriteIEReg, [Ampere1UnitAB, Ampere1UnitA]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}  // ALU of Extended-Reg
+def : WriteRes<WriteExtr,  [Ampere1UnitB]>;  // EXTR shifts a reg pair
+def : WriteRes<WriteIS,    [Ampere1UnitB]>;  // Shift/Scale
+def : WriteRes<WriteID32,  [Ampere1UnitBS]> {
+  let Latency = 18;
+}  // 32-bit Divide
+def : WriteRes<WriteID64,  [Ampere1UnitBS]> {
+  let Latency = 34;
+}  // 64-bit Divide
+def : WriteRes<WriteIM32,  [Ampere1UnitBS]> {
+  let Latency = 3;
+}  // 32-bit Multiply
+def : WriteRes<WriteIM64,  [Ampere1UnitBS]> {
+  let Latency = 3;
+}  // 32-bit Multiply
+def : WriteRes<WriteBr,    [Ampere1UnitA]>;
+def : WriteRes<WriteBrReg, [Ampere1UnitA, Ampere1UnitA]>;
+def : WriteRes<WriteLD,    [Ampere1UnitL]> {
+  let Latency = 4;
+}  // Load from base addr plus immediate offset
+def : WriteRes<WriteST,    [Ampere1UnitS]> {
+  let Latency = 1;
+}  // Store to base addr plus immediate offset
+def : WriteRes<WriteSTP,   [Ampere1UnitS, Ampere1UnitS]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}  // Store a register pair.
+def : WriteRes<WriteAdr,   [Ampere1UnitAB]>;
+def : WriteRes<WriteLDIdx, [Ampere1UnitAB, Ampere1UnitS]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}  // Load from a register index (maybe scaled).
+def : WriteRes<WriteSTIdx, [Ampere1UnitS, Ampere1UnitS]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}  // Store to a register index (maybe scaled).
+def : WriteRes<WriteF,  [Ampere1UnitXY]> {
+  let Latency = 2;
+}  // General floating-point ops.
+def : WriteRes<WriteFCmp,  [Ampere1UnitX]> {
+  let Latency = 5;
+}  // Floating-point compare.
+def : WriteRes<WriteFCvt,  [Ampere1UnitXY]> {
+  let Latency = 6;
+}  // Float conversion.
+def : WriteRes<WriteFCopy, [Ampere1UnitXY]> {
+}  // Float-int register copy.
+def : WriteRes<WriteFImm,  [Ampere1UnitXY]> {
+  let Latency = 2;
+}  // Float-int register copy.
+def : WriteRes<WriteFMul,  [Ampere1UnitXY]> {
+  let Latency = 5;
+}  // Floating-point multiply.
+def : WriteRes<WriteFDiv,  [Ampere1UnitXY]> {
+  let Latency = 34;
+}  // Floating-point division.
+def : WriteRes<WriteVd,    [Ampere1UnitXY]> {
+  let Latency = 3;
+}  // 64bit Vector D ops.
+def : WriteRes<WriteVq,    [Ampere1UnitXY]> {
+  let Latency = 3;
+}  // 128bit Vector Q ops.
+def : WriteRes<WriteVLD,   [Ampere1UnitL, Ampere1UnitL]> {
+  let Latency = 5;
+}  // Vector loads.
+def : WriteRes<WriteVST,   [Ampere1UnitS, Ampere1UnitZ]> {
+  let Latency = 2;
+}  // Vector stores.
+
+def : WriteRes<WriteAtomic,  []> { let Unsupported = 1; }
+
+def : WriteRes<WriteSys,     []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint,    []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi,    []> {
+  let Latency = 4;
+}  // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP
+
+// Forwarding logic.
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+def : ReadAdvance<ReadIMA,     1, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadST,      0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD,     0>;
+
+//===----------------------------------------------------------------------===//
+// Specialising the scheduling model further for Ampere-1.
+
+def : InstRW<[Ampere1Write_1cyc_1AB], (instrs COPY)>;
+
+// Branch instructions
+def : InstRW<[Ampere1Write_1cyc_1A], (instrs Bcc, BL, RET)>;
+def : InstRW<[Ampere1Write_1cyc_1A],
+        (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>;
+def : InstRW<[Ampere1Write_1cyc_2A], (instrs BLR)>;
+
+// Cryptography instructions
+// -- AES encryption/decryption
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AES[DE]")>;
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AESI?MC")>;
+// -- Polynomial multiplication
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^PMUL", "^PMULL")>;
+// -- SHA-256 hash
+def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA256(H|H2)")>;
+// -- SHA-256 schedule update
+def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA256SU[01]")>;
+// -- SHA-3 instructions
+def : InstRW<[Ampere1Write_2cyc_1XY],
+        (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>;
+// -- SHA-512 hash
+def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA512(H|H2)")>;
+// -- SHA-512 schedule update
+def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA512SU[01]")>;
+// -- SHA1 choose/majority/parity
+def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA1[CMP]")>;
+// -- SHA1 hash/schedule update
+def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1SU[01]")>;
+def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1H")>;
+
+// FP and vector load instructions
+// -- Load 1-element structure to one/all lanes
+// ---- all lanes
+def : InstRW<[Ampere1Write_7cyc_1L_1XY],
+        (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>;
+// ---- one lane
+def : InstRW<[Ampere1Write_7cyc_1L_1XY],
+        (instregex "^LD1i(8|16|32|64)")>;
+// -- Load 1-element structure to one/all lanes, 1D size
+def : InstRW<[Ampere1Write_5cyc_1L],
+        (instregex "^LD1Rv1d")>;
+// -- Load 1-element structures to 1 register
+def : InstRW<[Ampere1Write_5cyc_1L],
+        (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 1-element structures to 2 registers
+def : InstRW<[Ampere1Write_5cyc_2L],
+        (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 1-element structures to 3 registers
+def : InstRW<[Ampere1Write_6cyc_3L],
+        (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 1-element structures to 4 registers
+def : InstRW<[Ampere1Write_6cyc_4L],
+        (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 2-element structure to all lanes of 2 registers, 1D size
+def : InstRW<[Ampere1Write_5cyc_2L],
+        (instregex "^LD2Rv1d")>;
+// -- Load 2-element structure to all lanes of 2 registers, other sizes
+def : InstRW<[Ampere1Write_7cyc_2L_2XY],
+        (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>;
+// -- Load 2-element structure to one lane of 2 registers
+def : InstRW<[Ampere1Write_7cyc_2L_2XY],
+        (instregex "^LD2i(8|16|32|64)")>;
+// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size
+def : InstRW<[Ampere1Write_7cyc_2L_2XY],
+        (instregex "^LD2Twov(16b|8h|4s|2d)")>;
+// -- Load 2-element structures to 2 registers, 8B/4H/2S size
+def : InstRW<[Ampere1Write_9cyc_2L_3XY],
+        (instregex "^LD2Twov(8b|4h|2s)")>;
+// -- Load 3-element structure to all lanes of 3 registers, 1D size
+def : InstRW<[Ampere1Write_6cyc_3L],
+        (instregex "^LD3Rv1d")>;
+// -- Load 3-element structure to all lanes of 3 registers, other sizes
+def : InstRW<[Ampere1Write_8cyc_3L_3XY],
+        (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>;
+// -- Load 3-element structure to one lane of 3 registers
+def : InstRW<[Ampere1Write_8cyc_3L_3XY],
+        (instregex "^LD3i(8|16|32|64)")>;
+// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes
+def : InstRW<[Ampere1Write_9cyc_3L_3XY],
+        (instregex "^LD3Threev(16b|8h|4s)")>;
+// -- Load 3-element structures to 3 registers, 2D size
+def : InstRW<[Ampere1Write_8cyc_3L_3XY],
+        (instregex "^LD3Threev2d")>;
+// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes
+def : InstRW<[Ampere1Write_10cyc_3L_3XY],
+        (instregex "^LD3Threev(8b|4h|2s)")>;
+// -- Load 4-element structure to all lanes of 4 registers, 1D size
+def : InstRW<[Ampere1Write_6cyc_4L],
+        (instregex "^LD4Rv1d")>;
+// -- Load 4-element structure to all lanes of 4 registers, other sizes
+def : InstRW<[Ampere1Write_8cyc_4L_4XY],
+        (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>;
+// -- Load 4-element structure to one lane of 4 registers
+def : InstRW<[Ampere1Write_6cyc_4L],
+        (instregex "^LD4i(8|16|32|64)")>;
+// -- Load 4-element structures to 4 registers, 2D size
+def : InstRW<[Ampere1Write_9cyc_4L_4XY],
+        (instregex "^LD4Fourv2d")>;
+// -- Load 4-element structures to 4 registers, 2S size
+def : InstRW<[Ampere1Write_12cyc_4L_8XY],
+        (instregex "^LD4Fourv2s")>;
+// -- Load 4-element structures to 4 registers, other sizes
+def : InstRW<[Ampere1Write_11cyc_4L_8XY],
+        (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>;
+// -- Load pair, Q-form
+def : InstRW<[Ampere1Write_5cyc_2L], (instregex "LDN?PQ")>;
+// -- Load pair, S/D-form
+def : InstRW<[Ampere1Write_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>;
+// -- Load register
+def : InstRW<[Ampere1Write_5cyc_1L], (instregex "LDU?R[BHSDQ]i")>;
+// -- Load register, sign-extended register
+def : InstRW<[Ampere1Write_6cyc_1AB_1L], (instregex "LDR[BHSDQ]ro(W|X)")>;
+
+// FP and vector store instructions
+// -- Store 1-element structure from one lane of 1 register
+def : InstRW<[Ampere1Write_4cyc_1XY_1S_1Z],
+        (instregex "^ST1i(8|16|32|64)")>;
+// -- Store 1-element structures from 1 register
+def : InstRW<[Ampere1Write_2cyc_1S_1Z],
+        (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 1-element structures from 2 registers
+def : InstRW<[Ampere1Write_3cyc_2S_2Z],
+        (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 1-element structures from 3 registers
+def : InstRW<[Ampere1Write_4cyc_3S_3Z],
+        (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 1-element structures from 4 registers
+def : InstRW<[Ampere1Write_5cyc_4S_4Z],
+        (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 2-element structure from one lane of 2 registers
+def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z],
+        (instregex "^ST2i(8|16|32|64)")>;
+// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes
+def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z],
+        (instregex "^ST2Twov(16b|8h|4s|2d)")>;
+// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes
+def : InstRW<[Ampere1Write_6cyc_2XY_2S_2Z],
+        (instregex "^ST2Twov(8b|4h|2s)")>;
+// -- Store 3-element structure from one lane of 3 registers
+def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z],
+        (instregex "^ST3i(8|16|32|64)")>;
+// -- Store 3-element structures from 3 registers
+def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z],
+        (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 4-element structure from one lane of 4 registers
+def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z],
+        (instregex "^ST4i(8|16|32|64)")>;
+// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes
+def : InstRW<[Ampere1Write_9cyc_8XY_4S_4Z],
+        (instregex "^ST4Fourv(16b|8h|4s)")>;
+// -- Store 4-element structures from 4 registers, 2D sizes
+def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z],
+        (instregex "^ST4Fourv2d")>;
+// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes
+def : InstRW<[Ampere1Write_9cyc_6XY_4S_4Z],
+        (instregex "^ST4Fourv(8b|4h|2s)")>;
+// -- Store pair, Q-form
+def : InstRW<[Ampere1Write_3cyc_2S_2Z], (instregex "^STN?PQ")>;
+// -- Store pair, S/D-form
+def : InstRW<[Ampere1Write_3cyc_1S_2Z],	(instregex "^STN?P[SD]")>;
+// -- Store register
+def : InstRW<[Ampere1Write_2cyc_1S_1Z],	(instregex "^STU?R[BHSDQ](ui|i)")>;
+// -- Store register, sign-extended register offset
+def : InstRW<[Ampere1Write_2cyc_1AB_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>;
+
+// FP data processing, bfloat16 format
+def : InstRW<[Ampere1Write_5cyc_1XY], (instrs BFCVT)>;
+def : InstRW<[Ampere1Write_7cyc_2XY], (instrs BFCVTN, BFCVTN2)>;
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>;
+def : InstRW<[Ampere1Write_4cyc_2XY], (instrs BFMMLA)>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^BFMLAL")>;
+
+// FP data processing, scalar/vector, half precision
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY],
+        (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY],
+        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY],
+        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>;
+def : InstRW<[Ampere1Write_4cyc_1X],
+        (instregex "^FCMPE?H")>;
+def : InstRW<[Ampere1Write_10cyc_1A_1BS_1X],
+        (instregex "^FCCMPE?H")>;
+def : InstRW<[Ampere1Write_10cyc_1A_1BS_1XY],
+        (instregex "^FCSELH")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^[SU]CVTFv.[fi]16")>;
+def : InstRW<[Ampere1Write_25cyc_1XY], (instregex "^FDIVv.[if]16", "FDIVH")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>;
+def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>;
+def : InstRW<[Ampere1Write_12cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FMULX?v.[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instrs FMULX16)>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRECPXv.[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>;
+def : InstRW<[Ampere1Write_39cyc_1XY], (instregex "^FSQRTv.f16", "^FSQRTHr")>;
+
+// FP data processing, scalar/vector, single/double precision
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1XY],
+        (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1XY],
+        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1XY],
+        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1X],
+        (instregex "^FCMPE?(S|D)")>;
+def : InstRW<[Ampere1Write_11cyc_1A_1BS_1X],
+        (instregex "^FCCMPE?(S|D)")>;
+def : InstRW<[Ampere1Write_11cyc_1A_1BS_1XY],
+        (instregex "^FCSEL(S|D)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^[SU]CVTFv.[fi](32|64)")>;
+def : InstRW<[Ampere1Write_34cyc_1XY], (instregex "^FDIVv.[if](64)", "FDIVD")>;
+def : InstRW<[Ampere1Write_19cyc_1XY], (instregex "^FDIVv.[if](32)", "FDIVS")>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_10cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instrs FMULX32, FMULX64)>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT(32|64)")>;
+def : InstRW<[Ampere1Write_62cyc_1XY], (instregex "^FSQRTv.f64", "^FSQRTDr")>;
+def : InstRW<[Ampere1Write_32cyc_1XY], (instregex "^FSQRTv.f32", "^FSQRTSr")>;
+
+// FP miscellaneous instructions
+def : InstRW<[Ampere1Write_10cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FCVT[HSD]Hr")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[HSD][SD]r")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVTLv")>;
+def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^FCVT(N|XN)v")>;
+def : InstRW<[Ampere1Write_10cyc_1X_1Z], (instrs FJCVTZS)>;
+def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>;
+def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>;
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>;
+def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "^FMOVXDHighr")>;
+def : InstRW<[Ampere1Write_4cyc_1Z], (instregex "^FMOV[WX][HSD]r")>;
+
+// Integer arithmetic and logical instructions
+def : InstRW<[Ampere1Write_1cyc_1A],
+        (instregex "ADC(W|X)r", "SBC(W|X)r")>;
+def : InstRW<[Ampere1Write_Arith],
+        (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r")>;
+def : InstRW<[Ampere1Write_ArithFlagsetting],
+        (instregex "(ADD|AND|BIC|SUB)S(W|X)r")>;
+def : InstRW<[Ampere1Write_1cyc_1A],
+        (instregex "(ADC|SBC)S(W|X)r")>;
+def : InstRW<[Ampere1Write_1cyc_1A], (instrs RMIF)>;
+def : InstRW<[Ampere1Write_1cyc_1A],
+        (instregex "(CCMN|CCMP)(X|W)")>;
+def : InstRW<[Ampere1Write_1cyc_1A],
+        (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>;
+def : InstRW<[Ampere1Write_18cyc_1BS], (instrs SDIVWr, UDIVWr)>;
+def : InstRW<[Ampere1Write_34cyc_1BS], (instrs SDIVXr, UDIVXr)>;
+def : InstRW<[Ampere1Write_3cyc_1BS],
+        (instregex "(S|U)MULHr")>;
+def : InstRW<[Ampere1Write_4cyc_1BS],
+        (instregex "(S|U)?M(ADD|SUB)L?r")>;
+
+// Integer load instructions
+def : InstRW<[Ampere1Write_4cyc_2L],
+        (instregex "(LDNP|LDP|LDPSW)(X|W)")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+        (instregex "LDR(B|D|H|Q|S)ui")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+        (instregex "LDR(D|Q|W|X)l")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+        (instregex "LDTR(B|H|W|X)i")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+        (instregex "LDTRS(BW|BX|HW|HX|W)i")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+        (instregex "LDUR(BB|HH|X|W)i")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+        (instregex "LDURS(BW|BX|HW|HX|W)i")>;
+def : InstRW<[Ampere1Write_5cyc_1AB_1L],
+        (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>;
+def : InstRW<[Ampere1Write_1cyc_1L],
+        (instrs PRFMl, PRFUMi, PRFUMi)>;
+def : InstRW<[Ampere1Write_2cyc_1AB_1L],
+        (instrs PRFMroW, PRFMroX)>;
+
+// Integer miscellaneous instructions
+def : InstRW<[Ampere1Write_1cyc_1A],  (instrs ADR, ADRP)>;
+def : InstRW<[Ampere1Write_1cyc_1B],  (instregex "EXTR(W|X)")>;
+def : InstRW<[Ampere1Write_1cyc_1B],  (instregex "(S|U)?BFM(W|X)")>;
+def : InstRW<[Ampere1Write_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>;
+def : InstRW<[Ampere1Write_1cyc_1B],  (instregex "CLS(W|X)")>;
+def : InstRW<[Ampere1Write_1cyc_1A],  (instrs SETF8, SETF16)>;
+def : InstRW<[Ampere1Write_1cyc_1AB],
+        (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
+def : InstRW<[Ampere1Write_1cyc_1B],
+        (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>;
+def : InstRW<[Ampere1Write_1cyc_1B],
+        (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>;
+
+// Integer store instructions
+def : InstRW<[Ampere1Write_1cyc_2S],  (instregex "STNP(X|W)i")>;
+def : InstRW<[Ampere1Write_2cyc_1B_1S],
+        (instrs STPWi, STPXi)>;
+def : InstRW<[Ampere1Write_3cyc_1B_1S_1AB],
+        (instregex "STP(W|X)(pre|post)")>;
+def : InstRW<[Ampere1Write_1cyc_1S],
+        (instrs STTRBi, STTRHi, STTRWi, STTRXi)>;
+def : InstRW<[Ampere1Write_1cyc_1S],
+        (instregex "STUR(BB|HH|X|W)i",
+                   "STR(X|W)ui",
+                   "STUR(BB|HH|X|W)i")>;
+def : InstRW<[Ampere1Write_1cyc_2S], (instrs STRWroX, STRXroX)>;
+def : InstRW<[Ampere1Write_2cyc_1AB_2S], (instrs STRWroW, STRXroW)>;
+
+// Pointer authentication
+//def : InstRW<[Ampere1Write_7cyc_1BS],
+//	(instrs AUTIAZ, AUTIBZ, AUTIASP, AUTIBSP, AUTIA1716, AUTIB1716)>;
+def : InstRW<[Ampere1Write_8cyc_1BS_1A],
+        (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>;
+def : InstRW<[Ampere1Write_8cyc_1BS_2A],
+        (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>;
+//def : InstRW<[Ampere1Write_7cyc_1BS],
+//	(instrs PACIAZ, PACIBZ, PACIASP, PACIBSP, PACIA1716, PACIB1716)>;
+def : InstRW<[Ampere1Write_11cyc_1BS_1L], (instregex "^LDRA(A|B)")>;
+def : InstRW<[Ampere1Write_7cyc_1BS], (instrs XPACD, XPACI)>;
+
+// Vector integer instructions
+// -- absolute difference
+def : InstRW<[Ampere1Write_3cyc_1XY],
+             (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv",
+                        "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>;
+// -- arithmetic
+def : InstRW<[Ampere1Write_3cyc_1XY],
+        (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD",
+                   "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW",
+                   "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>;
+// -- arithmetic, horizontal, 16B
+def : InstRW<[Ampere1Write_12cyc_4XY],
+            (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>;
+def : InstRW<[Ampere1Write_12cyc_4XY],
+            (instregex "^[SU](MIN|MAX)Vv16i8v")>;
+// -- arithmetic, horizontal, 4H/4S
+def : InstRW<[Ampere1Write_6cyc_2XY],
+            (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>;
+def : InstRW<[Ampere1Write_6cyc_2XY],
+            (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>;
+// -- arithmetic, horizontal, 8B/8H
+def : InstRW<[Ampere1Write_9cyc_3XY],
+            (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>;
+def : InstRW<[Ampere1Write_9cyc_3XY],
+            (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>;
+// -- arithmetic, narrowing
+def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(ADD|SUB)HNv.*")>;
+def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>;
+// -- arithmetic, pairwise
+def : InstRW<[Ampere1Write_3cyc_1XY],
+        (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>;
+// -- arithmetic, saturating
+def : InstRW<[Ampere1Write_3cyc_1XY],
+        (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>;
+// -- bit count
+def : InstRW<[Ampere1Write_2cyc_1XY],
+        (instregex "^(CLS|CLZ|CNT)v")>;
+// -- compare
+def : InstRW<[Ampere1Write_3cyc_1XY],
+        (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv",
+                   "^CMHIv", "^CMHSv")>;
+// -- compare non-zero
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^CMTSTv")>;
+// -- dot product
+def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>;
+// -- fp reciprocal estimate
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPEv", "^FRSQRTEv")>;
+// -- integer reciprocal estimate
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>;
+// -- logical
+def : InstRW<[Ampere1Write_2cyc_1XY],
+        (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>;
+// -- logical, narrowing
+def : InstRW<[Ampere1Write_5cyc_2XY],
+        (instregex "RSHRNv",
+                   "SHRNv", "SQSHRNv", "SQSHRUNv",
+                   "UQXTNv")>;
+// -- matrix multiply
+def : InstRW<[Ampere1Write_6cyc_2XY],
+        (instrs SMMLA, UMMLA, USMMLA)>;
+// -- max/min
+def : InstRW<[Ampere1Write_3cyc_1XY],
+        (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>;
+def : InstRW<[Ampere1Write_3cyc_1XY],
+        (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>;
+// -- move immediate
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>;
+// -- multiply
+def : InstRW<[Ampere1Write_3cyc_1XY],
+        (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>;
+// -- multiply accumulate
+def : InstRW<[Ampere1Write_3cyc_1XY],
+        (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>;
+// -- negation, saturating
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>;
+// -- reverse bits/bytes
+def : InstRW<[Ampere1Write_2cyc_1XY],
+        (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>;
+// -- shift
+def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+// -- shift and accumulate
+def : InstRW<[Ampere1Write_3cyc_1XY],
+        (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>;
+// -- shift, saturating
+def : InstRW<[Ampere1Write_3cyc_1XY],
+        (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU",
+                   "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL",
+                   "^UQSHL")>;
+
+// Vector miscellaneous instructions
+// -- duplicate element
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^DUPv.+lane")>;
+// -- duplicate from GPR
+def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^DUPv.+gpr")>;
+// -- extract narrow
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^XTNv")>;
+// -- insert/extract element
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>;
+// -- move FP immediate
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOVv")>;
+// -- move element to GPR
+def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "(S|U)MOVv")>;
+// -- move from GPR to any element
+def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>;
+// -- table lookup
+def : InstRW<[Ampere1Write_2cyc_1XY],
+            (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>;
+def : InstRW<[Ampere1Write_4cyc_2XY],
+            (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>;
+def : InstRW<[Ampere1Write_6cyc_3XY],
+            (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>;
+def : InstRW<[Ampere1Write_8cyc_4XY],
+            (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>;
+// -- transpose
+def : InstRW<[Ampere1Write_2cyc_1XY],
+              (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>;
+// -- zip/unzip
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>;
+
+} // SchedModel = Ampere1Model
diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td
new file mode 100644
index 000000000000..8552c07bda56
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td
@@ -0,0 +1,25 @@
+//===- AArch64SchedPredAmpere.td - AArch64 Sched Preds -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines scheduling predicate definitions that are used by the
+// AArch64 Ampere Computing processors.
+//
+//===----------------------------------------------------------------------===//
+
+// Auxiliary predicates.
+
+// Check for a LSL shift <= 4
+def AmpereCheapLSL : MCSchedPredicate<
+                                CheckAny<[CheckShiftBy0,
+                                 CheckAll<
+                                   [CheckShiftLSL,
+                                    CheckAny<
+                                      [CheckShiftBy1,
+                                       CheckShiftBy2,
+                                       CheckShiftBy3,
+                                       CheckShiftBy4]>]>]>>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td
index fc13b23b4cf8..19a3780c7381 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td
@@ -53,7 +53,7 @@ let FunctionMapper = "AArch64_AM::getShiftType" in {
 }
 
 // Check for shifting in arithmetic and logic instructions.
-foreach I = {0-3, 8} in {
+foreach I = {0-4, 8} in {
   let FunctionMapper = "AArch64_AM::getShiftValue" in
   def CheckShiftBy#I        : CheckImmOperand<3, I>;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 8a7e20237271..9f2753584db8 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -221,6 +221,12 @@ void AArch64Subtarget::initializeProperties() {
     // FIXME: remove this to enable 64-bit SLP if performance looks good.
     MinVectorRegisterBitWidth = 128;
     break;
+  case Ampere1:
+    CacheLineSize = 64;
+    PrefFunctionLogAlignment = 6;
+    PrefLoopLogAlignment = 6;
+    MaxInterleaveFactor = 4;
+    break;
   }
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 061db926ee2b..1c14e0a55049 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -40,6 +40,7 @@ public:
   enum ARMProcFamilyEnum : uint8_t {
     Others,
     A64FX,
+    Ampere1,
     AppleA7,
     AppleA10,
     AppleA11,
diff --git a/llvm/lib/Target/AVR/AVRCallingConv.td b/llvm/lib/Target/AVR/AVRCallingConv.td
index b4bc35e191c0..0fae61fb55c5 100644
--- a/llvm/lib/Target/AVR/AVRCallingConv.td
+++ b/llvm/lib/Target/AVR/AVRCallingConv.td
@@ -27,6 +27,8 @@ def RetCC_AVR_BUILTIN : CallingConv<[
 
 // Calling convention for variadic functions.
 def ArgCC_AVR_Vararg : CallingConv<[
+  // i8 are always passed through the stack with a byte slot and byte alignment.
+  CCIfType<[i8], CCAssignToStack<1, 1>>,
   // i16 are always passed through the stack with an alignment of 1.
   CCAssignToStack<2, 1>
 ]>;
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td
index 2b96dc0b833a..7e027369f096 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -194,6 +194,11 @@ def brtarget_13 : Operand<OtherVT> {
   let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_13_pcrel>";
 }
 
+def rcalltarget_13 : Operand<i16> {
+  let PrintMethod = "printPCRelImm";
+  let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_13_pcrel>";
+}
+
 // The target of a 22 or 16-bit call/jmp instruction.
 def call_target : Operand<iPTR> {
   let EncoderMethod = "encodeCallTarget";
@@ -965,10 +970,8 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1 in {
 let isCall = 1 in {
   // SP is marked as a use to prevent stack-pointer assignments that appear
   // immediately before calls from potentially appearing dead.
-  let Uses = [SP] in def RCALLk : FBRk<1, (outs),
-                                       (ins brtarget_13
-                                        : $target),
-                                       "rcall\t$target", []>;
+  let Uses = [SP] in def RCALLk : FBRk<1, (outs), (ins rcalltarget_13:$k),
+                                       "rcall\t$k", [(AVRcall imm:$k)]>;
 
   // SP is marked as a use to prevent stack-pointer assignments that appear
   // immediately before calls from potentially appearing dead.
@@ -985,13 +988,10 @@ let isCall = 1 in {
   // SP is marked as a use to prevent stack-pointer assignments that appear
   // immediately before calls from potentially appearing dead.
   //
-  //: TODO: the imm field can be either 16 or 22 bits in devices with more
+  // TODO: the imm field can be either 16 or 22 bits in devices with more
   // than 64k of ROM, fix it once we support the largest devices.
-  let Uses = [SP] in def CALLk : F32BRk<0b111, (outs),
-                                        (ins call_target
-                                         : $k),
-                                        "call\t$k", [(AVRcall imm
-                                                      : $k)]>,
+  let Uses = [SP] in def CALLk : F32BRk<0b111, (outs), (ins call_target:$k),
+                                        "call\t$k", [(AVRcall imm:$k)]>,
       Requires<[HasJMPCALL]>;
 }
 
@@ -2457,8 +2457,12 @@ def : Pat<(adde i8
                      : $src2))>;
 
 // Calls.
-def : Pat<(AVRcall(i16 tglobaladdr : $dst)), (CALLk tglobaladdr : $dst)>;
-def : Pat<(AVRcall(i16 texternalsym : $dst)), (CALLk texternalsym : $dst)>;
+let Predicates = [HasJMPCALL] in {
+  def : Pat<(AVRcall(i16 tglobaladdr:$dst)), (CALLk tglobaladdr:$dst)>;
+  def : Pat<(AVRcall(i16 texternalsym:$dst)), (CALLk texternalsym:$dst)>;
+}
+def : Pat<(AVRcall(i16 tglobaladdr:$dst)), (RCALLk tglobaladdr:$dst)>;
+def : Pat<(AVRcall(i16 texternalsym:$dst)), (RCALLk texternalsym:$dst)>;
 
 // `anyext`
 def : Pat<(i16(anyext i8
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index c83796b8579b..9eb546d1b5dc 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -37,6 +37,8 @@ class SystemZMCCodeEmitter : public MCCodeEmitter {
   const MCInstrInfo &MCII;
   MCContext &Ctx;
 
+  mutable unsigned MemOpsEmitted;
+
 public:
   SystemZMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
     : MCII(mcii), Ctx(ctx) {
@@ -165,6 +167,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
   verifyInstructionPredicates(MI,
                               computeAvailableFeatures(STI.getFeatureBits()));
 
+  MemOpsEmitted = 0;
   uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
   unsigned Size = MCII.get(MI.getOpcode()).getSize();
   // Big-endian insertion of Size bytes.
@@ -191,12 +194,14 @@ getDispOpValue(const MCInst &MI, unsigned OpNum,
                SmallVectorImpl<MCFixup> &Fixups,
                SystemZ::FixupKind Kind) const {
   const MCOperand &MO = MI.getOperand(OpNum);
-  if (MO.isImm())
+  if (MO.isImm()) {
+    ++MemOpsEmitted;
     return static_cast<uint64_t>(MO.getImm());
+  }
   if (MO.isExpr()) {
     // All instructions follow the pattern where the first displacement has a
     // 2 bytes offset, and the second one 4 bytes.
-    unsigned ByteOffs = Fixups.size() == 0 ? 2 : 4;
+    unsigned ByteOffs = MemOpsEmitted++ == 0 ? 2 : 4;
     Fixups.push_back(MCFixup::create(ByteOffs, MO.getExpr(), (MCFixupKind)Kind,
                                      MI.getLoc()));
     assert(Fixups.size() <= 2 && "More than two memory operands in MI?");
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index f10651d5c5d7..d795697b3ed6 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -6442,22 +6442,26 @@ SDValue SystemZTargetLowering::combineINT_TO_FP(
     SDNode *N, DAGCombinerInfo &DCI) const {
   if (DCI.Level != BeforeLegalizeTypes)
     return SDValue();
+  SelectionDAG &DAG = DCI.DAG;
+  LLVMContext &Ctx = *DAG.getContext();
   unsigned Opcode = N->getOpcode();
   EVT OutVT = N->getValueType(0);
-  SelectionDAG &DAG = DCI.DAG;
+  Type *OutLLVMTy = OutVT.getTypeForEVT(Ctx);
   SDValue Op = N->getOperand(0);
-  unsigned OutScalarBits = OutVT.getScalarSizeInBits();
+  unsigned OutScalarBits = OutLLVMTy->getScalarSizeInBits();
   unsigned InScalarBits = Op->getValueType(0).getScalarSizeInBits();
 
   // Insert an extension before type-legalization to avoid scalarization, e.g.:
   // v2f64 = uint_to_fp v2i16
   // =>
   // v2f64 = uint_to_fp (v2i64 zero_extend v2i16)
-  if (OutVT.isVector() && OutScalarBits > InScalarBits) {
-    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(OutVT.getScalarSizeInBits()),
-                                 OutVT.getVectorNumElements());
+  if (OutLLVMTy->isVectorTy() && OutScalarBits > InScalarBits &&
+      OutScalarBits <= 64) {
+    unsigned NumElts = cast<FixedVectorType>(OutLLVMTy)->getNumElements();
+    EVT ExtVT = EVT::getVectorVT(
+        Ctx, EVT::getIntegerVT(Ctx, OutLLVMTy->getScalarSizeInBits()), NumElts);
     unsigned ExtOpcode =
-      (Opcode == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
+        (Opcode == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
     SDValue ExtOp = DAG.getNode(ExtOpcode, SDLoc(N), ExtVT, Op);
     return DAG.getNode(Opcode, SDLoc(N), OutVT, ExtOp);
   }
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 682932b8f3e6..8bb7e81e19bb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37558,7 +37558,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
          (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
         (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
         isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
-      if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
+      // Bail if this was already a truncation or PACK node.
+      // We sometimes fail to match PACK if we demand known undef elements.
+      if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
+                         Root.getOpcode() == X86ISD::PACKSS ||
+                         Root.getOpcode() == X86ISD::PACKUS))
         return SDValue(); // Nothing to do!
       ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
       ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 2aab79e89078..7eaa28bd1320 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2488,8 +2488,12 @@ Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *C, Value *B,
     // not create unnecessary casts if the types already match.
     Type *SelTy = A->getType();
     if (auto *VecTy = dyn_cast<VectorType>(Cond->getType())) {
+      // For a fixed or scalable vector get N from <{vscale x} N x iM>
       unsigned Elts = VecTy->getElementCount().getKnownMinValue();
-      Type *EltTy = Builder.getIntNTy(SelTy->getPrimitiveSizeInBits() / Elts);
+      // For a fixed or scalable vector, get the size in bits of N x iM; for a
+      // scalar this is just M.
+      unsigned SelEltSize = SelTy->getPrimitiveSizeInBits().getKnownMinSize();
+      Type *EltTy = Builder.getIntNTy(SelEltSize / Elts);
       SelTy = VectorType::get(EltTy, VecTy->getElementCount());
     }
     Value *BitcastC = Builder.CreateBitCast(C, SelTy);
diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp
index c34da51e6dc1..fa1cfc84e4fd 100644
--- a/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -342,7 +342,8 @@ static void findReturnsToZap(Function &F,
 }
 
 static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB,
-                                   DomTreeUpdater &DTU) {
+                                   DomTreeUpdater &DTU,
+                                   BasicBlock *&NewUnreachableBB) {
   SmallPtrSet<BasicBlock *, 8> FeasibleSuccessors;
   bool HasNonFeasibleEdges = false;
   for (BasicBlock *Succ : successors(BB)) {
@@ -385,6 +386,23 @@ static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB,
   } else if (FeasibleSuccessors.size() > 1) {
     SwitchInstProfUpdateWrapper SI(*cast<SwitchInst>(TI));
     SmallVector<DominatorTree::UpdateType, 8> Updates;
+
+    // If the default destination is unfeasible it will never be taken. Replace
+    // it with a new block with a single Unreachable instruction.
+    BasicBlock *DefaultDest = SI->getDefaultDest();
+    if (!FeasibleSuccessors.contains(DefaultDest)) {
+      if (!NewUnreachableBB) {
+        NewUnreachableBB =
+            BasicBlock::Create(DefaultDest->getContext(), "default.unreachable",
+                               DefaultDest->getParent(), DefaultDest);
+        new UnreachableInst(DefaultDest->getContext(), NewUnreachableBB);
+      }
+
+      SI->setDefaultDest(NewUnreachableBB);
+      Updates.push_back({DominatorTree::Delete, BB, DefaultDest});
+      Updates.push_back({DominatorTree::Insert, BB, NewUnreachableBB});
+    }
+
     for (auto CI = SI->case_begin(); CI != SI->case_end();) {
       if (FeasibleSuccessors.contains(CI->getCaseSuccessor())) {
         ++CI;
@@ -532,8 +550,9 @@ bool llvm::runIPSCCP(
       NumInstRemoved += changeToUnreachable(F.front().getFirstNonPHI(),
                                             /*PreserveLCSSA=*/false, &DTU);
 
+    BasicBlock *NewUnreachableBB = nullptr;
     for (BasicBlock &BB : F)
-      MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU);
+      MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU, NewUnreachableBB);
 
     for (BasicBlock *DeadBB : BlocksToErase)
       DTU.deleteBB(DeadBB);