44 files changed, 1307 insertions, 1683 deletions
diff --git a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S
index 84339f73cf23..34b5789240da 100644
--- a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S
+++ b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S
@@ -1,8 +1,8 @@
 /*
  * __mtag_tag_region - tag memory
  *
- * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -15,7 +15,7 @@
  * The memory region may remain untagged if tagging is not enabled.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #if __ARM_FEATURE_MEMORY_TAGGING
 
@@ -27,9 +27,6 @@
 #define zva_val	x4
 
 ENTRY (__mtag_tag_region)
-	PTR_ARG (0)
-	SIZE_ARG (1)
-
 	add	dstend, dstin, count
 
 	cmp	count, 96
diff --git a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S
index f58364ca6fcb..2fa248e25621 100644
--- a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S
+++ b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S
@@ -1,8 +1,8 @@
 /*
  * __mtag_tag_zero_region - tag memory and fill it with zero bytes
  *
- * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -15,7 +15,7 @@
  * The memory region may remain untagged if tagging is not enabled.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #if __ARM_FEATURE_MEMORY_TAGGING
 
@@ -27,9 +27,6 @@
 #define zva_val	x4
 
 ENTRY (__mtag_tag_zero_region)
-	PTR_ARG (0)
-	SIZE_ARG (1)
-
 	add	dstend, dstin, count
 
 	cmp	count, 96
diff --git a/contrib/arm-optimized-routines/string/aarch64/asmdefs.h b/contrib/arm-optimized-routines/string/aarch64/asmdefs.h
new file mode 100644
index 000000000000..90166676977a
--- /dev/null
+++ b/contrib/arm-optimized-routines/string/aarch64/asmdefs.h
@@ -0,0 +1,69 @@
+/*
+ * Macros for asm code.  AArch64 version.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+/* Branch Target Identitication support.  */
+#define BTI_C		hint	34
+#define BTI_J		hint	36
+/* Return address signing support (pac-ret).  */
+#define PACIASP		hint	25; .cfi_window_save
+#define AUTIASP		hint	29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h.  */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note.  */
+#define GNU_PROPERTY(type, value)	\
+  .section .note.gnu.property, "a";	\
+  .p2align 3;				\
+  .word 4;				\
+  .word 16;				\
+  .word 5;				\
+  .asciz "GNU";				\
+  .word type;				\
+  .word 4;				\
+  .word value;				\
+  .word 0;				\
+  .text
+
+/* If set then the GNU Property Note section will be added to
+   mark objects to support BTI and PAC-RET.  */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files.  */
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment)	\
+  .global name;		\
+  .type name,%function;	\
+  .align alignment;		\
+  name:			\
+  .cfi_startproc;	\
+  BTI_C;
+
+#define ENTRY(name)	ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name)	\
+  .global name;		\
+  .type name,%function;	\
+  name:
+
+#define END(name)	\
+  .cfi_endproc;		\
+  .size name, .-name;
+
+#define L(l) .L ## l
+
+#endif
diff --git a/contrib/arm-optimized-routines/string/aarch64/check-arch.S b/contrib/arm-optimized-routines/string/aarch64/check-arch.S
index 5a54242d7de6..131b7fa36ec2 100644
--- a/contrib/arm-optimized-routines/string/aarch64/check-arch.S
+++ b/contrib/arm-optimized-routines/string/aarch64/check-arch.S
@@ -1,8 +1,8 @@
 /*
  * check ARCH setting.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #if !__aarch64__
@@ -10,4 +10,4 @@
 #endif
 
 /* Include for GNU property notes.  */
-#include "../asmdefs.h"
+#include "asmdefs.h"
diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/memchr-sve.S
index c22e6596f19b..b314551f3e0f 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/experimental/memchr-sve.S
@@ -1,13 +1,14 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
+
+.arch armv8-a+sve
 
-#if __ARM_FEATURE_SVE
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -15,8 +16,6 @@
  */
 
 ENTRY (__memchr_aarch64_sve)
-	PTR_ARG (0)
-	SIZE_ARG (2)
 	dup	z1.b, w1			/* duplicate c to a vector */
 	setffr					/* initialize FFR */
 	mov	x3, 0				/* initialize off */
@@ -59,6 +58,3 @@ ENTRY (__memchr_aarch64_sve)
 	ret
 
 END (__memchr_aarch64_sve)
-
-#endif
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/memcmp-sve.S
index 78c5ecaa4cdc..ad3534836d04 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/experimental/memcmp-sve.S
@@ -1,13 +1,14 @@
 /*
  * memcmp - compare memory
  *
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
+
+.arch armv8-a+sve
 
-#if __ARM_FEATURE_SVE
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -15,9 +16,6 @@
  */
 
 ENTRY (__memcmp_aarch64_sve)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
 	mov	x3, 0			/* initialize off */
 
 0:	whilelo	p0.b, x3, x2		/* while off < max */
@@ -46,6 +44,3 @@ ENTRY (__memcmp_aarch64_sve)
 	ret
 
 END (__memcmp_aarch64_sve)
-
-#endif
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/stpcpy-sve.S
index 82dd9717b0a0..5d3f14b86026 100644
--- a/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/experimental/stpcpy-sve.S
@@ -2,7 +2,7 @@
  * stpcpy - copy a string returning pointer to end.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #define BUILD_STPCPY 1
diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strchr-sve.S
index 13ba9f44f9c5..7d74ae9ff232 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strchr-sve.S
@@ -1,13 +1,14 @@
 /*
  * strchr/strchrnul - find a character in a string
  *
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
+
+.arch armv8-a+sve
 
-#if __ARM_FEATURE_SVE
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -22,7 +23,6 @@
 #endif
 
 ENTRY (FUNC)
-	PTR_ARG (0)
 	dup	z1.b, w1		/* replicate byte across vector */
 	setffr				/* initialize FFR */
 	ptrue	p1.b			/* all ones; loop invariant */
@@ -65,6 +65,3 @@ ENTRY (FUNC)
 	b	0b
 
 END (FUNC)
-
-#endif
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strchrnul-sve.S
index 428ff1a3d008..0005f9177514 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strchrnul-sve.S
@@ -2,7 +2,7 @@
  * strchrnul - find a character or nul in a string
  *
  * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #define BUILD_STRCHRNUL
diff --git a/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strcmp-sve.S
index e6d2da5411ca..b6c249588534 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strcmp-sve.S
@@ -1,13 +1,14 @@
 /*
  * __strcmp_aarch64_sve - compare two strings
  *
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
+
+.arch armv8-a+sve
 
-#if __ARM_FEATURE_SVE
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -15,8 +16,6 @@
  */
 
 ENTRY (__strcmp_aarch64_sve)
-	PTR_ARG (0)
-	PTR_ARG (1)
 	setffr				/* initialize FFR */
 	ptrue	p1.b, all		/* all ones; loop invariant */
 	mov	x2, 0			/* initialize offset */
@@ -54,6 +53,3 @@ ENTRY (__strcmp_aarch64_sve)
 	b	1b
 
 END (__strcmp_aarch64_sve)
-
-#endif
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strcpy-sve.S
index f515462e09ae..57b77c8a00e7 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strcpy-sve.S
@@ -1,13 +1,14 @@
 /*
  * strcpy/stpcpy - copy a string returning pointer to start/end.
  *
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
+
+.arch armv8-a+sve
 
-#if __ARM_FEATURE_SVE
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -22,8 +23,6 @@
 #endif
 
 ENTRY (FUNC)
-	PTR_ARG (0)
-	PTR_ARG (1)
 	setffr				/* initialize FFR */
 	ptrue	p2.b, all		/* all ones; loop invariant */
 	mov	x2, 0			/* initialize offset */
@@ -66,6 +65,3 @@ ENTRY (FUNC)
 	ret
 
 END (FUNC)
-
-#endif
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strlen-sve.S
index 2392493f1a3c..c83155052c07 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strlen-sve.S
@@ -1,13 +1,14 @@
 /*
  * __strlen_aarch64_sve - compute the length of a string
  *
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
+
+.arch armv8-a+sve
 
-#if __ARM_FEATURE_SVE
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -15,7 +16,6 @@
  */
 
 ENTRY (__strlen_aarch64_sve)
-	PTR_ARG (0)
 	setffr			/* initialize FFR */
 	ptrue	p2.b		/* all ones; loop invariant */
 	mov	x1, 0		/* initialize length */
@@ -50,6 +50,3 @@ ENTRY (__strlen_aarch64_sve)
 	b	0b
 
 END (__strlen_aarch64_sve)
-
-#endif
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strncmp-sve.S
index 234190e245b0..a281e642d8aa 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strncmp-sve.S
@@ -1,13 +1,14 @@
 /*
  * strncmp - compare two strings with limit
  *
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
+
+.arch armv8-a+sve
 
-#if __ARM_FEATURE_SVE
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -15,9 +16,6 @@
  */
 
 ENTRY (__strncmp_aarch64_sve)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
 	setffr				/* initialize FFR */
 	mov	x3, 0			/* initialize off */
 
@@ -64,6 +62,3 @@ ENTRY (__strncmp_aarch64_sve)
 	ret
 
 END (__strncmp_aarch64_sve)
-
-#endif
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strnlen-sve.S
index 5b9ebf7763bc..11d835a1b13c 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strnlen-sve.S
@@ -1,13 +1,14 @@
 /*
  * strnlen - calculate the length of a string with limit.
  *
- * Copyright (c) 2019-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
+
+.arch armv8-a+sve
 
-#if __ARM_FEATURE_SVE
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -15,8 +16,6 @@
  */
 
 ENTRY (__strnlen_aarch64_sve)
-	PTR_ARG (0)
-	SIZE_ARG (1)
 	setffr				/* initialize FFR */
 	mov	x2, 0			/* initialize len */
 	b	1f
@@ -69,6 +68,3 @@ ENTRY (__strnlen_aarch64_sve)
 	ret
 
 END (__strnlen_aarch64_sve)
-
-#endif
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strrchr-sve.S
index d36d69af37fd..731edaddf156 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strrchr-sve.S
@@ -1,13 +1,14 @@
 /*
  * strrchr - find the last of a character in a string
  *
- * Copyright (c) 2019-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
+
+.arch armv8-a+sve
 
-#if __ARM_FEATURE_SVE
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -15,7 +16,6 @@
  */
 
 ENTRY (__strrchr_aarch64_sve)
-	PTR_ARG (0)
 	dup	z1.b, w1		/* replicate byte across vector */
 	setffr				/* initialize FFR */
 	ptrue	p1.b			/* all ones; loop invariant */
@@ -79,6 +79,3 @@ ENTRY (__strrchr_aarch64_sve)
 	ret
 
 END (__strrchr_aarch64_sve)
-
-#endif
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S
index c2e967d1004e..68bd0af9a8c5 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S
@@ -1,8 +1,8 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin		x0
 #define chrin		w1
@@ -23,82 +23,74 @@
 #define synd		x5
 #define shift		x6
 #define	tmp		x7
-#define wtmp		w7
 
 #define vrepchr		v0
 #define qdata		q1
 #define vdata		v1
 #define vhas_chr	v2
-#define vrepmask	v3
-#define vend		v4
-#define dend		d4
+#define vend		v3
+#define dend		d3
 
 /*
    Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__memchr_aarch64_mte)
-	PTR_ARG (0)
-	SIZE_ARG (2)
 	bic	src, srcin, 15
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
 
 	rbit	synd, synd
 	clz	synd, synd
-	add	result, srcin, synd, lsr 2
 	cmp	cntin, synd, lsr 2
+	add	result, srcin, synd, lsr 2
 	csel	result, result, xzr, hi
 	ret
 
+	.p2align 3
 L(start_loop):
 	sub	tmp, src, srcin
-	add	tmp, tmp, 16
+	add	tmp, tmp, 17
 	subs	cntrem, cntin, tmp
-	b.ls	L(nomatch)
+	b.lo	L(nomatch)
 
 	/* Make sure that it won't overread by a 16-byte chunk */
-	add	tmp, cntrem, 15
-	tbnz	tmp, 4, L(loop32_2)
-
+	tbz	cntrem, 4, L(loop32_2)
+	sub	src, src, 16
 	.p2align 4
 L(loop32):
-	ldr	qdata, [src, 16]!
+	ldr	qdata, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
 	fmov	synd, dend
 	cbnz	synd, L(end)
 
 L(loop32_2):
-	ldr	qdata, [src, 16]!
-	subs	cntrem, cntrem, 32
+	ldr	qdata, [src, 16]
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	b.ls	L(end)
+	subs	cntrem, cntrem, 32
+	b.lo	L(end_2)
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(loop32)
+L(end_2):
+	add	src, src, 16
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
+	sub	cntrem, src, srcin
 	fmov	synd, dend
-	add	tmp, srcin, cntin
-	sub	cntrem, tmp, src
+	sub	cntrem, cntin, cntrem
 #ifndef __AARCH64EB__
 	rbit	synd, synd
 #endif
diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr.S b/contrib/arm-optimized-routines/string/aarch64/memchr.S
index 353f0d1eac53..d12a38abbc30 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memchr.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memchr.S
@@ -1,8 +1,8 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * Neon Available.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 /* Arguments and results.  */
 #define srcin		x0
@@ -47,8 +47,6 @@
  */
 
 ENTRY (__memchr_aarch64)
-	PTR_ARG (0)
-	SIZE_ARG (2)
 	/* Do not dereference srcin if no bytes to compare.  */
 	cbz	cntin, L(zero_length)
 	/*
diff --git a/contrib/arm-optimized-routines/string/aarch64/memcmp.S b/contrib/arm-optimized-routines/string/aarch64/memcmp.S
index 3b1026642eee..43439de4db69 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memcmp.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memcmp.S
@@ -1,103 +1,80 @@
 /* memcmp - compare memory
  *
- * Copyright (c) 2013-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2013-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
-/* Parameters and result.  */
-#define src1		x0
-#define src2		x1
-#define limit		x2
-#define result		w0
+#define src1	x0
+#define src2	x1
+#define limit	x2
+#define result	w0
 
-/* Internal variables.  */
-#define data1		x3
-#define data1w		w3
-#define data1h		x4
-#define data2		x5
-#define data2w		w5
-#define data2h		x6
-#define tmp1		x7
-#define tmp2		x8
+#define data1	x3
+#define data1w	w3
+#define data2	x4
+#define data2w	w4
+#define data3	x5
+#define data3w	w5
+#define data4	x6
+#define data4w	w6
+#define tmp	x6
+#define src1end	x7
+#define src2end	x8
 
-ENTRY (__memcmp_aarch64)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
-	subs	limit, limit, 8
-	b.lo	L(less8)
-
-	ldr	data1, [src1], 8
-	ldr	data2, [src2], 8
-	cmp	data1, data2
-	b.ne	L(return)
-
-	subs	limit, limit, 8
-	b.gt	L(more16)
-
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
-	b	L(return)
 
-L(more16):
-	ldr	data1, [src1], 8
-	ldr	data2, [src2], 8
-	cmp	data1, data2
-	bne	L(return)
-
-	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
-	   strings.  */
-	subs	limit, limit, 16
+ENTRY (__memcmp_aarch64)
+	cmp	limit, 16
+	b.lo	L(less16)
+	ldp	data1, data3, [src1]
+	ldp	data2, data4, [src2]
+	ccmp	data1, data2, 0, ne
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	cmp	limit, 32
 	b.ls	L(last_bytes)
+	cmp	limit, 160
+	b.hs	L(loop_align)
+	sub	limit, limit, 32
 
-	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
-	   try to align, so limit it only to strings larger than 128 bytes.  */
-	cmp	limit, 96
-	b.ls	L(loop16)
-
-	/* Align src1 and adjust src2 with bytes not yet done.  */
-	and	tmp1, src1, 15
-	add	limit, limit, tmp1
-	sub	src1, src1, tmp1
-	sub	src2, src2, tmp1
-
-	/* Loop performing 16 bytes per iteration using aligned src1.
-	   Limit is pre-decremented by 16 and must be larger than zero.
-	   Exit if <= 16 bytes left to do or if the data is not equal.  */
 	.p2align 4
-L(loop16):
-	ldp	data1, data1h, [src1], 16
-	ldp	data2, data2h, [src2], 16
-	subs	limit, limit, 16
-	ccmp	data1, data2, 0, hi
-	ccmp	data1h, data2h, 0, eq
-	b.eq	L(loop16)
-
+L(loop32):
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
 	cmp	data1, data2
-	bne	L(return)
-	mov	data1, data1h
-	mov	data2, data2h
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+	cmp	limit, 16
+	b.ls	L(last_bytes)
+
+	ldp	data1, data3, [src1, 32]
+	ldp	data2, data4, [src2, 32]
 	cmp	data1, data2
-	bne	L(return)
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+	add	src1, src1, 32
+	add	src2, src2, 32
+L(last64):
+	subs	limit, limit, 32
+	b.hi	L(loop32)
 
 	/* Compare last 1-16 bytes using unaligned access.  */
 L(last_bytes):
-	add	src1, src1, limit
-	add	src2, src2, limit
-	ldp	data1, data1h, [src1]
-	ldp	data2, data2h, [src2]
-	cmp     data1, data2
-	bne	L(return)
-	mov	data1, data1h
-	mov	data2, data2h
+	ldp	data1, data3, [src1end, -16]
+	ldp	data2, data4, [src2end, -16]
+L(return2):
 	cmp	data1, data2
+	csel	data1, data1, data3, ne
+	csel	data2, data2, data4, ne
 
 	/* Compare data bytes and set return value to 0, -1 or 1.  */
 L(return):
@@ -105,33 +82,105 @@ L(return):
 	rev	data1, data1
 	rev	data2, data2
 #endif
-	cmp     data1, data2
-L(ret_eq):
+	cmp	data1, data2
 	cset	result, ne
 	cneg	result, result, lo
 	ret
 
 	.p2align 4
-	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+L(less16):
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	tbz	limit, 3, L(less8)
+	ldr	data1, [src1]
+	ldr	data2, [src2]
+	ldr	data3, [src1end, -8]
+	ldr	data4, [src2end, -8]
+	b	L(return2)
+
+	.p2align 4
 L(less8):
-	adds	limit, limit, 4
-	b.lo	L(less4)
-	ldr	data1w, [src1], 4
-	ldr	data2w, [src2], 4
+	tbz	limit, 2, L(less4)
+	ldr	data1w, [src1]
+	ldr	data2w, [src2]
+	ldr	data3w, [src1end, -4]
+	ldr	data4w, [src2end, -4]
+	b	L(return2)
+
+L(less4):
+	tbz	limit, 1, L(less2)
+	ldrh	data1w, [src1]
+	ldrh	data2w, [src2]
 	cmp	data1w, data2w
 	b.ne	L(return)
-	sub	limit, limit, 4
-L(less4):
-	adds	limit, limit, 4
-	beq	L(ret_eq)
-L(byte_loop):
-	ldrb	data1w, [src1], 1
-	ldrb	data2w, [src2], 1
-	subs	limit, limit, 1
-	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
-	b.eq	L(byte_loop)
+L(less2):
+	mov	result, 0
+	tbz	limit, 0, L(return_zero)
+	ldrb	data1w, [src1end, -1]
+	ldrb	data2w, [src2end, -1]
 	sub	result, data1w, data2w
+L(return_zero):
 	ret
 
-END (__memcmp_aarch64)
+L(loop_align):
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
+	cmp	data1, data2
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+
+	/* Align src2 and adjust src1, src2 and limit.  */
+	and	tmp, src2, 15
+	sub	tmp, tmp, 16
+	sub	src2, src2, tmp
+	add	limit, limit, tmp
+	sub	src1, src1, tmp
+	sub	limit, limit, 64 + 16
+
+	.p2align 4
+L(loop64):
+	ldr	q0, [src1, 16]
+	ldr	q1, [src2, 16]
+	subs	limit, limit, 64
+	ldr	q2, [src1, 32]
+	ldr	q3, [src2, 32]
+	eor	v0.16b, v0.16b, v1.16b
+	eor	v1.16b, v2.16b, v3.16b
+	ldr	q2, [src1, 48]
+	ldr	q3, [src2, 48]
+	umaxp	v0.16b, v0.16b, v1.16b
+	ldr	q4, [src1, 64]!
+	ldr	q5, [src2, 64]!
+	eor	v1.16b, v2.16b, v3.16b
+	eor	v2.16b, v4.16b, v5.16b
+	umaxp	v1.16b, v1.16b, v2.16b
+	umaxp	v0.16b, v0.16b, v1.16b
+	umaxp	v0.16b, v0.16b, v0.16b
+	fmov	tmp, d0
+	ccmp	tmp, 0, 0, hi
+	b.eq	L(loop64)
+
+	/* If equal, process last 1-64 bytes using scalar loop.  */
+	add	limit, limit, 64 + 16
+	cbz	tmp, L(last64)
+
+	/* Determine the 8-byte aligned offset of the first difference.  */
+#ifdef __AARCH64EB__
+	rev16	tmp, tmp
+#endif
+	rev	tmp, tmp
+	clz	tmp, tmp
+	bic	tmp, tmp, 7
+	sub	tmp, tmp, 48
+	ldr	data1, [src1, tmp]
+	ldr	data2, [src2, tmp]
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	mov	result, 1
+	cmp	data1, data2
+	cneg	result, result, lo
+	ret
 
+END (__memcmp_aarch64)
diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
index f97f2c3047b9..cbf4c581500e 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
@@ -1,8 +1,8 @@
 /*
  * memcpy - copy memory area
  *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  *
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define dstin	x0
 #define src	x1
@@ -52,15 +52,13 @@
 
 ENTRY_ALIAS (__memmove_aarch64_simd)
 ENTRY (__memcpy_aarch64_simd)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
 	add	srcend, src, count
-	add	dstend, dstin, count
 	cmp	count, 128
 	b.hi	L(copy_long)
+	add	dstend, dstin, count
 	cmp	count, 32
 	b.hi	L(copy32_128)
+	nop
 
 	/* Small copies: 0..32 bytes.  */
 	cmp	count, 16
@@ -71,6 +69,18 @@ ENTRY (__memcpy_aarch64_simd)
 	str	B_q, [dstend, -16]
 	ret
 
+	.p2align 4
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	ldp	A_q, B_q, [src]
+	ldp	C_q, D_q, [srcend, -32]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_q, B_q, [dstin]
+	stp	C_q, D_q, [dstend, -32]
+	ret
+
+	.p2align 4
 	/* Copy 8-15 bytes.  */
 L(copy16):
 	tbz	count, 3, L(copy8)
@@ -80,7 +90,6 @@ L(copy16):
 	str	A_h, [dstend, -8]
 	ret
 
-	.p2align 3
 	/* Copy 4-7 bytes.  */
 L(copy8):
 	tbz	count, 2, L(copy4)
@@ -90,31 +99,6 @@ L(copy8):
 	str	B_lw, [dstend, -4]
 	ret
 
-	/* Copy 0..3 bytes using a branchless sequence.  */
-L(copy4):
-	cbz	count, L(copy0)
-	lsr	tmp1, count, 1
-	ldrb	A_lw, [src]
-	ldrb	C_lw, [srcend, -1]
-	ldrb	B_lw, [src, tmp1]
-	strb	A_lw, [dstin]
-	strb	B_lw, [dstin, tmp1]
-	strb	C_lw, [dstend, -1]
-L(copy0):
-	ret
-
-	.p2align 4
-	/* Medium copies: 33..128 bytes.  */
-L(copy32_128):
-	ldp	A_q, B_q, [src]
-	ldp	C_q, D_q, [srcend, -32]
-	cmp	count, 64
-	b.hi	L(copy128)
-	stp	A_q, B_q, [dstin]
-	stp	C_q, D_q, [dstend, -32]
-	ret
-
-	.p2align 4
 	/* Copy 65..128 bytes.  */
 L(copy128):
 	ldp	E_q, F_q, [src, 32]
@@ -128,8 +112,24 @@ L(copy96):
 	stp	C_q, D_q, [dstend, -32]
 	ret
 
+	/* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+	cbz	count, L(copy0)
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	C_lw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	C_lw, [dstend, -1]
+L(copy0):
+	ret
+
+	.p2align 3
 	/* Copy more than 128 bytes.  */
 L(copy_long):
+	add	dstend, dstin, count
+
 	/* Use backwards copy if there is an overlap.  */
 	sub	tmp1, dstin, src
 	cmp	tmp1, count
@@ -166,6 +166,9 @@ L(copy64_from_end):
 	stp	A_q, B_q, [dstend, -32]
 	ret
 
+	.p2align 4
+	nop
+
 	/* Large backwards copy for overlapping copies.
 	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
 L(copy_long_backwards):
diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S
new file mode 100644
index 000000000000..03ae95570c04
--- /dev/null
+++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S
@@ -0,0 +1,17 @@
+/*
+ * memcpy using MOPS extension.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "asmdefs.h"
+
+ENTRY (__memcpy_aarch64_mops)
+	mov	x3, x0
+	.inst	0x19010443	/* cpyfp   [x3]!, [x1]!, x2!  */
+	.inst	0x19410443	/* cpyfm   [x3]!, [x1]!, x2!  */
+	.inst	0x19810443	/* cpyfe   [x3]!, [x1]!, x2!  */
+	ret
+
+END (__memcpy_aarch64_mops)
diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S
new file mode 100644
index 000000000000..9b05cb2a58ee
--- /dev/null
+++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S
@@ -0,0 +1,169 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ *
+ */
+
+#include "asmdefs.h"
+
+.arch armv8-a+sve
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define tmp1	x6
+#define vlen	x6
+
+#define A_q	q0
+#define B_q	q1
+#define C_q	q2
+#define D_q	q3
+#define E_q	q4
+#define F_q	q5
+#define G_q	q6
+#define H_q	q7
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+   SVE vectors are used to speedup small copies.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The source pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (__memmove_aarch64_sve)
+ENTRY (__memcpy_aarch64_sve)
+	cmp	count, 128
+	b.hi	L(copy_long)
+	cntb	vlen
+	cmp	count, vlen, lsl 1
+	b.hi	L(copy32_128)
+
+	whilelo p0.b, xzr, count
+	whilelo p1.b, vlen, count
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p1/z, [src, 1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p1, [dstin, 1, mul vl]
+	ret
+
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	add	srcend, src, count
+	add	dstend, dstin, count
+	ldp	A_q, B_q, [src]
+	ldp	C_q, D_q, [srcend, -32]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_q, B_q, [dstin]
+	stp	C_q, D_q, [dstend, -32]
+	ret
+
+	/* Copy 65..128 bytes.  */
+L(copy128):
+	ldp	E_q, F_q, [src, 32]
+	cmp	count, 96
+	b.ls	L(copy96)
+	ldp	G_q, H_q, [srcend, -64]
+	stp	G_q, H_q, [dstend, -64]
+L(copy96):
+	stp	A_q, B_q, [dstin]
+	stp	E_q, F_q, [dstin, 32]
+	stp	C_q, D_q, [dstend, -32]
+	ret
+
+	/* Copy more than 128 bytes.  */
+L(copy_long):
+	add	srcend, src, count
+	add	dstend, dstin, count
+
+	/* Use backwards copy if there is an overlap.  */
+	sub	tmp1, dstin, src
+	cmp	tmp1, count
+	b.lo	L(copy_long_backwards)
+
+	/* Copy 16 bytes and then align src to 16-byte alignment.  */
+	ldr	D_q, [src]
+	and	tmp1, src, 15
+	bic	src, src, 15
+	sub	dst, dstin, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_q, B_q, [src, 16]
+	str	D_q, [dstin]
+	ldp	C_q, D_q, [src, 48]
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(copy64_from_end)
+L(loop64):
+	stp	A_q, B_q, [dst, 16]
+	ldp	A_q, B_q, [src, 80]
+	stp	C_q, D_q, [dst, 48]
+	ldp	C_q, D_q, [src, 112]
+	add	src, src, 64
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	L(loop64)
+
+	/* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+	ldp	E_q, F_q, [srcend, -64]
+	stp	A_q, B_q, [dst, 16]
+	ldp	A_q, B_q, [srcend, -32]
+	stp	C_q, D_q, [dst, 48]
+	stp	E_q, F_q, [dstend, -64]
+	stp	A_q, B_q, [dstend, -32]
+	ret
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
+L(copy_long_backwards):
+	cbz	tmp1, L(return)
+	ldr	D_q, [srcend, -16]
+	and	tmp1, srcend, 15
+	bic	srcend, srcend, 15
+	sub	count, count, tmp1
+	ldp	A_q, B_q, [srcend, -32]
+	str	D_q, [dstend, -16]
+	ldp	C_q, D_q, [srcend, -64]
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	L(copy64_from_start)
+
+L(loop64_backwards):
+	str	B_q, [dstend, -16]
+	str	A_q, [dstend, -32]
+	ldp	A_q, B_q, [srcend, -96]
+	str	D_q, [dstend, -48]
+	str	C_q, [dstend, -64]!
+	ldp	C_q, D_q, [srcend, -128]
+	sub	srcend, srcend, 64
+	subs	count, count, 64
+	b.hi	L(loop64_backwards)
+
+	/* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+	ldp	E_q, F_q, [src, 32]
+	stp	A_q, B_q, [dstend, -32]
+	ldp	A_q, B_q, [src]
+	stp	C_q, D_q, [dstend, -64]
+	stp	E_q, F_q, [dstin, 32]
+	stp	A_q, B_q, [dstin]
+L(return):
+	ret
+
+END (__memcpy_aarch64_sve)
diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy.S b/contrib/arm-optimized-routines/string/aarch64/memcpy.S
index dd254f6f9929..351f1a11f097 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memcpy.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memcpy.S
@@ -1,8 +1,8 @@
 /*
  * memcpy - copy memory area
  *
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  *
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define dstin	x0
 #define src	x1
@@ -55,9 +55,6 @@
 
 ENTRY_ALIAS (__memmove_aarch64)
 ENTRY (__memcpy_aarch64)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
 	add	srcend, src, count
 	add	dstend, dstin, count
 	cmp	count, 128
diff --git a/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S b/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S
new file mode 100644
index 000000000000..d9839f86e9b4
--- /dev/null
+++ b/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S
@@ -0,0 +1,17 @@
+/*
+ * memmove using MOPS extension.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "asmdefs.h"
+
+ENTRY (__memmove_aarch64_mops)
+	mov	x3, x0
+	.inst	0x1d010443	/* cpyp    [x3]!, [x1]!, x2!  */
+	.inst	0x1d410443	/* cpym    [x3]!, [x1]!, x2!  */
+	.inst	0x1d810443	/* cpye    [x3]!, [x1]!, x2!  */
+	ret
+
+END (__memmove_aarch64_mops)
diff --git a/contrib/arm-optimized-routines/string/aarch64/memrchr.S b/contrib/arm-optimized-routines/string/aarch64/memrchr.S
index 7b4be847cecb..ed38478a6faa 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memrchr.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memrchr.S
@@ -1,8 +1,8 @@
 /*
  * memrchr - find last character in a memory zone.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin		x0
 #define chrin		w1
@@ -23,7 +23,6 @@
 #define synd		x5
 #define shift		x6
 #define	tmp		x7
-#define wtmp		w7
 #define end		x8
 #define endm1		x9
 
@@ -31,34 +30,27 @@
 #define qdata		q1
 #define vdata		v1
 #define vhas_chr	v2
-#define vrepmask	v3
-#define vend		v4
-#define dend		d4
+#define vend		v3
+#define dend		d3
 
 /*
    Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__memrchr_aarch64)
-	PTR_ARG (0)
 	add	end, srcin, cntin
 	sub	endm1, end, 1
 	bic	src, endm1, 15
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	neg	shift, end, lsl 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsl	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -69,34 +61,36 @@ ENTRY (__memrchr_aarch64)
 	csel	result, result, xzr, hi
 	ret
 
+	nop
 L(start_loop):
-	sub	tmp, end, src
-	subs	cntrem, cntin, tmp
+	subs	cntrem, src, srcin
 	b.ls	L(nomatch)
 
 	/* Make sure that it won't overread by a 16-byte chunk */
-	add	tmp, cntrem, 15
-	tbnz	tmp, 4, L(loop32_2)
+	sub	cntrem, cntrem, 1
+	tbz	cntrem, 4, L(loop32_2)
+	add	src, src, 16
 
-	.p2align 4
+	.p2align 5
 L(loop32):
-	ldr	qdata, [src, -16]!
+	ldr	qdata, [src, -32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
 	fmov	synd, dend
 	cbnz	synd, L(end)
 
 L(loop32_2):
-	ldr	qdata, [src, -16]!
+	ldr	qdata, [src, -16]
 	subs	cntrem, cntrem, 32
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	b.ls	L(end)
+	b.lo	L(end_2)
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(loop32)
+L(end_2):
+	sub	src, src, 16
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 
 	add	tmp, src, 15
diff --git a/contrib/arm-optimized-routines/string/aarch64/memset-mops.S b/contrib/arm-optimized-routines/string/aarch64/memset-mops.S
new file mode 100644
index 000000000000..00d8e7d2c05f
--- /dev/null
+++ b/contrib/arm-optimized-routines/string/aarch64/memset-mops.S
@@ -0,0 +1,17 @@
+/*
+ * memset using MOPS extension.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "asmdefs.h"
+
+ENTRY (__memset_aarch64_mops)
+	mov     x3, x0
+	.inst   0x19c10443	/* setp    [x3]!, x2!, x1  */
+	.inst   0x19c14443	/* setm    [x3]!, x2!, x1  */
+	.inst   0x19c18443	/* sete    [x3]!, x2!, x1  */
+	ret
+
+END (__memset_aarch64_mops)
diff --git a/contrib/arm-optimized-routines/string/aarch64/memset-sve.S b/contrib/arm-optimized-routines/string/aarch64/memset-sve.S
new file mode 100644
index 000000000000..efaeaece284e
--- /dev/null
+++ b/contrib/arm-optimized-routines/string/aarch64/memset-sve.S
@@ -0,0 +1,114 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2024-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ *
+ */
+
+#include "asmdefs.h"
+
+.arch armv8-a+sve
+
+#define dstin	x0
+#define val	x1
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+#define zva_val	x5
+#define vlen	x5
+#define off	x3
+#define dstend2 x5
+
+ENTRY (__memset_aarch64_sve)
+	dup	v0.16B, valw
+	cmp	count, 16
+	b.lo	L(set_16)
+
+	add	dstend, dstin, count
+	cmp	count, 64
+	b.hs	L(set_128)
+
+	/* Set 16..63 bytes.  */
+	mov	off, 16
+	and	off, off, count, lsr 1
+	sub	dstend2, dstend, off
+	str	q0, [dstin]
+	str	q0, [dstin, off]
+	str	q0, [dstend2, -16]
+	str	q0, [dstend, -16]
+	ret
+
+	.p2align 4
+L(set_16):
+	whilelo p0.b, xzr, count
+	st1b	z0.b, p0, [dstin]
+	ret
+
+	.p2align 4
+L(set_128):
+	bic	dst, dstin, 15
+	cmp	count, 128
+	b.hi	L(set_long)
+	stp	q0, q0, [dstin]
+	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+	.p2align 4
+L(set_long):
+	cmp	count, 256
+	b.lo	L(no_zva)
+	tst	valw, 255
+	b.ne	L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(no_zva)
+#endif
+	str	q0, [dstin]
+	str	q0, [dst, 16]
+	bic	dst, dstin, 31
+	stp	q0, q0, [dst, 32]
+	bic	dst, dstin, 63
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 128	/* Adjust count and bias for loop.  */
+
+	sub	x8, dstend, 1		/* Write last bytes before ZVA loop.  */
+	bic	x8, x8, 15
+	stp	q0, q0, [x8, -48]
+	str	q0, [x8, -16]
+	str	q0, [dstend, -16]
+
+	.p2align 4
+L(zva64_loop):
+	add	dst, dst, 64
+	dc	zva, dst
+	subs	count, count, 64
+	b.hi	L(zva64_loop)
+	ret
+
+L(no_zva):
+	str	q0, [dstin]
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+L(no_zva_loop):
+	stp	q0, q0, [dst, 16]
+	stp	q0, q0, [dst, 48]
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	L(no_zva_loop)
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+END (__memset_aarch64_sve)
diff --git a/contrib/arm-optimized-routines/string/aarch64/memset.S b/contrib/arm-optimized-routines/string/aarch64/memset.S
index 9fcd97579913..906a4dcf46c6 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memset.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memset.S
@@ -1,8 +1,8 @@
 /*
  * memset - fill memory with a constant byte
  *
- * Copyright (c) 2012-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2012-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  *
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define dstin	x0
 #define val	x1
@@ -20,93 +20,98 @@
 #define dst	x3
 #define dstend	x4
 #define zva_val	x5
+#define off	x3
+#define dstend2	x5
 
 ENTRY (__memset_aarch64)
-	PTR_ARG (0)
-	SIZE_ARG (2)
-
 	dup	v0.16B, valw
-	add	dstend, dstin, count
-
-	cmp	count, 96
-	b.hi	L(set_long)
 	cmp	count, 16
-	b.hs	L(set_medium)
-	mov	val, v0.D[0]
+	b.lo	L(set_small)
 
-	/* Set 0..15 bytes.  */
-	tbz	count, 3, 1f
-	str	val, [dstin]
-	str	val, [dstend, -8]
+	add	dstend, dstin, count
+	cmp	count, 64
+	b.hs	L(set_128)
+
+	/* Set 16..63 bytes.  */
+	mov	off, 16
+	and	off, off, count, lsr 1
+	sub	dstend2, dstend, off
+	str	q0, [dstin]
+	str	q0, [dstin, off]
+	str	q0, [dstend2, -16]
+	str	q0, [dstend, -16]
 	ret
+
 	.p2align 4
-1:	tbz	count, 2, 2f
-	str	valw, [dstin]
-	str	valw, [dstend, -4]
+	/* Set 0..15 bytes.  */
+L(set_small):
+	add	dstend, dstin, count
+	cmp	count, 4
+	b.lo	2f
+	lsr	off, count, 3
+	sub	dstend2, dstend, off, lsl 2
+	str	s0, [dstin]
+	str	s0, [dstin, off, lsl 2]
+	str	s0, [dstend2, -4]
+	str	s0, [dstend, -4]
 	ret
+
+	/* Set 0..3 bytes.  */
 2:	cbz	count, 3f
+	lsr	off, count, 1
 	strb	valw, [dstin]
-	tbz	count, 1, 3f
-	strh	valw, [dstend, -2]
+	strb	valw, [dstin, off]
+	strb	valw, [dstend, -1]
 3:	ret
 
-	/* Set 17..96 bytes.  */
-L(set_medium):
-	str	q0, [dstin]
-	tbnz	count, 6, L(set96)
-	str	q0, [dstend, -16]
-	tbz	count, 5, 1f
-	str	q0, [dstin, 16]
-	str	q0, [dstend, -32]
-1:	ret
-
 	.p2align 4
-	/* Set 64..96 bytes.  Write 64 bytes from the start and
-	   32 bytes from the end.  */
-L(set96):
-	str	q0, [dstin, 16]
+L(set_128):
+	bic	dst, dstin, 15
+	cmp	count, 128
+	b.hi	L(set_long)
+	stp	q0, q0, [dstin]
 	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
 
 	.p2align 4
 L(set_long):
-	and	valw, valw, 255
-	bic	dst, dstin, 15
 	str	q0, [dstin]
-	cmp	count, 160
-	ccmp	valw, 0, 0, hs
+	str	q0, [dst, 16]
+	tst	valw, 255
 	b.ne	L(no_zva)
-
 #ifndef SKIP_ZVA_CHECK
 	mrs	zva_val, dczid_el0
 	and	zva_val, zva_val, 31
 	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
 	b.ne	L(no_zva)
 #endif
-	str	q0, [dst, 16]
 	stp	q0, q0, [dst, 32]
-	bic	dst, dst, 63
+	bic	dst, dstin, 63
 	sub	count, dstend, dst	/* Count is now 64 too large.  */
-	sub	count, count, 128	/* Adjust count and bias for loop.  */
+	sub	count, count, 64 + 64	/* Adjust count and bias for loop.  */
+
+	/* Write last bytes before ZVA loop.  */
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
 
 	.p2align 4
-L(zva_loop):
+L(zva64_loop):
 	add	dst, dst, 64
 	dc	zva, dst
 	subs	count, count, 64
-	b.hi	L(zva_loop)
-	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
+	b.hi	L(zva64_loop)
 	ret
 
+	.p2align 3
 L(no_zva):
-	sub	count, dstend, dst	/* Count is 16 too large.  */
-	sub	dst, dst, 16		/* Dst is biased by -32.  */
-	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+	sub	count, dstend, dst	/* Count is 32 too large.  */
+	sub	count, count, 64 + 32	/* Adjust count and bias for loop.  */
 L(no_zva_loop):
 	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dst, 64]!
+	stp	q0, q0, [dst, 64]
+	add	dst, dst, 64
 	subs	count, count, 64
 	b.hi	L(no_zva_loop)
 	stp	q0, q0, [dstend, -64]
@@ -114,4 +119,3 @@ L(no_zva_loop):
 	ret
 
 END (__memset_aarch64)
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/stpcpy-mte.S b/contrib/arm-optimized-routines/string/aarch64/stpcpy-mte.S
deleted file mode 100644
index f1c711906515..000000000000
--- a/contrib/arm-optimized-routines/string/aarch64/stpcpy-mte.S
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * stpcpy - copy a string returning pointer to end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#define BUILD_STPCPY 1
-
-#include "strcpy-mte.S"
diff --git a/contrib/arm-optimized-routines/string/aarch64/stpcpy.S b/contrib/arm-optimized-routines/string/aarch64/stpcpy.S
index 4f62aa462389..155c68d75a7b 100644
--- a/contrib/arm-optimized-routines/string/aarch64/stpcpy.S
+++ b/contrib/arm-optimized-routines/string/aarch64/stpcpy.S
@@ -2,7 +2,7 @@
  * stpcpy - copy a string returning pointer to end.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #define BUILD_STPCPY 1
diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S
index dcb0e4625870..42b747311bc6 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S
@@ -1,8 +1,8 @@
 /*
  * strchr - find a character in a string
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin		x0
 #define chrin		w1
@@ -19,8 +19,7 @@
 
 #define src		x2
 #define tmp1		x1
-#define wtmp2		w3
-#define tmp3		x3
+#define tmp2		x3
 
 #define vrepchr		v0
 #define vdata		v1
@@ -28,39 +27,29 @@
 #define vhas_nul	v2
 #define vhas_chr	v3
 #define vrepmask	v4
-#define vrepmask2	v5
-#define vend		v6
-#define dend		d6
+#define vend		v5
+#define dend		d5
 
 /* Core algorithm.
 
    For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
-   requested character, bits 2-3 are set if the byte is NUL (or matched), and
-   bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd
-   bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
-   in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   per byte. Bits 0-1 are set if the relevant byte matched the requested
+   character, bits 2-3 are set if the byte is NUL or matched. Count trailing
+   zeroes gives the position of the matching byte if it is a multiple of 4.
+   If it is not a multiple of 4, there was no match.  */
 
 ENTRY (__strchr_aarch64_mte)
-	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
-	mov	wtmp2, 0x3003
-	dup	vrepmask.8h, wtmp2
+	movi	vrepmask.16b, 0x33
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	mov	wtmp2, 0xf00f
-	dup	vrepmask2.8h, wtmp2
-
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	lsl	tmp3, srcin, 2
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
-
+	lsl	tmp2, srcin, 2
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
-	lsr	tmp1, tmp1, tmp3
+	lsr	tmp1, tmp1, tmp2
 	cbz	tmp1, L(loop)
 
 	rbit	tmp1, tmp1
@@ -74,28 +63,34 @@ ENTRY (__strchr_aarch64_mte)
 
 	.p2align 4
 L(loop):
-	ldr	qdata, [src, 16]!
+	ldr	qdata, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	tmp1, dend
+	cbnz	tmp1, L(end)
+	ldr	qdata, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
+	sub	src, src, 16
+L(end):
 
 #ifdef __AARCH64EB__
 	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 #else
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 	rbit	tmp1, tmp1
 #endif
+	add	src, src, 16
 	clz	tmp1, tmp1
-	/* Tmp1 is an even multiple of 2 if the target character was
-	   found first. Otherwise we've found the end of string.  */
+	/* Tmp1 is a multiple of 4 if the target character was found.  */
 	tst	tmp1, 2
 	add	result, src, tmp1, lsr 2
 	csel	result, result, xzr, eq
diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr.S b/contrib/arm-optimized-routines/string/aarch64/strchr.S
index 1063cbfd77aa..c1d01e9635b6 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strchr.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strchr.S
@@ -1,8 +1,8 @@
 /*
  * strchr - find a character in a string
  *
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * Neon Available.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 /* Arguments and results.  */
 #define srcin		x0
@@ -51,7 +51,6 @@
 /* Locals and temporaries.  */
 
 ENTRY (__strchr_aarch64)
-	PTR_ARG (0)
 	/* Magic constant 0xc0300c03 to allow us to identify which lane
 	   matches the requested byte.  Even bits are set if the character
 	   matches, odd bits if either the char is NUL or matches.  */
diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S b/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S
index 1b0d0a63094c..b3180cdf9e2c 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S
@@ -1,8 +1,8 @@
 /*
  * strchrnul - find a character or nul in a string
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin		x0
 #define chrin		w1
@@ -20,38 +20,31 @@
 #define src		x2
 #define tmp1		x1
 #define tmp2		x3
-#define tmp2w		w3
 
 #define vrepchr		v0
 #define vdata		v1
 #define qdata		q1
 #define vhas_nul	v2
 #define vhas_chr	v3
-#define vrepmask	v4
-#define vend		v5
-#define dend		d5
+#define vend		v4
+#define dend		d4
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strchrnul_aarch64_mte)
-	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
-	mov	tmp2w, 0xf00f
-	dup	vrepmask.8h, tmp2w
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
 	lsl	tmp2, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
 	cbz	tmp1, L(loop)
@@ -63,15 +56,22 @@ ENTRY (__strchrnul_aarch64_mte)
 
 	.p2align 4
 L(loop):
-	ldr	qdata, [src, 16]!
+	ldr	qdata, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
+	fmov	tmp1, dend
+	cbnz	tmp1, L(end)
+	ldr	qdata, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
-
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	sub	src, src, 16
+L(end):
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
+	add	src, src, 16
 	fmov	tmp1, dend
 #ifndef __AARCH64EB__
 	rbit	tmp1, tmp1
diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul.S b/contrib/arm-optimized-routines/string/aarch64/strchrnul.S
index a4230d919b47..0a32c46c30c5 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strchrnul.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strchrnul.S
@@ -1,8 +1,8 @@
 /*
  * strchrnul - find a character or nul in a string
  *
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * Neon Available.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 /* Arguments and results.  */
 #define srcin		x0
@@ -47,7 +47,6 @@
 /* Locals and temporaries.  */
 
 ENTRY (__strchrnul_aarch64)
-	PTR_ARG (0)
 	/* Magic constant 0x40100401 to allow us to identify which lane
 	   matches the termination condition.  */
 	mov	wtmp2, #0x0401
diff --git a/contrib/arm-optimized-routines/string/aarch64/strcmp-mte.S b/contrib/arm-optimized-routines/string/aarch64/strcmp-mte.S
deleted file mode 100644
index 12d1a6b51dd3..000000000000
--- a/contrib/arm-optimized-routines/string/aarch64/strcmp-mte.S
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * strcmp - compare two strings
- *
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
-#define src1		x0
-#define src2		x1
-#define result		x0
-
-#define data1		x2
-#define data1w		w2
-#define data2		x3
-#define data2w		w3
-#define has_nul		x4
-#define diff		x5
-#define off1		x5
-#define syndrome	x6
-#define tmp		x6
-#define data3		x7
-#define zeroones	x8
-#define shift		x9
-#define off2		x10
-
-/* On big-endian early bytes are at MSB and on little-endian LSB.
-   LS_FW means shifting towards early bytes.  */
-#ifdef __AARCH64EB__
-# define LS_FW lsl
-#else
-# define LS_FW lsr
-#endif
-
-/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-   can be done in parallel across the entire word.
-   Since carry propagation makes 0x1 bytes before a NUL byte appear
-   NUL too in big-endian, byte-reverse the data before the NUL check.  */
-
-
-ENTRY (__strcmp_aarch64_mte)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	sub	off2, src2, src1
-	mov	zeroones, REP8_01
-	and	tmp, src1, 7
-	tst	off2, 7
-	b.ne	L(misaligned8)
-	cbnz	tmp, L(mutual_align)
-
-	.p2align 4
-
-L(loop_aligned):
-	ldr	data2, [src1, off2]
-	ldr	data1, [src1], 8
-L(start_realigned):
-#ifdef __AARCH64EB__
-	rev	tmp, data1
-	sub	has_nul, tmp, zeroones
-	orr	tmp, tmp, REP8_7f
-#else
-	sub	has_nul, data1, zeroones
-	orr	tmp, data1, REP8_7f
-#endif
-	bics	has_nul, has_nul, tmp	/* Non-zero if NUL terminator.  */
-	ccmp	data1, data2, 0, eq
-	b.eq	L(loop_aligned)
-#ifdef __AARCH64EB__
-	rev	has_nul, has_nul
-#endif
-	eor	diff, data1, data2
-	orr	syndrome, diff, has_nul
-L(end):
-#ifndef __AARCH64EB__
-	rev	syndrome, syndrome
-	rev	data1, data1
-	rev	data2, data2
-#endif
-	clz	shift, syndrome
-	/* The most-significant-non-zero bit of the syndrome marks either the
-	   first bit that is different, or the top bit of the first zero byte.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
-	lsl	data1, data1, shift
-	lsl	data2, data2, shift
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, 56
-	sub	result, data1, data2, lsr 56
-	ret
-
-	.p2align 4
-
-L(mutual_align):
-	/* Sources are mutually aligned, but are not currently at an
-	   alignment boundary.  Round down the addresses and then mask off
-	   the bytes that precede the start point.  */
-	bic	src1, src1, 7
-	ldr	data2, [src1, off2]
-	ldr	data1, [src1], 8
-	neg	shift, src2, lsl 3	/* Bits to alignment -64.  */
-	mov	tmp, -1
-	LS_FW	tmp, tmp, shift
-	orr	data1, data1, tmp
-	orr	data2, data2, tmp
-	b	L(start_realigned)
-
-L(misaligned8):
-	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
-	   checking to make sure that we don't access beyond the end of SRC2.  */
-	cbz	tmp, L(src1_aligned)
-L(do_misaligned):
-	ldrb	data1w, [src1], 1
-	ldrb	data2w, [src2], 1
-	cmp	data1w, 0
-	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
-	b.ne	L(done)
-	tst	src1, 7
-	b.ne	L(do_misaligned)
-
-L(src1_aligned):
-	neg	shift, src2, lsl 3
-	bic	src2, src2, 7
-	ldr	data3, [src2], 8
-#ifdef __AARCH64EB__
-	rev	data3, data3
-#endif
-	lsr	tmp, zeroones, shift
-	orr	data3, data3, tmp
-	sub	has_nul, data3, zeroones
-	orr	tmp, data3, REP8_7f
-	bics	has_nul, has_nul, tmp
-	b.ne	L(tail)
-
-	sub	off1, src2, src1
-
-	.p2align 4
-
-L(loop_unaligned):
-	ldr	data3, [src1, off1]
-	ldr	data2, [src1, off2]
-#ifdef __AARCH64EB__
-	rev	data3, data3
-#endif
-	sub	has_nul, data3, zeroones
-	orr	tmp, data3, REP8_7f
-	ldr	data1, [src1], 8
-	bics	has_nul, has_nul, tmp
-	ccmp	data1, data2, 0, eq
-	b.eq	L(loop_unaligned)
-
-	lsl	tmp, has_nul, shift
-#ifdef __AARCH64EB__
-	rev	tmp, tmp
-#endif
-	eor	diff, data1, data2
-	orr	syndrome, diff, tmp
-	cbnz	syndrome, L(end)
-L(tail):
-	ldr	data1, [src1]
-	neg	shift, shift
-	lsr	data2, data3, shift
-	lsr	has_nul, has_nul, shift
-#ifdef __AARCH64EB__
-	rev     data2, data2
-	rev	has_nul, has_nul
-#endif
-	eor	diff, data1, data2
-	orr	syndrome, diff, has_nul
-	b	L(end)
-
-L(done):
-	sub	result, data1, data2
-	ret
-
-END (__strcmp_aarch64_mte)
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/strcmp.S b/contrib/arm-optimized-routines/string/aarch64/strcmp.S
index 7714ebf5577d..7c0d0485a89b 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strcmp.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strcmp.S
@@ -1,168 +1,182 @@
 /*
  * strcmp - compare two strings
  *
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
+
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64.
+ * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
 
-/* Parameters and result.  */
 #define src1		x0
 #define src2		x1
 #define result		x0
 
-/* Internal variables.  */
 #define data1		x2
 #define data1w		w2
 #define data2		x3
 #define data2w		w3
 #define has_nul		x4
 #define diff		x5
+#define off1		x5
 #define syndrome	x6
-#define tmp1		x7
-#define tmp2		x8
-#define tmp3		x9
-#define zeroones	x10
-#define pos		x11
+#define tmp		x6
+#define data3		x7
+#define zeroones	x8
+#define shift		x9
+#define off2		x10
+
+/* On big-endian early bytes are at MSB and on little-endian LSB.
+   LS_FW means shifting towards early bytes.  */
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+#else
+# define LS_FW lsr
+#endif
+
+/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+   can be done in parallel across the entire word.
+   Since carry propagation makes 0x1 bytes before a NUL byte appear
+   NUL too in big-endian, byte-reverse the data before the NUL check.  */
+
 
-	/* Start of performance-critical section  -- one 64B cache line.  */
 ENTRY (__strcmp_aarch64)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	eor	tmp1, src1, src2
-	mov	zeroones, #REP8_01
-	tst	tmp1, #7
+	sub	off2, src2, src1
+	mov	zeroones, REP8_01
+	and	tmp, src1, 7
+	tst	off2, 7
 	b.ne	L(misaligned8)
-	ands	tmp1, src1, #7
-	b.ne	L(mutual_align)
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word.  */
+	cbnz	tmp, L(mutual_align)
+
+	.p2align 4
+
 L(loop_aligned):
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
+	ldr	data2, [src1, off2]
+	ldr	data1, [src1], 8
 L(start_realigned):
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+#ifdef __AARCH64EB__
+	rev	tmp, data1
+	sub	has_nul, tmp, zeroones
+	orr	tmp, tmp, REP8_7f
+#else
+	sub	has_nul, data1, zeroones
+	orr	tmp, data1, REP8_7f
+#endif
+	bics	has_nul, has_nul, tmp	/* Non-zero if NUL terminator.  */
+	ccmp	data1, data2, 0, eq
+	b.eq	L(loop_aligned)
+#ifdef __AARCH64EB__
+	rev	has_nul, has_nul
+#endif
+	eor	diff, data1, data2
 	orr	syndrome, diff, has_nul
-	cbz	syndrome, L(loop_aligned)
-	/* End of performance-critical section  -- one 64B cache line.  */
-
 L(end):
-#ifndef	__AARCH64EB__
+#ifndef __AARCH64EB__
 	rev	syndrome, syndrome
 	rev	data1, data1
-	/* The MS-non-zero bit of the syndrome marks either the first bit
-	   that is different, or the top bit of the first zero byte.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
-	clz	pos, syndrome
 	rev	data2, data2
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	ret
-#else
-	/* For big-endian we cannot use the trick with the syndrome value
-	   as carry-propagation can corrupt the upper bits if the trailing
-	   bytes in the string contain 0x01.  */
-	/* However, if there is no NUL byte in the dword, we can generate
-	   the result directly.  We can't just subtract the bytes as the
-	   MSB might be significant.  */
-	cbnz	has_nul, 1f
-	cmp	data1, data2
-	cset	result, ne
-	cneg	result, result, lo
-	ret
-1:
-	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
-	rev	tmp3, data1
-	sub	tmp1, tmp3, zeroones
-	orr	tmp2, tmp3, #REP8_7f
-	bic	has_nul, tmp1, tmp2
-	rev	has_nul, has_nul
-	orr	syndrome, diff, has_nul
-	clz	pos, syndrome
-	/* The MS-non-zero bit of the syndrome marks either the first bit
-	   that is different, or the top bit of the first zero byte.
+#endif
+	clz	shift, syndrome
+	/* The most-significant-non-zero bit of the syndrome marks either the
+	   first bit that is different, or the top bit of the first zero byte.
 	   Shifting left now will bring the critical information into the
 	   top bits.  */
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
+	lsl	data1, data1, shift
+	lsl	data2, data2, shift
 	/* But we need to zero-extend (char is unsigned) the value and then
 	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
+	lsr	data1, data1, 56
+	sub	result, data1, data2, lsr 56
 	ret
-#endif
+
+	.p2align 4
 
 L(mutual_align):
 	/* Sources are mutually aligned, but are not currently at an
 	   alignment boundary.  Round down the addresses and then mask off
-	   the bytes that preceed the start point.  */
-	bic	src1, src1, #7
-	bic	src2, src2, #7
-	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
-	ldr	data1, [src1], #8
-	neg	tmp1, tmp1		/* Bits to alignment -64.  */
-	ldr	data2, [src2], #8
-	mov	tmp2, #~0
-#ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#endif
-	orr	data1, data1, tmp2
-	orr	data2, data2, tmp2
+	   the bytes that precede the start point.  */
+	bic	src1, src1, 7
+	ldr	data2, [src1, off2]
+	ldr	data1, [src1], 8
+	neg	shift, src2, lsl 3	/* Bits to alignment -64.  */
+	mov	tmp, -1
+	LS_FW	tmp, tmp, shift
+	orr	data1, data1, tmp
+	orr	data2, data2, tmp
 	b	L(start_realigned)
 
 L(misaligned8):
 	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
-	   checking to make sure that we don't access beyond page boundary in
-	   SRC2.  */
-	tst	src1, #7
-	b.eq	L(loop_misaligned)
+	   checking to make sure that we don't access beyond the end of SRC2.  */
+	cbz	tmp, L(src1_aligned)
 L(do_misaligned):
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	cmp	data1w, #1
-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	ldrb	data1w, [src1], 1
+	ldrb	data2w, [src2], 1
+	cmp	data1w, 0
+	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
 	b.ne	L(done)
-	tst	src1, #7
+	tst	src1, 7
 	b.ne	L(do_misaligned)
 
-L(loop_misaligned):
-	/* Test if we are within the last dword of the end of a 4K page.  If
-	   yes then jump back to the misaligned loop to copy a byte at a time.  */
-	and	tmp1, src2, #0xff8
-	eor	tmp1, tmp1, #0xff8
-	cbz	tmp1, L(do_misaligned)
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+L(src1_aligned):
+	neg	shift, src2, lsl 3
+	bic	src2, src2, 7
+	ldr	data3, [src2], 8
+#ifdef __AARCH64EB__
+	rev	data3, data3
+#endif
+	lsr	tmp, zeroones, shift
+	orr	data3, data3, tmp
+	sub	has_nul, data3, zeroones
+	orr	tmp, data3, REP8_7f
+	bics	has_nul, has_nul, tmp
+	b.ne	L(tail)
+
+	sub	off1, src2, src1
+
+	.p2align 4
+
+L(loop_unaligned):
+	ldr	data3, [src1, off1]
+	ldr	data2, [src1, off2]
+#ifdef __AARCH64EB__
+	rev	data3, data3
+#endif
+	sub	has_nul, data3, zeroones
+	orr	tmp, data3, REP8_7f
+	ldr	data1, [src1], 8
+	bics	has_nul, has_nul, tmp
+	ccmp	data1, data2, 0, eq
+	b.eq	L(loop_unaligned)
+
+	lsl	tmp, has_nul, shift
+#ifdef __AARCH64EB__
+	rev	tmp, tmp
+#endif
+	eor	diff, data1, data2
+	orr	syndrome, diff, tmp
+	cbnz	syndrome, L(end)
+L(tail):
+	ldr	data1, [src1]
+	neg	shift, shift
+	lsr	data2, data3, shift
+	lsr	has_nul, has_nul, shift
+#ifdef __AARCH64EB__
+	rev     data2, data2
+	rev	has_nul, has_nul
+#endif
+	eor	diff, data1, data2
 	orr	syndrome, diff, has_nul
-	cbz	syndrome, L(loop_misaligned)
 	b	L(end)
 
 L(done):
diff --git a/contrib/arm-optimized-routines/string/aarch64/strcpy-mte.S b/contrib/arm-optimized-routines/string/aarch64/strcpy-mte.S
deleted file mode 100644
index 88c222d61e53..000000000000
--- a/contrib/arm-optimized-routines/string/aarch64/strcpy-mte.S
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * strcpy/stpcpy - copy a string returning pointer to start/end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define dstin		x0
-#define srcin		x1
-#define result		x0
-
-#define src		x2
-#define dst		x3
-#define len		x4
-#define synd		x4
-#define	tmp		x5
-#define wtmp		w5
-#define shift		x5
-#define data1		x6
-#define dataw1		w6
-#define data2		x7
-#define dataw2		w7
-
-#define dataq		q0
-#define vdata		v0
-#define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
-#define dataq2		q1
-
-#ifdef BUILD_STPCPY
-# define STRCPY __stpcpy_aarch64_mte
-# define IFSTPCPY(X,...) X,__VA_ARGS__
-#else
-# define STRCPY __strcpy_aarch64_mte
-# define IFSTPCPY(X,...)
-#endif
-
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
-
-ENTRY (STRCPY)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
-	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
-	cmeq	vhas_nul.16b, vdata.16b, 0
-	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
-	fmov	synd, dend
-	lsr	synd, synd, shift
-	cbnz	synd, L(tail)
-
-	ldr	dataq, [src, 16]!
-	cmeq	vhas_nul.16b, vdata.16b, 0
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
-	fmov	synd, dend
-	cbz	synd, L(start_loop)
-
-#ifndef __AARCH64EB__
-	rbit	synd, synd
-#endif
-	sub	tmp, src, srcin
-	clz	len, synd
-	add	len, tmp, len, lsr 2
-	tbz	len, 4, L(less16)
-	sub	tmp, len, 15
-	ldr	dataq, [srcin]
-	ldr	dataq2, [srcin, tmp]
-	str	dataq, [dstin]
-	str	dataq2, [dstin, tmp]
-	IFSTPCPY (add result, dstin, len)
-	ret
-
-	.p2align 4,,8
-L(tail):
-	rbit	synd, synd
-	clz	len, synd
-	lsr	len, len, 2
-
-	.p2align 4
-L(less16):
-	tbz	len, 3, L(less8)
-	sub	tmp, len, 7
-	ldr	data1, [srcin]
-	ldr	data2, [srcin, tmp]
-	str	data1, [dstin]
-	str	data2, [dstin, tmp]
-	IFSTPCPY (add result, dstin, len)
-	ret
-
-	.p2align 4
-L(less8):
-	subs	tmp, len, 3
-	b.lo	L(less4)
-	ldr	dataw1, [srcin]
-	ldr	dataw2, [srcin, tmp]
-	str	dataw1, [dstin]
-	str	dataw2, [dstin, tmp]
-	IFSTPCPY (add result, dstin, len)
-	ret
-
-L(less4):
-	cbz	len, L(zerobyte)
-	ldrh	dataw1, [srcin]
-	strh	dataw1, [dstin]
-L(zerobyte):
-	strb	wzr, [dstin, len]
-	IFSTPCPY (add result, dstin, len)
-	ret
-
-	.p2align 4
-L(start_loop):
-	sub	len, src, srcin
-	ldr	dataq2, [srcin]
-	add	dst, dstin, len
-	str	dataq2, [dstin]
-
-	.p2align 5
-L(loop):
-	str	dataq, [dst], 16
-	ldr	dataq, [src, 16]!
-	cmeq	vhas_nul.16b, vdata.16b, 0
-	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
-	fmov	synd, dend
-	cbz	synd, L(loop)
-
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
-	fmov	synd, dend
-#ifndef __AARCH64EB__
-	rbit	synd, synd
-#endif
-	clz	len, synd
-	lsr	len, len, 2
-	sub	tmp, len, 15
-	ldr	dataq, [src, tmp]
-	str	dataq, [dst, tmp]
-	IFSTPCPY (add result, dst, len)
-	ret
-
-END (STRCPY)
diff --git a/contrib/arm-optimized-routines/string/aarch64/strcpy.S b/contrib/arm-optimized-routines/string/aarch64/strcpy.S
index 6e9ed424b693..5852616e6024 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strcpy.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strcpy.S
@@ -1,311 +1,154 @@
 /*
  * strcpy/stpcpy - copy a string returning pointer to start/end.
  *
- * Copyright (c) 2013-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
-/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
-
-   To test the page crossing code path more thoroughly, compile with
-   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
-   entry path.  This option is not intended for production use.  */
-
-/* Arguments and results.  */
 #define dstin		x0
 #define srcin		x1
+#define result		x0
 
-/* Locals and temporaries.  */
 #define src		x2
 #define dst		x3
-#define data1		x4
-#define data1w		w4
-#define data2		x5
-#define data2w		w5
-#define has_nul1	x6
-#define has_nul2	x7
-#define tmp1		x8
-#define tmp2		x9
-#define tmp3		x10
-#define tmp4		x11
-#define zeroones	x12
-#define data1a		x13
-#define data2a		x14
-#define pos		x15
-#define len		x16
-#define to_align	x17
+#define len		x4
+#define synd		x4
+#define	tmp		x5
+#define shift		x5
+#define data1		x6
+#define dataw1		w6
+#define data2		x7
+#define dataw2		w7
+
+#define dataq		q0
+#define vdata		v0
+#define vhas_nul	v1
+#define vend		v2
+#define dend		d2
+#define dataq2		q1
 
 #ifdef BUILD_STPCPY
-#define STRCPY __stpcpy_aarch64
-#else
-#define STRCPY __strcpy_aarch64
-#endif
-
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word.  */
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
-	/* AArch64 systems have a minimum page size of 4k.  We can do a quick
-	   page size check for crossing this boundary on entry and if we
-	   do not, then we can short-circuit much of the entry code.  We
-	   expect early page-crossing strings to be rare (probability of
-	   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
-	   predictable, even with random strings.
-
-	   We don't bother checking for larger page sizes, the cost of setting
-	   up the correct page size is just not worth the extra gain from
-	   a small reduction in the cases taking the slow path.  Note that
-	   we only care about whether the first fetch, which may be
-	   misaligned, crosses a page boundary - after that we move to aligned
-	   fetches for the remainder of the string.  */
-
-#ifdef STRCPY_TEST_PAGE_CROSS
-	/* Make everything that isn't Qword aligned look like a page cross.  */
-#define MIN_PAGE_P2 4
+# define STRCPY __stpcpy_aarch64
+# define IFSTPCPY(X,...) X,__VA_ARGS__
 #else
-#define MIN_PAGE_P2 12
+# define STRCPY __strcpy_aarch64
+# define IFSTPCPY(X,...)
 #endif
 
-#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRCPY)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	/* For moderately short strings, the fastest way to do the copy is to
-	   calculate the length of the string in the same way as strlen, then
-	   essentially do a memcpy of the result.  This avoids the need for
-	   multiple byte copies and further means that by the time we
-	   reach the bulk copy loop we know we can always use DWord
-	   accesses.  We expect __strcpy_aarch64 to rarely be called repeatedly
-	   with the same source string, so branch prediction is likely to
-	   always be difficult - we mitigate against this by preferring
-	   conditional select operations over branches whenever this is
-	   feasible.  */
-	and	tmp2, srcin, #(MIN_PAGE_SIZE - 1)
-	mov	zeroones, #REP8_01
-	and	to_align, srcin, #15
-	cmp	tmp2, #(MIN_PAGE_SIZE - 16)
-	neg	tmp1, to_align
-	/* The first fetch will straddle a (possible) page boundary iff
-	   srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
-	   aligned string will never fail the page align check, so will
-	   always take the fast path.  */
-	b.gt	L(page_cross)
-
-L(page_cross_ok):
-	ldp	data1, data2, [srcin]
-#ifdef __AARCH64EB__
-	/* Because we expect the end to be found within 16 characters
-	   (profiling shows this is the most common case), it's worth
-	   swapping the bytes now to save having to recalculate the
-	   termination syndrome later.  We preserve data1 and data2
-	   so that we can re-use the values later on.  */
-	rev	tmp2, data1
-	sub	tmp1, tmp2, zeroones
-	orr	tmp2, tmp2, #REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	b.ne	L(fp_le8)
-	rev	tmp4, data2
-	sub	tmp3, tmp4, zeroones
-	orr	tmp4, tmp4, #REP8_7f
-#else
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	b.ne	L(fp_le8)
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
+	bic	src, srcin, 15
+	ld1	{vdata.16b}, [src]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	lsl	shift, srcin, 2
+	shrn	vend.8b, vhas_nul.8h, 4
+	fmov	synd, dend
+	lsr	synd, synd, shift
+	cbnz	synd, L(tail)
+
+	ldr	dataq, [src, 16]!
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	shrn	vend.8b, vhas_nul.8h, 4
+	fmov	synd, dend
+	cbz	synd, L(start_loop)
+
+#ifndef __AARCH64EB__
+	rbit	synd, synd
 #endif
-	bics	has_nul2, tmp3, tmp4
-	b.eq	L(bulk_entry)
+	sub	tmp, src, srcin
+	clz	len, synd
+	add	len, tmp, len, lsr 2
+	tbz	len, 4, L(less16)
+	sub	tmp, len, 15
+	ldr	dataq, [srcin]
+	ldr	dataq2, [srcin, tmp]
+	str	dataq, [dstin]
+	str	dataq2, [dstin, tmp]
+	IFSTPCPY (add result, dstin, len)
+	ret
 
-	/* The string is short (<=16 bytes).  We don't know exactly how
-	   short though, yet.  Work out the exact length so that we can
-	   quickly select the optimal copy strategy.  */
-L(fp_gt8):
-	rev	has_nul2, has_nul2
-	clz	pos, has_nul2
-	mov	tmp2, #56
-	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
-	sub	pos, tmp2, pos
-#ifdef __AARCH64EB__
-	lsr	data2, data2, pos
-#else
-	lsl	data2, data2, pos
-#endif
-	str	data2, [dst, #1]
+L(tail):
+	rbit	synd, synd
+	clz	len, synd
+	lsr	len, len, 2
+L(less16):
+	tbz	len, 3, L(less8)
+	sub	tmp, len, 7
+	ldr	data1, [srcin]
+	ldr	data2, [srcin, tmp]
 	str	data1, [dstin]
-#ifdef BUILD_STPCPY
-	add	dstin, dst, #8
-#endif
+	str	data2, [dstin, tmp]
+	IFSTPCPY (add result, dstin, len)
 	ret
 
-L(fp_le8):
-	rev	has_nul1, has_nul1
-	clz	pos, has_nul1
-	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
-	subs	tmp2, pos, #24			/* Pos in bits. */
-	b.lt	L(fp_lt4)
-#ifdef __AARCH64EB__
-	mov	tmp2, #56
-	sub	pos, tmp2, pos
-	lsr	data2, data1, pos
-	lsr	data1, data1, #32
-#else
-	lsr	data2, data1, tmp2
-#endif
-	/* 4->7 bytes to copy.  */
-	str	data2w, [dst, #-3]
-	str	data1w, [dstin]
-#ifdef BUILD_STPCPY
-	mov	dstin, dst
-#endif
+	.p2align 4
+L(less8):
+	subs	tmp, len, 3
+	b.lo	L(less4)
+	ldr	dataw1, [srcin]
+	ldr	dataw2, [srcin, tmp]
+	str	dataw1, [dstin]
+	str	dataw2, [dstin, tmp]
+	IFSTPCPY (add result, dstin, len)
 	ret
-L(fp_lt4):
-	cbz	pos, L(fp_lt2)
-	/* 2->3 bytes to copy.  */
-#ifdef __AARCH64EB__
-	lsr	data1, data1, #48
-#endif
-	strh	data1w, [dstin]
-	/* Fall-through, one byte (max) to go.  */
-L(fp_lt2):
-	/* Null-terminated string.  Last character must be zero!  */
-	strb	wzr, [dst]
-#ifdef BUILD_STPCPY
-	mov	dstin, dst
-#endif
-	ret
-
-	.p2align 6
-	/* Aligning here ensures that the entry code and main loop all lies
-	   within one 64-byte cache line.  */
-L(bulk_entry):
-	sub	to_align, to_align, #16
-	stp	data1, data2, [dstin]
-	sub	src, srcin, to_align
-	sub	dst, dstin, to_align
-	b	L(entry_no_page_cross)
-
-	/* The inner loop deals with two Dwords at a time.  This has a
-	   slightly higher start-up cost, but we should win quite quickly,
-	   especially on cores with a high number of issue slots per
-	   cycle, as we get much better parallelism out of the operations.  */
-L(main_loop):
-	stp	data1, data2, [dst], #16
-L(entry_no_page_cross):
-	ldp	data1, data2, [src], #16
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-	bics	has_nul2, tmp3, tmp4
-	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
-	b.eq	L(main_loop)
 
-	/* Since we know we are copying at least 16 bytes, the fastest way
-	   to deal with the tail is to determine the location of the
-	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
-	cmp	has_nul1, #0
-#ifdef __AARCH64EB__
-	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul directly.  The
-	   easiest way to get the correct byte is to byte-swap the data
-	   and calculate the syndrome a second time.  */
-	csel	data1, data1, data2, ne
-	rev	data1, data1
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-#else
-	csel	has_nul1, has_nul1, has_nul2, ne
-#endif
-	rev	has_nul1, has_nul1
-	clz	pos, has_nul1
-	add	tmp1, pos, #72
-	add	pos, pos, #8
-	csel	pos, pos, tmp1, ne
-	add	src, src, pos, lsr #3
-	add	dst, dst, pos, lsr #3
-	ldp	data1, data2, [src, #-32]
-	stp	data1, data2, [dst, #-16]
-#ifdef BUILD_STPCPY
-	sub	dstin, dst, #1
-#endif
+L(less4):
+	cbz	len, L(zerobyte)
+	ldrh	dataw1, [srcin]
+	strh	dataw1, [dstin]
+L(zerobyte):
+	strb	wzr, [dstin, len]
+	IFSTPCPY (add result, dstin, len)
 	ret
 
-L(page_cross):
-	bic	src, srcin, #15
-	/* Start by loading two words at [srcin & ~15], then forcing the
-	   bytes that precede srcin to 0xff.  This means they never look
-	   like termination bytes.  */
-	ldp	data1, data2, [src]
-	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
-	tst	to_align, #7
-	csetm	tmp2, ne
-#ifdef __AARCH64EB__
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#else
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+	.p2align 4
+L(start_loop):
+	sub	tmp, srcin, dstin
+	ldr	dataq2, [srcin]
+	sub	dst, src, tmp
+	str	dataq2, [dstin]
+L(loop):
+	str	dataq, [dst], 32
+	ldr	dataq, [src, 16]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbnz	synd, L(loopend)
+	str	dataq, [dst, -16]
+	ldr	dataq, [src, 32]!
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbz	synd, L(loop)
+	add	dst, dst, 16
+L(loopend):
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
+	fmov	synd, dend
+	sub	dst, dst, 31
+#ifndef __AARCH64EB__
+	rbit	synd, synd
 #endif
-	orr	data1, data1, tmp2
-	orr	data2a, data2, tmp2
-	cmp	to_align, #8
-	csinv	data1, data1, xzr, lt
-	csel	data2, data2, data2a, lt
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-	bics	has_nul2, tmp3, tmp4
-	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
-	b.eq	L(page_cross_ok)
-	/* We now need to make data1 and data2 look like they've been
-	   loaded directly from srcin.  Do a rotate on the 128-bit value.  */
-	lsl	tmp1, to_align, #3	/* Bytes->bits.  */
-	neg	tmp2, to_align, lsl #3
-#ifdef __AARCH64EB__
-	lsl	data1a, data1, tmp1
-	lsr	tmp4, data2, tmp2
-	lsl	data2, data2, tmp1
-	orr	tmp4, tmp4, data1a
-	cmp	to_align, #8
-	csel	data1, tmp4, data2, lt
-	rev	tmp2, data1
-	rev	tmp4, data2
-	sub	tmp1, tmp2, zeroones
-	orr	tmp2, tmp2, #REP8_7f
-	sub	tmp3, tmp4, zeroones
-	orr	tmp4, tmp4, #REP8_7f
-#else
-	lsr	data1a, data1, tmp1
-	lsl	tmp4, data2, tmp2
-	lsr	data2, data2, tmp1
-	orr	tmp4, tmp4, data1a
-	cmp	to_align, #8
-	csel	data1, tmp4, data2, lt
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-#endif
-	bic	has_nul1, tmp1, tmp2
-	cbnz	has_nul1, L(fp_le8)
-	bic	has_nul2, tmp3, tmp4
-	b	L(fp_gt8)
+	clz	len, synd
+	lsr	len, len, 2
+	add	dst, dst, len
+	ldr	dataq, [dst, tmp]
+	str	dataq, [dst]
+	IFSTPCPY (add result, dst, 15)
+	ret
 
 END (STRCPY)
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S b/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S
index 7cf41d5c1eac..afa72eed9a43 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S
@@ -1,8 +1,8 @@
 /*
  * strlen - calculate the length of a string.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin		x0
 #define result		x0
@@ -19,62 +19,71 @@
 #define src		x1
 #define	synd		x2
 #define tmp		x3
-#define wtmp		w3
 #define shift		x4
 
 #define data		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+   four bits per byte using the shrn instruction. A count trailing zeros then
+   identifies the first zero byte.  */
 
 ENTRY (__strlen_aarch64_mte)
-	PTR_ARG (0)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
-	cbz	synd, L(loop)
+	cbz	synd, L(next16)
 
 	rbit	synd, synd
 	clz	result, synd
 	lsr	result, result, 2
 	ret
 
-	.p2align 5
-L(loop):
-	ldr	data, [src, 16]!
+L(next16):
+	ldr	data, [src, 16]
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(loop)
-
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	add	src, src, 16
+#ifndef __AARCH64EB__
+	rbit	synd, synd
+#endif
 	sub	result, src, srcin
+	clz	tmp, synd
+	add	result, result, tmp, lsr 2
+	ret
+
+	.p2align 5
+L(loop):
+	ldr	data, [src, 32]!
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	addhn	vend.8b, vhas_nul.8h, vhas_nul.8h
+	fmov	synd, dend
+	cbnz	synd, L(loop_end)
+	ldr	data, [src, 16]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	addhn	vend.8b, vhas_nul.8h, vhas_nul.8h
 	fmov	synd, dend
+	cbz	synd, L(loop)
+	add	src, src, 16
+L(loop_end):
+	sub	result, shift, src, lsl 2	/* (srcin - src) << 2.  */
 #ifndef __AARCH64EB__
 	rbit	synd, synd
+	sub	result, result, 3
 #endif
 	clz	tmp, synd
-	add	result, result, tmp, lsr 2
+	sub	result, tmp, result
+	lsr	result, result, 2
 	ret
 
 END (__strlen_aarch64_mte)
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen.S b/contrib/arm-optimized-routines/string/aarch64/strlen.S
index a1b164a49238..0ebb26be844c 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strlen.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strlen.S
@@ -1,8 +1,8 @@
 /*
  * strlen - calculate the length of a string.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * Not MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin	x0
 #define len	x0
@@ -36,6 +36,7 @@
 #define tmp	x2
 #define tmpw	w2
 #define synd	x3
+#define syndw	w3
 #define shift	x4
 
 /* For the first 32 bytes, NUL detection works on the principle that
@@ -74,7 +75,6 @@
    character, return the length, if not, continue in the main loop.  */
 
 ENTRY (__strlen_aarch64)
-	PTR_ARG (0)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
 	cmp	tmp1, MIN_PAGE_SIZE - 32
 	b.hi	L(page_cross)
@@ -110,7 +110,6 @@ ENTRY (__strlen_aarch64)
 	add	len, len, tmp1, lsr 3
 	ret
 
-	.p2align 3
 	/* Look for a NUL byte at offset 16..31 in the string.  */
 L(bytes16_31):
 	ldp	data1, data2, [srcin, 16]
@@ -138,6 +137,7 @@ L(bytes16_31):
 	add	len, len, tmp1, lsr 3
 	ret
 
+	nop
 L(loop_entry):
 	bic	src, srcin, 31
 
@@ -153,18 +153,12 @@ L(loop):
 	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
 	cmeq	maskv.16b, datav1.16b, 0
 	sub	len, src, srcin
-	tst	synd, 0xffffffff
-	b.ne	1f
+	cbnz	syndw, 1f
 	cmeq	maskv.16b, datav2.16b, 0
 	add	len, len, 16
 1:
 	/* Generate a bitmask and compute correct byte offset.  */
-#ifdef __AARCH64EB__
-	bic	maskv.8h, 0xf0
-#else
-	bic	maskv.8h, 0x0f, lsl 8
-#endif
-	umaxp	maskv.16b, maskv.16b, maskv.16b
+	shrn	maskv.8b, maskv.8h, 4
 	fmov	synd, maskd
 #ifndef __AARCH64EB__
 	rbit	synd, synd
@@ -173,8 +167,6 @@ L(loop):
 	add	len, len, tmp, lsr 2
 	ret
 
-        .p2align 4
-
 L(page_cross):
 	bic	src, srcin, 31
 	mov	tmpw, 0x0c03
diff --git a/contrib/arm-optimized-routines/string/aarch64/strncmp-mte.S b/contrib/arm-optimized-routines/string/aarch64/strncmp-mte.S
deleted file mode 100644
index c9d6fc8a158b..000000000000
--- a/contrib/arm-optimized-routines/string/aarch64/strncmp-mte.S
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * strncmp - compare two strings
- *
- * Copyright (c) 2013-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64
- */
-
-#include "../asmdefs.h"
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
-/* Parameters and result.  */
-#define src1		x0
-#define src2		x1
-#define limit		x2
-#define result		x0
-
-/* Internal variables.  */
-#define data1		x3
-#define data1w		w3
-#define data2		x4
-#define data2w		w4
-#define has_nul		x5
-#define diff		x6
-#define syndrome	x7
-#define tmp1		x8
-#define tmp2		x9
-#define tmp3		x10
-#define zeroones	x11
-#define pos		x12
-#define mask		x13
-#define endloop		x14
-#define count		mask
-#define offset		pos
-#define neg_offset	x15
-
-/* Define endian dependent shift operations.
-   On big-endian early bytes are at MSB and on little-endian LSB.
-   LS_FW means shifting towards early bytes.
-   LS_BK means shifting towards later bytes.
-   */
-#ifdef __AARCH64EB__
-#define LS_FW lsl
-#define LS_BK lsr
-#else
-#define LS_FW lsr
-#define LS_BK lsl
-#endif
-
-ENTRY (__strncmp_aarch64_mte)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
-	cbz	limit, L(ret0)
-	eor	tmp1, src1, src2
-	mov	zeroones, #REP8_01
-	tst	tmp1, #7
-	and	count, src1, #7
-	b.ne	L(misaligned8)
-	cbnz	count, L(mutual_align)
-
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word.  */
-	.p2align 4
-L(loop_aligned):
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-L(start_realigned):
-	subs	limit, limit, #8
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
-	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
-	ccmp	endloop, #0, #0, eq
-	b.eq	L(loop_aligned)
-	/* End of main loop */
-
-L(full_check):
-#ifndef __AARCH64EB__
-	orr	syndrome, diff, has_nul
-	add	limit, limit, 8	/* Rewind limit to before last subs. */
-L(syndrome_check):
-	/* Limit was reached. Check if the NUL byte or the difference
-	   is before the limit. */
-	rev	syndrome, syndrome
-	rev	data1, data1
-	clz	pos, syndrome
-	rev	data2, data2
-	lsl	data1, data1, pos
-	cmp	limit, pos, lsr #3
-	lsl	data2, data2, pos
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	csel result, result, xzr, hi
-	ret
-#else
-	/* Not reached the limit, must have found the end or a diff.  */
-	tbz	limit, #63, L(not_limit)
-	add	tmp1, limit, 8
-	cbz	limit, L(not_limit)
-
-	lsl	limit, tmp1, #3	/* Bits -> bytes.  */
-	mov	mask, #~0
-	lsr	mask, mask, limit
-	bic	data1, data1, mask
-	bic	data2, data2, mask
-
-	/* Make sure that the NUL byte is marked in the syndrome.  */
-	orr	has_nul, has_nul, mask
-
-L(not_limit):
-	/* For big-endian we cannot use the trick with the syndrome value
-	   as carry-propagation can corrupt the upper bits if the trailing
-	   bytes in the string contain 0x01.  */
-	/* However, if there is no NUL byte in the dword, we can generate
-	   the result directly.  We can't just subtract the bytes as the
-	   MSB might be significant.  */
-	cbnz	has_nul, 1f
-	cmp	data1, data2
-	cset	result, ne
-	cneg	result, result, lo
-	ret
-1:
-	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
-	rev	tmp3, data1
-	sub	tmp1, tmp3, zeroones
-	orr	tmp2, tmp3, #REP8_7f
-	bic	has_nul, tmp1, tmp2
-	rev	has_nul, has_nul
-	orr	syndrome, diff, has_nul
-	clz	pos, syndrome
-	/* The most-significant-non-zero bit of the syndrome marks either the
-	   first bit that is different, or the top bit of the first zero byte.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
-L(end_quick):
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	ret
-#endif
-
-L(mutual_align):
-	/* Sources are mutually aligned, but are not currently at an
-	   alignment boundary.  Round down the addresses and then mask off
-	   the bytes that precede the start point.
-	   We also need to adjust the limit calculations, but without
-	   overflowing if the limit is near ULONG_MAX.  */
-	bic	src1, src1, #7
-	bic	src2, src2, #7
-	ldr	data1, [src1], #8
-	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
-	ldr	data2, [src2], #8
-	mov	tmp2, #~0
-	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
-	/* Adjust the limit and ensure it doesn't overflow.  */
-	adds	limit, limit, count
-	csinv	limit, limit, xzr, lo
-	orr	data1, data1, tmp2
-	orr	data2, data2, tmp2
-	b	L(start_realigned)
-
-	.p2align 4
-	/* Don't bother with dwords for up to 16 bytes.  */
-L(misaligned8):
-	cmp	limit, #16
-	b.hs	L(try_misaligned_words)
-
-L(byte_loop):
-	/* Perhaps we can do better than this.  */
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	limit, limit, #1
-	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	L(byte_loop)
-L(done):
-	sub	result, data1, data2
-	ret
-	/* Align the SRC1 to a dword by doing a bytewise compare and then do
-	   the dword loop.  */
-L(try_misaligned_words):
-	cbz	count, L(src1_aligned)
-
-	neg	count, count
-	and	count, count, #7
-	sub	limit, limit, count
-
-L(page_end_loop):
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	cmp	data1w, #1
-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.ne	L(done)
-	subs	count, count, #1
-	b.hi	L(page_end_loop)
-
-	/* The following diagram explains the comparison of misaligned strings.
-	   The bytes are shown in natural order. For little-endian, it is
-	   reversed in the registers. The "x" bytes are before the string.
-	   The "|" separates data that is loaded at one time.
-	   src1     | a a a a a a a a | b b b c c c c c | . . .
-	   src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
-
-	   After shifting in each step, the data looks like this:
-	                STEP_A              STEP_B              STEP_C
-	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
-	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
-
-	   The bytes with "0" are eliminated from the syndrome via mask.
-
-	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
-	   time from SRC2. The comparison happens in 3 steps. After each step
-	   the loop can exit, or read from SRC1 or SRC2. */
-L(src1_aligned):
-	/* Calculate offset from 8 byte alignment to string start in bits. No
-	   need to mask offset since shifts are ignoring upper bits. */
-	lsl	offset, src2, #3
-	bic	src2, src2, #0xf
-	mov	mask, -1
-	neg	neg_offset, offset
-	ldr	data1, [src1], #8
-	ldp	tmp1, tmp2, [src2], #16
-	LS_BK	mask, mask, neg_offset
-	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
-	/* Skip the first compare if data in tmp1 is irrelevant. */
-	tbnz	offset, 6, L(misaligned_mid_loop)
-
-L(loop_misaligned):
-	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
-	LS_FW	data2, tmp1, offset
-	LS_BK	tmp1, tmp2, neg_offset
-	subs	limit, limit, #8
-	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/
-	sub	has_nul, data1, zeroones
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	orr	tmp3, data1, #REP8_7f
-	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
-	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
-	orr	tmp3, endloop, has_nul
-	cbnz	tmp3, L(full_check)
-
-	ldr	data1, [src1], #8
-L(misaligned_mid_loop):
-	/* STEP_B: Compare first part of data1 to second part of tmp2. */
-	LS_FW	data2, tmp2, offset
-#ifdef __AARCH64EB__
-	/* For big-endian we do a byte reverse to avoid carry-propagation
-	problem described above. This way we can reuse the has_nul in the
-	next step and also use syndrome value trick at the end. */
-	rev	tmp3, data1
-	#define data1_fixed tmp3
-#else
-	#define data1_fixed data1
-#endif
-	sub	has_nul, data1_fixed, zeroones
-	orr	tmp3, data1_fixed, #REP8_7f
-	eor	diff, data2, data1	/* Non-zero if differences found.  */
-	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
-#ifdef __AARCH64EB__
-	rev	has_nul, has_nul
-#endif
-	cmp	limit, neg_offset, lsr #3
-	orr	syndrome, diff, has_nul
-	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
-	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
-	cbnz	tmp3, L(syndrome_check)
-
-	/* STEP_C: Compare second part of data1 to first part of tmp1. */
-	ldp	tmp1, tmp2, [src2], #16
-	cmp	limit, #8
-	LS_BK	data2, tmp1, neg_offset
-	eor	diff, data2, data1	/* Non-zero if differences found.  */
-	orr	syndrome, diff, has_nul
-	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
-	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
-	cbnz	tmp3, L(syndrome_check)
-
-	ldr	data1, [src1], #8
-	sub	limit, limit, #8
-	b	L(loop_misaligned)
-
-#ifdef	__AARCH64EB__
-L(syndrome_check):
-	clz	pos, syndrome
-	cmp	pos, limit, lsl #3
-	b.lo	L(end_quick)
-#endif
-
-L(ret0):
-	mov	result, #0
-	ret
-END(__strncmp_aarch64_mte)
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/strncmp.S b/contrib/arm-optimized-routines/string/aarch64/strncmp.S
index 738b6539cab6..493a0f06ed1d 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strncmp.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strncmp.S
@@ -1,20 +1,20 @@
 /*
  * strncmp - compare two strings
  *
- * Copyright (c) 2013-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2013-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64.
+ * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
 
 /* Parameters and result.  */
 #define src1		x0
@@ -35,15 +35,26 @@
 #define tmp3		x10
 #define zeroones	x11
 #define pos		x12
-#define limit_wd	x13
-#define mask		x14
-#define endloop		x15
+#define mask		x13
+#define endloop		x14
 #define count		mask
+#define offset		pos
+#define neg_offset	x15
+
+/* Define endian dependent shift operations.
+   On big-endian early bytes are at MSB and on little-endian LSB.
+   LS_FW means shifting towards early bytes.
+   LS_BK means shifting towards later bytes.
+   */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
 
 ENTRY (__strncmp_aarch64)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
 	cbz	limit, L(ret0)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
@@ -51,9 +62,6 @@ ENTRY (__strncmp_aarch64)
 	and	count, src1, #7
 	b.ne	L(misaligned8)
 	cbnz	count, L(mutual_align)
-	/* Calculate the number of full and partial words -1.  */
-	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
-	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */
 
 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
@@ -63,56 +71,52 @@ L(loop_aligned):
 	ldr	data1, [src1], #8
 	ldr	data2, [src2], #8
 L(start_realigned):
-	subs	limit_wd, limit_wd, #1
+	subs	limit, limit, #8
 	sub	tmp1, data1, zeroones
 	orr	tmp2, data1, #REP8_7f
 	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, pl	/* Last Dword or differences.  */
+	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
 	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
 	ccmp	endloop, #0, #0, eq
 	b.eq	L(loop_aligned)
 	/* End of main loop */
 
-	/* Not reached the limit, must have found the end or a diff.  */
-	tbz	limit_wd, #63, L(not_limit)
-
-	/* Limit % 8 == 0 => all bytes significant.  */
-	ands	limit, limit, #7
-	b.eq	L(not_limit)
-
-	lsl	limit, limit, #3	/* Bits -> bytes.  */
-	mov	mask, #~0
-#ifdef __AARCH64EB__
-	lsr	mask, mask, limit
-#else
-	lsl	mask, mask, limit
-#endif
-	bic	data1, data1, mask
-	bic	data2, data2, mask
-
-	/* Make sure that the NUL byte is marked in the syndrome.  */
-	orr	has_nul, has_nul, mask
-
-L(not_limit):
+L(full_check):
+#ifndef __AARCH64EB__
 	orr	syndrome, diff, has_nul
-
-#ifndef	__AARCH64EB__
+	add	limit, limit, 8	/* Rewind limit to before last subs. */
+L(syndrome_check):
+	/* Limit was reached. Check if the NUL byte or the difference
+	   is before the limit. */
 	rev	syndrome, syndrome
 	rev	data1, data1
-	/* The MS-non-zero bit of the syndrome marks either the first bit
-	   that is different, or the top bit of the first zero byte.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
 	clz	pos, syndrome
 	rev	data2, data2
 	lsl	data1, data1, pos
+	cmp	limit, pos, lsr #3
 	lsl	data2, data2, pos
 	/* But we need to zero-extend (char is unsigned) the value and then
 	   perform a signed 32-bit subtraction.  */
 	lsr	data1, data1, #56
 	sub	result, data1, data2, lsr #56
+	csel result, result, xzr, hi
 	ret
 #else
+	/* Not reached the limit, must have found the end or a diff.  */
+	tbz	limit, #63, L(not_limit)
+	add	tmp1, limit, 8
+	cbz	limit, L(not_limit)
+
+	lsl	limit, tmp1, #3	/* Bits -> bytes.  */
+	mov	mask, #~0
+	lsr	mask, mask, limit
+	bic	data1, data1, mask
+	bic	data2, data2, mask
+
+	/* Make sure that the NUL byte is marked in the syndrome.  */
+	orr	has_nul, has_nul, mask
+
+L(not_limit):
 	/* For big-endian we cannot use the trick with the syndrome value
 	   as carry-propagation can corrupt the upper bits if the trailing
 	   bytes in the string contain 0x01.  */
@@ -133,10 +137,11 @@ L(not_limit):
 	rev	has_nul, has_nul
 	orr	syndrome, diff, has_nul
 	clz	pos, syndrome
-	/* The MS-non-zero bit of the syndrome marks either the first bit
-	   that is different, or the top bit of the first zero byte.
+	/* The most-significant-non-zero bit of the syndrome marks either the
+	   first bit that is different, or the top bit of the first zero byte.
 	   Shifting left now will bring the critical information into the
 	   top bits.  */
+L(end_quick):
 	lsl	data1, data1, pos
 	lsl	data2, data2, pos
 	/* But we need to zero-extend (char is unsigned) the value and then
@@ -158,22 +163,12 @@ L(mutual_align):
 	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
 	ldr	data2, [src2], #8
 	mov	tmp2, #~0
-	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
-#ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
-#else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
-#endif
-	and	tmp3, limit_wd, #7
-	lsr	limit_wd, limit_wd, #3
-	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
-	add	limit, limit, count
-	add	tmp3, tmp3, count
+	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
+	/* Adjust the limit and ensure it doesn't overflow.  */
+	adds	limit, limit, count
+	csinv	limit, limit, xzr, lo
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
-	add	limit_wd, limit_wd, tmp3, lsr #3
 	b	L(start_realigned)
 
 	.p2align 4
@@ -196,13 +191,11 @@ L(done):
 	/* Align the SRC1 to a dword by doing a bytewise compare and then do
 	   the dword loop.  */
 L(try_misaligned_words):
-	lsr	limit_wd, limit, #3
-	cbz	count, L(do_misaligned)
+	cbz	count, L(src1_aligned)
 
 	neg	count, count
 	and	count, count, #7
 	sub	limit, limit, count
-	lsr	limit_wd, limit, #3
 
 L(page_end_loop):
 	ldrb	data1w, [src1], #1
@@ -213,48 +206,100 @@ L(page_end_loop):
 	subs	count, count, #1
 	b.hi	L(page_end_loop)
 
-L(do_misaligned):
-	/* Prepare ourselves for the next page crossing.  Unlike the aligned
-	   loop, we fetch 1 less dword because we risk crossing bounds on
-	   SRC2.  */
-	mov	count, #8
-	subs	limit_wd, limit_wd, #1
-	b.lo	L(done_loop)
-L(loop_misaligned):
-	and	tmp2, src2, #0xff8
-	eor	tmp2, tmp2, #0xff8
-	cbz	tmp2, L(page_end_loop)
+	/* The following diagram explains the comparison of misaligned strings.
+	   The bytes are shown in natural order. For little-endian, it is
+	   reversed in the registers. The "x" bytes are before the string.
+	   The "|" separates data that is loaded at one time.
+	   src1     | a a a a a a a a | b b b c c c c c | . . .
+	   src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
+
+	   After shifting in each step, the data looks like this:
+	                STEP_A              STEP_B              STEP_C
+	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
+	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
 
+	   The bytes with "0" are eliminated from the syndrome via mask.
+
+	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+	   time from SRC2. The comparison happens in 3 steps. After each step
+	   the loop can exit, or read from SRC1 or SRC2. */
+L(src1_aligned):
+	/* Calculate offset from 8 byte alignment to string start in bits. No
+	   need to mask offset since shifts are ignoring upper bits. */
+	lsl	offset, src2, #3
+	bic	src2, src2, #0xf
+	mov	mask, -1
+	neg	neg_offset, offset
 	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
-	ccmp	diff, #0, #0, eq
-	b.ne	L(not_limit)
-	subs	limit_wd, limit_wd, #1
-	b.pl	L(loop_misaligned)
+	ldp	tmp1, tmp2, [src2], #16
+	LS_BK	mask, mask, neg_offset
+	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
+	/* Skip the first compare if data in tmp1 is irrelevant. */
+	tbnz	offset, 6, L(misaligned_mid_loop)
 
-L(done_loop):
-	/* We found a difference or a NULL before the limit was reached.  */
-	and	limit, limit, #7
-	cbz	limit, L(not_limit)
-	/* Read the last word.  */
-	sub	src1, src1, 8
-	sub	src2, src2, 8
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
+L(loop_misaligned):
+	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+	LS_FW	data2, tmp1, offset
+	LS_BK	tmp1, tmp2, neg_offset
+	subs	limit, limit, #8
+	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/
+	sub	has_nul, data1, zeroones
 	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
-	ccmp	diff, #0, #0, eq
-	b.ne	L(not_limit)
+	orr	tmp3, data1, #REP8_7f
+	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
+	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
+	orr	tmp3, endloop, has_nul
+	cbnz	tmp3, L(full_check)
+
+	ldr	data1, [src1], #8
+L(misaligned_mid_loop):
+	/* STEP_B: Compare first part of data1 to second part of tmp2. */
+	LS_FW	data2, tmp2, offset
+#ifdef __AARCH64EB__
+	/* For big-endian we do a byte reverse to avoid carry-propagation
+	problem described above. This way we can reuse the has_nul in the
+	next step and also use syndrome value trick at the end. */
+	rev	tmp3, data1
+	#define data1_fixed tmp3
+#else
+	#define data1_fixed data1
+#endif
+	sub	has_nul, data1_fixed, zeroones
+	orr	tmp3, data1_fixed, #REP8_7f
+	eor	diff, data2, data1	/* Non-zero if differences found.  */
+	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
+#ifdef __AARCH64EB__
+	rev	has_nul, has_nul
+#endif
+	cmp	limit, neg_offset, lsr #3
+	orr	syndrome, diff, has_nul
+	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
+	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
+	cbnz	tmp3, L(syndrome_check)
+
+	/* STEP_C: Compare second part of data1 to first part of tmp1. */
+	ldp	tmp1, tmp2, [src2], #16
+	cmp	limit, #8
+	LS_BK	data2, tmp1, neg_offset
+	eor	diff, data2, data1	/* Non-zero if differences found.  */
+	orr	syndrome, diff, has_nul
+	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
+	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
+	cbnz	tmp3, L(syndrome_check)
+
+	ldr	data1, [src1], #8
+	sub	limit, limit, #8
+	b	L(loop_misaligned)
+
+#ifdef	__AARCH64EB__
+L(syndrome_check):
+	clz	pos, syndrome
+	cmp	pos, limit, lsl #3
+	b.lo	L(end_quick)
+#endif
 
 L(ret0):
 	mov	result, #0
 	ret
-
-END ( __strncmp_aarch64)
+END(__strncmp_aarch64)
 
diff --git a/contrib/arm-optimized-routines/string/aarch64/strnlen.S b/contrib/arm-optimized-routines/string/aarch64/strnlen.S
index 48d2495d2082..6a96ec268f1a 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strnlen.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strnlen.S
@@ -1,8 +1,8 @@
 /*
  * strnlen - calculate the length of a string with limit.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin		x0
 #define cntin		x1
@@ -20,39 +20,28 @@
 #define src		x2
 #define synd		x3
 #define	shift		x4
-#define wtmp		w4
 #define tmp		x4
 #define cntrem		x5
 
 #define qdata		q0
 #define vdata		v0
 #define vhas_chr	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /*
    Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+   four bits per byte using the shrn instruction. A count trailing zeros then
+   identifies the first zero byte.  */
 
 ENTRY (__strnlen_aarch64)
-	PTR_ARG (0)
-	SIZE_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	cbz	cntin, L(nomatch)
-	ld1	{vdata.16b}, [src], 16
-	dup	vrepmask.8h, wtmp
+	ld1	{vdata.16b}, [src]
 	cmeq	vhas_chr.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -64,37 +53,40 @@ L(finish):
 	csel	result, cntin, result, ls
 	ret
 
+L(nomatch):
+	mov	result, cntin
+	ret
+
 L(start_loop):
 	sub	tmp, src, srcin
+	add	tmp, tmp, 17
 	subs	cntrem, cntin, tmp
-	b.ls	L(nomatch)
+	b.lo	L(nomatch)
 
 	/* Make sure that it won't overread by a 16-byte chunk */
-	add	tmp, cntrem, 15
-	tbnz	tmp, 4, L(loop32_2)
-
+	tbz	cntrem, 4, L(loop32_2)
+	sub	src, src, 16
 	.p2align 5
 L(loop32):
-	ldr	qdata, [src], 16
+	ldr	qdata, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, 0
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
 	fmov	synd, dend
 	cbnz	synd, L(end)
 L(loop32_2):
-	ldr	qdata, [src], 16
+	ldr	qdata, [src, 16]
 	subs	cntrem, cntrem, 32
 	cmeq	vhas_chr.16b, vdata.16b, 0
-	b.ls	L(end)
+	b.lo	L(end_2)
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(loop32)
-
+L(end_2):
+	add	src, src, 16
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
-	sub	src, src, 16
-	mov	synd, vend.d[0]
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	sub	result, src, srcin
+	fmov	synd, dend
 #ifndef __AARCH64EB__
 	rbit	synd, synd
 #endif
@@ -104,9 +96,5 @@ L(end):
 	csel	result, cntin, result, ls
 	ret
 
-L(nomatch):
-	mov	result, cntin
-	ret
-
 END (__strnlen_aarch64)
 
diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S
index 1e4fb1a68f7e..8668ce6d2916 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S
@@ -1,8 +1,8 @@
 /*
  * strrchr - find last position of a character in a string.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin		x0
 #define chrin		w1
@@ -19,7 +19,6 @@
 
 #define src		x2
 #define tmp		x3
-#define wtmp		w3
 #define synd		x3
 #define shift		x4
 #define src_match	x4
@@ -31,7 +30,6 @@
 #define vhas_nul	v2
 #define vhas_chr	v3
 #define vrepmask	v4
-#define vrepmask2	v5
 #define vend		v5
 #define dend		d5
 
@@ -44,58 +42,69 @@
    if the relevant byte matched the NUL end of string.  */
 
 ENTRY (__strrchr_aarch64_mte)
-	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0x3003
-	dup	vrepmask.8h, wtmp
-	tst	srcin, 15
-	beq	L(loop1)
-
-	ld1	{vdata.16b}, [src], 16
+	movi	vrepmask.16b, 0x33
+	ld1	{vdata.16b}, [src]
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	mov	wtmp, 0xf00f
-	dup	vrepmask2.8h, wtmp
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4
 	lsl	shift, srcin, 2
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	lsl	synd, synd, shift
 	ands	nul_match, synd, 0xcccccccccccccccc
 	bne	L(tail)
-	cbnz	synd, L(loop2)
+	cbnz	synd, L(loop2_start)
 
-	.p2align 5
+	.p2align 4
 L(loop1):
-	ld1	{vdata.16b}, [src], 16
+	ldr	q1, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbnz	synd, L(loop1_end)
+	ldr	q1, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	synd, dend
 	cbz	synd, L(loop1)
-
+	sub	src, src, 16
+L(loop1_end):
+	add	src, src, 16
 	cmeq	vhas_nul.16b, vdata.16b, 0
+#ifdef __AARCH64EB__
+	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	shrn	vend.8b, vhas_nul.8h, 4
+	fmov	synd, dend
+	rbit	synd, synd
+#else
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	bic	vhas_nul.8h, 0x0f, lsl 8
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4
 	fmov	synd, dend
+#endif
 	ands	nul_match, synd, 0xcccccccccccccccc
-	beq	L(loop2)
-
+	beq	L(loop2_start)
 L(tail):
 	sub	nul_match, nul_match, 1
 	and	chr_match, synd, 0x3333333333333333
 	ands	chr_match, chr_match, nul_match
-	sub	result, src, 1
+	add	result, src, 15
 	clz	tmp, chr_match
 	sub	result, result, tmp, lsr 2
 	csel	result, result, xzr, ne
 	ret
 
 	.p2align 4
+	nop
+	nop
+L(loop2_start):
+	add	src, src, 16
+	bic	vrepmask.8h, 0xf0
+
 L(loop2):
 	cmp	synd, 0
 	csel	src_match, src, src_match, ne
diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr.S b/contrib/arm-optimized-routines/string/aarch64/strrchr.S
index 56185ff534e3..f5713f4260fb 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strrchr.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strrchr.S
@@ -1,8 +1,8 @@
 /*
  * strrchr - find last position of a character in a string.
  *
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * Neon Available.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 /* Arguments and results.  */
 #define srcin		x0
@@ -55,7 +55,6 @@
    identify exactly which byte is causing the termination, and why.  */
 
 ENTRY (__strrchr_aarch64)
-	PTR_ARG (0)
 	/* Magic constant 0x40100401 to allow us to identify which lane
 	   matches the requested byte.  Magic constant 0x80200802 used
 	   similarly for NUL termination.  */