Diffstat (limited to 'contrib/arm-optimized-routines/string/aarch64/strlen-mte.S')
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strlen-mte.S  |  67
1 file changed, 38 insertions, 29 deletions
diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S b/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S
index 7cf41d5c1eac..afa72eed9a43 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S
@@ -1,8 +1,8 @@
/*
* strlen - calculate the length of a string.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define result x0
@@ -19,62 +19,71 @@
#define src x1
#define synd x2
#define tmp x3
-#define wtmp w3
#define shift x4
#define data q0
#define vdata v0
#define vhas_nul v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
+#define vend v2
+#define dend d2
/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+ four bits per byte using the shrn instruction. A count trailing zeros then
+ identifies the first zero byte. */
ENTRY (__strlen_aarch64_mte)
- PTR_ARG (0)
bic src, srcin, 15
- mov wtmp, 0xf00f
ld1 {vdata.16b}, [src]
- dup vrepmask.8h, wtmp
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
- cbz synd, L(loop)
+ cbz synd, L(next16)
rbit synd, synd
clz result, synd
lsr result, result, 2
ret
- .p2align 5
-L(loop):
- ldr data, [src, 16]!
+L(next16):
+ ldr data, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
cbz synd, L(loop)
-
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ add src, src, 16
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
sub result, src, srcin
+ clz tmp, synd
+ add result, result, tmp, lsr 2
+ ret
+
+ .p2align 5
+L(loop):
+ ldr data, [src, 32]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
+ fmov synd, dend
+ cbnz synd, L(loop_end)
+ ldr data, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
+ cbz synd, L(loop)
+ add src, src, 16
+L(loop_end):
+ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */
#ifndef __AARCH64EB__
rbit synd, synd
+ sub result, result, 3
#endif
clz tmp, synd
- add result, result, tmp, lsr 2
+ sub result, tmp, result
+ lsr result, result, 2
ret
END (__strlen_aarch64_mte)
-
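
Note on the new "Core algorithm" comment: below is a rough C sketch of the shrn-based syndrome technique it describes. It is not part of the patch; the names strlen_shrn_sketch and nul_syndrome are illustrative, it assumes little-endian (matching the rbit+clz path), and the aligned over-read of the first 16-byte chunk is only safe in assembly, where an aligned 16-byte load cannot cross a page or MTE tag granule, not in portable C.

    #include <arm_neon.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Fold the 128-bit NUL-compare mask into a 64-bit syndrome with 4 bits
       per input byte, in input order, using the same shrn idea.  */
    static inline uint64_t
    nul_syndrome (uint8x16_t chunk)
    {
      uint8x16_t cmp = vceqq_u8 (chunk, vdupq_n_u8 (0));  /* 0xff where byte==0 */
      uint8x8_t narrowed = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
      return vget_lane_u64 (vreinterpret_u64_u8 (narrowed), 0);
    }

    size_t
    strlen_shrn_sketch (const char *s)
    {
      /* Align down to 16 bytes, then shift away the syndrome bits that belong
         to bytes before s (4 bits per byte), as "lsl shift, srcin, 2" +
         "lsr synd, synd, shift" do in the assembly.  */
      const uint8_t *p = (const uint8_t *) ((uintptr_t) s & ~(uintptr_t) 15);
      uint64_t synd = nul_syndrome (vld1q_u8 (p)) >> (((uintptr_t) s & 15) * 4);
      if (synd != 0)
        return __builtin_ctzll (synd) >> 2;     /* ctz/4 = index of first NUL */

      for (p += 16; ; p += 16)
        {
          synd = nul_syndrome (vld1q_u8 (p));
          if (synd != 0)
            return (size_t) (p - (const uint8_t *) s)
                   + (__builtin_ctzll (synd) >> 2);
        }
    }

Compared with the old code, shrn produces the 4-bit-per-byte syndrome in one instruction, so the 0xf00f repeat mask (vrepmask) and the and/addp pair are no longer needed; the real assembly also unrolls the main loop to 32 bytes per iteration and uses addhn as a cheaper "is there a NUL anywhere in this chunk" check.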