Diffstat (limited to 'contrib/arm-optimized-routines/string/aarch64/strlen-mte.S')
-rw-r--r--   contrib/arm-optimized-routines/string/aarch64/strlen-mte.S   67
1 file changed, 38 insertions(+), 29 deletions(-)
diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S b/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S
index 7cf41d5c1eac..afa72eed9a43 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S
@@ -1,8 +1,8 @@
 /*
  * strlen - calculate the length of a string.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin       x0
 #define result      x0
@@ -19,62 +19,71 @@
 #define src         x1
 #define synd        x2
 #define tmp         x3
-#define wtmp        w3
 #define shift       x4
 
 #define data        q0
 #define vdata       v0
 #define vhas_nul    v1
-#define vrepmask    v2
-#define vend        v3
-#define dend        d3
+#define vend        v2
+#define dend        d2
 
 /* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+   four bits per byte using the shrn instruction. A count trailing zeros then
+   identifies the first zero byte.  */
 
 ENTRY (__strlen_aarch64_mte)
-        PTR_ARG (0)
         bic     src, srcin, 15
-        mov     wtmp, 0xf00f
         ld1     {vdata.16b}, [src]
-        dup     vrepmask.8h, wtmp
         cmeq    vhas_nul.16b, vdata.16b, 0
         lsl     shift, srcin, 2
-        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-        addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
+        shrn    vend.8b, vhas_nul.8h, 4                 /* 128->64 */
         fmov    synd, dend
         lsr     synd, synd, shift
-        cbz     synd, L(loop)
+        cbz     synd, L(next16)
 
         rbit    synd, synd
         clz     result, synd
         lsr     result, result, 2
         ret
 
-        .p2align 5
-L(loop):
-        ldr     data, [src, 16]!
+L(next16):
+        ldr     data, [src, 16]
         cmeq    vhas_nul.16b, vdata.16b, 0
-        umaxp   vend.16b, vhas_nul.16b, vhas_nul.16b
+        shrn    vend.8b, vhas_nul.8h, 4                 /* 128->64 */
         fmov    synd, dend
         cbz     synd, L(loop)
-
-        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-        addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
+        add     src, src, 16
+#ifndef __AARCH64EB__
+        rbit    synd, synd
+#endif
         sub     result, src, srcin
+        clz     tmp, synd
+        add     result, result, tmp, lsr 2
+        ret
+
+        .p2align 5
+L(loop):
+        ldr     data, [src, 32]!
+        cmeq    vhas_nul.16b, vdata.16b, 0
+        addhn   vend.8b, vhas_nul.8h, vhas_nul.8h
+        fmov    synd, dend
+        cbnz    synd, L(loop_end)
+        ldr     data, [src, 16]
+        cmeq    vhas_nul.16b, vdata.16b, 0
+        addhn   vend.8b, vhas_nul.8h, vhas_nul.8h
         fmov    synd, dend
+        cbz     synd, L(loop)
+        add     src, src, 16
+L(loop_end):
+        sub     result, shift, src, lsl 2       /* (srcin - src) << 2.  */
 #ifndef __AARCH64EB__
         rbit    synd, synd
+        sub     result, result, 3
 #endif
         clz     tmp, synd
-        add     result, result, tmp, lsr 2
+        sub     result, tmp, result
+        lsr     result, result, 2
         ret
 
 END (__strlen_aarch64_mte)
-
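
Note on the new first-chunk sequence: the diff drops the repeating-mask setup (mov wtmp, 0xf00f / dup vrepmask.8h, wtmp) and the per-chunk and/addp pair, and instead builds the syndrome with a single shrn, which shifts each 16-bit lane of the compare result right by four and narrows it to eight bits, yielding four syndrome bits per input byte in string order. The C sketch below is an illustrative rendering of that trick with NEON intrinsics, not the shipped routine: the function name strlen_shrn_sketch is invented for this note, a little-endian target is assumed (the assembly handles big-endian via its __AARCH64EB__ paths), and the 16-byte alignment of src plus the syndrome shift by (srcin & 15) * 4, which keep the assembly's loads inside granules the string actually touches, are omitted.

/*
 * Illustrative sketch only (assumed names, little-endian, no alignment
 * handling) of the shrn-based syndrome used by the entry and L(next16)
 * paths in the diff above.
 */
#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

static size_t strlen_shrn_sketch(const char *s)
{
        const uint8_t *p = (const uint8_t *)s;
        size_t len = 0;

        for (;;) {
                /* Compare 16 bytes against NUL: 0xff where a byte is zero.
                   Unlike the assembly, this load is not aligned down to 16,
                   so it is not a drop-in MTE-safe implementation.  */
                uint8x16_t chunk = vld1q_u8(p + len);
                uint8x16_t is_nul = vceqq_u8(chunk, vdupq_n_u8(0));

                /* shrn #4 on the eight halfword lanes narrows the 128-bit
                   compare result to a 64-bit syndrome with four bits per
                   input byte, preserving byte order.  */
                uint8x8_t narrow =
                        vshrn_n_u16(vreinterpretq_u16_u8(is_nul), 4);
                uint64_t synd = vget_lane_u64(vreinterpret_u64_u8(narrow), 0);

                if (synd != 0)
                        /* Little-endian: trailing zeros / 4 gives the index
                           of the first NUL byte within this chunk.  */
                        return len + (__builtin_ctzll(synd) >> 2);

                len += 16;
        }
}

The unrolled main loop in the diff narrows with addhn instead and recovers the final offset at L(loop_end); the sketch covers only the shrn-based syndrome shared by the entry and L(next16) paths.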