diff options
Diffstat (limited to 'lib/libc/amd64/string/memrchr.S')
-rw-r--r-- | lib/libc/amd64/string/memrchr.S | 158 |
1 files changed, 158 insertions, 0 deletions
diff --git a/lib/libc/amd64/string/memrchr.S b/lib/libc/amd64/string/memrchr.S
new file mode 100644
index 000000000000..80fb306af2a3
--- /dev/null
+++ b/lib/libc/amd64/string/memrchr.S
@@ -0,0 +1,158 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023, 2025 Robert Clausecker <fuz@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define	ALIGN_TEXT	.p2align 4, 0x90	/* pad to 16 bytes with 0x90 (nop) */
+
+ARCHFUNCS(memrchr)			/* variants defined below; macros come from amd64_archlevel.h */
+	ARCHFUNC(memrchr, scalar)
+	ARCHFUNC(memrchr, baseline)
+ENDARCHFUNCS(memrchr)
+
+ARCHENTRY(memrchr, scalar)		/* in: rdi = buffer, sil = byte, rdx = length; out: rax = last match or 0 */
+	lea	-1(%rdi, %rdx, 1), %rax	# point to last char in buffer
+	sub	$4, %rdx		# 4 bytes left to process?
+	jb	.Ltail			# no: rdx is now (bytes left - 4), i.e. -4..-1
+
+	ALIGN_TEXT
+0:	cmp	%sil, (%rax)		# match at last entry?
+	je	1f
+
+	cmp	%sil, -1(%rax)		# match at second to last entry?
+	je	2f
+
+	cmp	%sil, -2(%rax)		# match at third to last entry?
+	je	3f
+
+	cmp	%sil, -3(%rax)		# match at fourth to last entry?
+	je	4f
+
+	sub	$4, %rax		# step back over the four bytes just checked
+	sub	$4, %rdx		# another 4 bytes left to process?
+	jae	0b
+
+.Ltail:	cmp	$-3, %edx		# at least one character left to process?
+	jb	.Lnotfound		# edx == -4: nothing left (unsigned compare)
+
+	cmp	%sil, (%rax)
+	je	1f
+
+	cmp	$-2, %edx		# at least two characters left to process?
+	jb	.Lnotfound
+
+	cmp	%sil, -1(%rax)
+	je	2f
+
+	cmp	$-1, %edx		# at least three characters left to process?
+	jb	.Lnotfound
+
+	cmp	%sil, -2(%rax)
+	je	3f
+
+.Lnotfound:
+	xor	%eax, %eax		# return null pointer
+	ret
+
+	/* match found -- adjust rax to point to matching byte */
+4:	dec	%rax			# entry points fall through: label N backs up N-1 bytes
+3:	dec	%rax
+2:	dec	%rax
+1:	ret
+ARCHEND(memrchr, scalar)
+
+ARCHENTRY(memrchr, baseline)		/* in: rdi = buffer, sil = byte, rdx = length; out: rax = last match or 0 */
+	test		%rdx, %rdx		# empty input?
+	je		.Lnomatchb		# yes: return null without touching memory
+
+	/* work on the aligned 32-byte chunk holding the last byte; masks discard bytes outside the buffer */
+	lea		(%rdi, %rdx, 1), %ecx	# pointer to end of buffer (low bits, used as shift count)
+	lea		-1(%rdi, %rdx, 1), %rdx	# pointer to last char in buffer
+	movd		%esi, %xmm2		# c into low dword of xmm2
+	and		$~0x1f, %rdx		# pointer to final 32 buffer bytes (rounded down to 32)
+	movdqa		(%rdx), %xmm0		# load last 32 bytes
+	movdqa		16(%rdx), %xmm1		# upper 16 bytes of the same aligned chunk
+
+	punpcklbw	%xmm2, %xmm2		# c -> cc
+
+	mov		$-1, %r8d		# all-ones mask, one bit per chunk byte
+	neg		%ecx			# -end; shr uses only the low 5 bits of cl
+	mov		%r8d, %r9d		# second all-ones mask for the head
+	shr		%cl, %r8d		# mask with zeroes after the string
+
+	punpcklwd	%xmm2, %xmm2		# cc -> cccc
+
+	mov		%edi, %ecx		# buffer start; shl uses only the low 5 bits of cl
+	mov		%r9d, %eax		# default: no head masking (all ones)
+	shl		%cl, %r9d		# mask with zeroes before the string
+
+	pshufd		$0, %xmm2, %xmm2	# cccc -> cccccccccccccccc
+
+	cmp		%rdx, %rdi		# tail is beginning of buffer?
+	cmovae		%r9d, %eax		# if yes, do combined head/tail processing
+	and		%r8d, %eax		# mask of bytes in tail part of string
+
+	/* process tail */
+	pcmpeqb		%xmm2, %xmm1
+	pcmpeqb		%xmm2, %xmm0
+	pmovmskb	%xmm1, %esi		# match bits for chunk bytes 16..31
+	pmovmskb	%xmm0, %ecx		# match bits for chunk bytes 0..15
+	shl		$16, %esi
+	or		%esi, %ecx		# locations of matches
+	and		%ecx, %eax		# any match inside buffer?
+	jnz		.Lprecisematchb
+
+	cmp		%rdx, %rdi		# did the buffer begin here?
+	jae		.Lnomatchb		# if yes, we are done
+
+	/* main loop */
+	ALIGN_TEXT
+0:	movdqa		-32(%rdx), %xmm0	# load previous string chunk
+	movdqa		-16(%rdx), %xmm1
+	sub		$32, %rdx		# beginning of string reached?
+	cmp		%rdx, %rdi
+	jae		.Ltailb			# yes: this chunk holds the buffer start
+
+	pcmpeqb		%xmm2, %xmm0
+	pcmpeqb		%xmm2, %xmm1
+	por		%xmm1, %xmm0		# match in either half?
+	pmovmskb	%xmm0, %eax
+	test		%eax, %eax
+	jz		0b
+
+.Lmatchb:
+	pcmpeqb		(%rdx), %xmm2		# redo comparison of first 16 bytes (por overwrote xmm0)
+	pmovmskb	%xmm1, %ecx		# xmm1 still holds the upper-half comparison
+	pmovmskb	%xmm2, %eax
+	shl		$16, %ecx
+	or		%ecx, %eax		# location of matches
+
+.Lprecisematchb:
+	bsr		%eax, %eax		# find location of match (highest set bit = last match)
+	add		%rdx, %rax		# point to matching byte
+	ret
+
+.Ltailb:
+	pcmpeqb		%xmm2, %xmm1
+	pcmpeqb		%xmm2, %xmm0
+	pmovmskb	%xmm1, %ecx
+	pmovmskb	%xmm0, %eax
+	shl		$16, %ecx
+	or		%ecx, %eax		# location of matches
+	and		%r9d, %eax		# mask out matches before buffer
+	bsr		%eax, %edi		# location of match; ZF set if eax is zero
+	lea		(%rdx, %rdi, 1), %rdx	# pointer to match (if any); lea leaves ZF intact
+	cmovnz		%rdx, %rax		# point to match if present,
+	ret					# else null pointer (rax is already zero)
+
+.Lnomatchb:
+	xor		%eax, %eax		# return null pointer
+	ret
+ARCHEND(memrchr, baseline)
+
+	.section .note.GNU-stack, "", %progbits |