Diffstat (limited to 'lib/libc/amd64/string/memrchr.S')
-rw-r--r--  lib/libc/amd64/string/memrchr.S  224
1 file changed, 108 insertions(+), 116 deletions(-)
diff --git a/lib/libc/amd64/string/memrchr.S b/lib/libc/amd64/string/memrchr.S
index 4f6c5a238daa..80fb306af2a3 100644
--- a/lib/libc/amd64/string/memrchr.S
+++ b/lib/libc/amd64/string/memrchr.S
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
- * Copyright (c) 2023 Robert Clausecker
+ * Copyright (c) 2023, 2025 Robert Clausecker <fuz@FreeBSD.org>
*/
#include <machine/asm.h>
@@ -16,150 +16,142 @@ ARCHFUNCS(memrchr)
ENDARCHFUNCS(memrchr)
ARCHENTRY(memrchr, scalar)
- xor %eax, %eax # prospective return value
- sub $4, %rdx # 4 bytes left to process?
- jb 1f
+ lea -1(%rdi, %rdx, 1), %rax # point to last char in buffer
+ sub $4, %rdx # 4 bytes left to process?
+ jb .Ltail
ALIGN_TEXT
-0: xor %r8, %r8
- lea 2(%rdi), %r10
- cmp %sil, 2(%rdi)
- cmovne %r8, %r10 # point to null if no match
+0: cmp %sil, (%rax) # match at last entry?
+ je 1f
- cmp %sil, (%rdi)
- cmove %rdi, %r8 # point to first char if match
+ cmp %sil, -1(%rax) # match at second to last entry?
+ je 2f
- lea 1(%rdi), %r9
- cmp %sil, 1(%rdi)
- cmovne %r8, %r9 # point to first result if no match in second
+ cmp %sil, -2(%rax) # match at third to last entry?
+ je 3f
- lea 3(%rdi), %r11
- cmp %sil, 3(%rdi)
- cmovne %r10, %r11
+ cmp %sil, -3(%rax) # match at fourth to last entry?
+ je 4f
- test %r11, %r11
- cmovz %r9, %r11 # take first pair match if none in second
+ sub $4, %rax
+ sub $4, %rdx
+ jae 0b
- test %r11, %r11
- cmovnz %r11, %rax # take match in current set if any
+.Ltail: cmp $-3, %edx # at least one character left to process?
+ jb .Lnotfound
- add $4, %rdi
- sub $4, %rdx
- jae 0b
+ cmp %sil, (%rax)
+ je 1f
-1: cmp $-3, %edx # a least one character left to process?
- jb 2f
+ cmp $-2, %edx # at least two characters left to process?
+ jb .Lnotfound
- cmp %sil, (%rdi)
- cmove %rdi, %rax
+ cmp %sil, -1(%rax)
+ je 2f
- lea 1(%rdi), %rcx
- cmp $-2, %edx # at least two characters left to process?
- jb 2f
+ cmp $-1, %edx # at least three characters left to process?
+ jb .Lnotfound
- cmp %sil, 1(%rdi)
- cmove %rcx, %rax
+ cmp %sil, -2(%rax)
+ je 3f
- lea 2(%rdi), %rcx
- cmp $-1, %edx # at least three character left to process?
- jb 2f
-
- cmp %sil, 2(%rdi)
- cmove %rcx, %rax
+.Lnotfound:
+ xor %eax, %eax
+ ret
-2: ret
+ /* match found -- adjust rax to point to matching byte */
+4: dec %rax
+3: dec %rax
+2: dec %rax
+1: ret
ARCHEND(memrchr, scalar)
ARCHENTRY(memrchr, baseline)
- movd %esi, %xmm4
- test %rdx, %rdx # empty buffer?
- jz .L0 # if yes, return immediately
+ test %rdx, %rdx # empty input?
+ je .Lnomatchb
+
+
+ lea (%rdi, %rdx, 1), %ecx # pointer to end of buffer
+ lea -1(%rdi, %rdx, 1), %rdx # pointer to last char in buffer
+ movd %esi, %xmm2
+ and $~0x1f, %rdx # pointer to final 32 buffer bytes
+ movdqa (%rdx), %xmm0 # load last 32 bytes
+ movdqa 16(%rdx), %xmm1
+
+ punpcklbw %xmm2, %xmm2 # c -> cc
- punpcklbw %xmm4, %xmm4 # c -> cc
- mov %edi, %ecx
- punpcklwd %xmm4, %xmm4 # cc -> cccc
- and $~0xf, %rdi # align source pointer
- pshufd $0, %xmm4, %xmm4 # cccc -> cccccccccccccccc
- and $0xf, %ecx
- movdqa %xmm4, %xmm0
mov $-1, %r8d
- pcmpeqb (%rdi), %xmm0 # compare aligned head
- shl %cl, %r8d # mask of bytes in the head of the buffer
- pmovmskb %xmm0, %eax
+ neg %ecx
+ mov %r8d, %r9d
+ shr %cl, %r8d # mask with zeroes after the string
- sub $16, %rcx
- and %r8d, %eax # match mask
- add %rcx, %rdx # advance past head
- cmc
- jbe .Lrunt # did the string end in the buffer?
+ punpcklwd %xmm2, %xmm2 # cc -> cccc
- mov %rdi, %rsi # pointer to matching chunk
- add $16, %rdi
- sub $16, %rdx # enough left for another round?
- jbe 1f
+ mov %edi, %ecx
+ mov %r9d, %eax
+ shl %cl, %r9d # mask with zeroes before the string
- /* main loop unrolled twice */
- ALIGN_TEXT
-0: movdqa %xmm4, %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %r8d
+ pshufd $0, %xmm2, %xmm2 # cccc -> cccccccccccccccc
- cmp $16, %rdx # enough left for second chunk?
- jbe 2f
+ cmp %rdx, %rdi # tail is beginning of buffer?
+ cmovae %r9d, %eax # if yes, do combined head/tail processing
+ and %r8d, %eax # mask of bytes in tail part of string
- movdqa %xmm4, %xmm0
- pcmpeqb 16(%rdi), %xmm0
+ /* process tail */
+ pcmpeqb %xmm2, %xmm1
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm1, %esi
pmovmskb %xmm0, %ecx
+ shl $16, %esi
+ or %esi, %ecx # locations of matches
+ and %ecx, %eax # any match inside buffer?
+ jnz .Lprecisematchb
- lea 16(%rdi), %r9
- test %ecx, %ecx # match found in second chunk?
- cmovz %r8d, %ecx # if not, use match data from first chunk
- cmovz %rdi, %r9
-
- test %ecx, %ecx # any match found?
- cmovnz %ecx, %eax # if yes, overwrite previously found match
- cmovnz %r9, %rsi
-
- add $32, %rdi # advance to next iteration
- sub $32, %rdx # advance to next chunks
- ja 0b
-
- /* process remaining 1--16 bytes */
-1: pcmpeqb (%rdi), %xmm4
- mov $0xffff, %r8d
- xor %ecx, %ecx
- sub %edx, %ecx # number of bytes to be masked out
- pmovmskb %xmm4, %r9d
- shr %cl, %r8d # mask of bytes to be kept in the buffer
- and %r9d, %r8d
- cmovnz %r8d, %eax
- cmovnz %rdi, %rsi
- bsr %eax, %eax
- lea (%rsi, %rax, 1), %rsi # pointer to match (or junk)
- cmovnz %rsi, %rax # if any match was found, return it
- ret
+ cmp %rdx, %rdi # did the buffer begin here?
+ jae .Lnomatchb # if yes, we are done
- /* end of chunk reached within first half iteration */
-2: test %r8d, %r8d # match in previous chunk?
- cmovnz %r8d, %eax # if yes, overwrite previous chunks
- cmovnz %rdi, %rsi
- add $16, %rdi # point to tail
- sub $16, %edx
- jmp 1b # handle tail the same otherwise
-
- /* runt: string ends within head, edx has negated amount of invalid head bytes */
-.Lrunt: mov $0xffff, %r8d
- xor %ecx, %ecx
- sub %edx, %ecx
- shr %cl, %r8d
- and %r8d, %eax
- bsr %eax, %eax
- lea (%rdi, %rax, 1), %rdi
- cmovnz %rdi, %rax
+ /* main loop */
+ ALIGN_TEXT
+0: movdqa -32(%rdx), %xmm0 # load previous string chunk
+ movdqa -16(%rdx), %xmm1
+ sub $32, %rdx # beginning of string reached?
+ cmp %rdx, %rdi
+ jae .Ltailb
+
+ pcmpeqb %xmm2, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ por %xmm1, %xmm0 # match in either half?
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jz 0b
+
+.Lmatchb:
+ pcmpeqb (%rdx), %xmm2 # redo comparison of first 16 bytes
+ pmovmskb %xmm1, %ecx
+ pmovmskb %xmm2, %eax
+ shl $16, %ecx
+ or %ecx, %eax # location of matches
+
+.Lprecisematchb:
+ bsr %eax, %eax # find location of match
+ add %rdx, %rax # point to matching byte
ret
- /* empty buffer: return a null pointer */
-.L0: xor %eax, %eax
+.Ltailb:
+ pcmpeqb %xmm2, %xmm1
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm1, %ecx
+ pmovmskb %xmm0, %eax
+ shl $16, %ecx
+ or %ecx, %eax # location of matches
+ and %r9d, %eax # mask out matches before buffer
+ bsr %eax, %edi # location of match
+ lea (%rdx, %rdi, 1), %rdx # pointer to match (if any)
+ cmovnz %rdx, %rax # point to match if present,
+ ret # else null pointer
+
+.Lnomatchb:
+ xor %eax, %eax # return null pointer
ret
ARCHEND(memrchr, baseline)
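
For reference, the contract that both the scalar and the baseline (SSE2) paths above implement is that of memrchr(3): return a pointer to the last occurrence of the byte c within the first n bytes of s, or NULL if it does not occur, scanning from the end of the buffer. A minimal C sketch of that semantics (memrchr_ref is a name of my choosing, not part of the commit) might look like this:

    #include <stddef.h>

    /* reference model only; the committed implementation is the assembly above */
    void *
    memrchr_ref(const void *s, int c, size_t n)
    {
            const unsigned char *p = s;
            unsigned char ch = (unsigned char)c;

            while (n-- > 0)
                    if (p[n] == ch)
                            return ((void *)(p + n)); /* cast away const, as the libc prototype requires */

            return (NULL);
    }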
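
The core trick of the baseline path is the familiar pcmpeqb/pmovmskb/bsr pattern: compare 16 bytes at once against the search byte, collapse the comparison result into a 16-bit mask, and take the highest set bit as the offset of the last match; the shl/shr-generated masks kept in %r8d/%r9d are then ANDed in so matches outside the buffer bounds are ignored. A standalone illustration of the per-chunk step using SSE2 intrinsics (my own sketch, assuming a GCC/Clang toolchain for __builtin_clz; last_match_in_chunk is a hypothetical helper, not part of the commit):

    #include <emmintrin.h>

    /*
     * Offset of the last occurrence of c within one aligned 16-byte chunk,
     * or -1 if it does not occur.  Mirrors the pcmpeqb/pmovmskb/bsr
     * sequence in the baseline implementation above.
     */
    static int
    last_match_in_chunk(const void *chunk, unsigned char c)
    {
            __m128i needle = _mm_set1_epi8((char)c);          /* c broadcast to 16 bytes */
            __m128i data = _mm_load_si128((const __m128i *)chunk);
            int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(data, needle));

            if (mask == 0)
                    return (-1);

            return (31 - __builtin_clz((unsigned)mask));      /* index of highest set bit, i.e. bsr */
    }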