about summary refs log tree commit diff
path: root/lib/libc/amd64/string/memrchr.S
diff options
context:
space:
mode:
Diffstat (limited to 'lib/libc/amd64/string/memrchr.S')
-rw-r--r-- lib/libc/amd64/string/memrchr.S 158
1 files changed, 158 insertions, 0 deletions
diff --git a/lib/libc/amd64/string/memrchr.S b/lib/libc/amd64/string/memrchr.S
new file mode 100644
index 000000000000..80fb306af2a3
--- /dev/null
+++ b/lib/libc/amd64/string/memrchr.S
@@ -0,0 +1,158 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023, 2025 Robert Clausecker <fuz@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4, 0x90
+
+ARCHFUNCS(memrchr) /* opens the list of memrchr variants; macros presumably come from amd64_archlevel.h */
+ ARCHFUNC(memrchr, scalar) /* byte-by-byte variant, body at ARCHENTRY(memrchr, scalar) */
+ ARCHFUNC(memrchr, baseline) /* XMM-register variant, body at ARCHENTRY(memrchr, baseline) */
+ENDARCHFUNCS(memrchr) /* NOTE(review): how one variant gets picked is decided inside these macros -- confirm in the header */
+
+ARCHENTRY(memrchr, scalar)
+ /* in: rdi = buffer start, rdx = length in bytes, sil = byte to find */
+ /* out: rax = address of the last matching byte, or 0 if there is none */
+ lea -1(%rdi, %rdx, 1), %rax # rax = address of the final buffer byte
+ sub $4, %rdx # borrow here means fewer than 4 bytes in total
+ jc .Lscalar_rest
+
+ /* unrolled loop: probe rax, rax-1, rax-2, rax-3, then step back by four */
+ ALIGN_TEXT
+.Lscalar_quad:
+ cmp %sil, (%rax)
+ jz .Lscalar_hit0
+ cmp %sil, -1(%rax)
+ jz .Lscalar_hit1
+ cmp %sil, -2(%rax)
+ jz .Lscalar_hit2
+ cmp %sil, -3(%rax)
+ jz .Lscalar_hit3
+
+ sub $4, %rax # move down to the next group of four
+ sub $4, %rdx # no borrow: another full group remains
+ jnc .Lscalar_quad
+
+ /* 0..3 bytes remain; edx holds (remaining - 4), i.e. one of -4..-1 */
+.Lscalar_rest:
+ cmp $-3, %edx # edx == -4: nothing left to look at
+ jb .Lscalar_none
+ cmp %sil, (%rax)
+ jz .Lscalar_hit0
+
+ cmp $-2, %edx # edx == -3: that was the only byte left
+ jb .Lscalar_none
+ cmp %sil, -1(%rax)
+ jz .Lscalar_hit1
+
+ cmp $-1, %edx # edx == -2: only two bytes were left
+ jb .Lscalar_none
+ cmp %sil, -2(%rax)
+ jz .Lscalar_hit2
+
+.Lscalar_none:
+ xor %eax, %eax # no match: return a null pointer
+ ret
+
+ /* a hit at rax-k enters the chain so that k decrements run before ret */
+.Lscalar_hit3: dec %rax
+.Lscalar_hit2: dec %rax
+.Lscalar_hit1: dec %rax
+.Lscalar_hit0: ret
+ARCHEND(memrchr, scalar)
+
+ARCHENTRY(memrchr, baseline) /* XMM variant: rdi = buffer, esi = byte to find, rdx = length; returns rax = last match or 0 */
+ test %rdx, %rdx # empty input?
+ je .Lnomatchb # then there is nothing to load: return null
+
+ /* find the aligned 32-byte chunk holding the last byte; bytes outside the buffer are masked off later */
+ lea (%rdi, %rdx, 1), %ecx # low 32 bits of the end pointer (only used as a shift count below)
+ lea -1(%rdi, %rdx, 1), %rdx # pointer to last char in buffer
+ movd %esi, %xmm2 # byte to find, broadcast to 16 lanes by the unpack/shuffle steps below
+ and $~0x1f, %rdx # round down to 32: base of the chunk holding the last byte
+ movdqa (%rdx), %xmm0 # load last 32 bytes: low half ...
+ movdqa 16(%rdx), %xmm1 # ... and high half
+
+ punpcklbw %xmm2, %xmm2 # c -> cc
+
+ mov $-1, %r8d # all-ones, cut down to the tail mask below
+ neg %ecx # cl = -end; shr uses its low 5 bits: 32 - (end mod 32), or 0 if end is aligned
+ mov %r8d, %r9d # second all-ones copy, becomes the head mask
+ shr %cl, %r8d # mask with zeroes after the string (bit i = chunk byte i)
+
+ punpcklwd %xmm2, %xmm2 # cc -> cccc
+
+ mov %edi, %ecx # cl = buffer start; shl uses its low 5 bits: start mod 32
+ mov %r9d, %eax # eax = all-ones: used as is when the buffer starts in an earlier chunk
+ shl %cl, %r9d # mask with zeroes before the string
+
+ pshufd $0, %xmm2, %xmm2 # cccc -> cccccccccccccccc
+
+ cmp %rdx, %rdi # tail is beginning of buffer?
+ cmovae %r9d, %eax # if yes, do combined head/tail processing
+ and %r8d, %eax # mask of bytes in tail part of string
+
+ /* process tail */
+ pcmpeqb %xmm2, %xmm1 # 0xff in every lane equal to c (high half)
+ pcmpeqb %xmm2, %xmm0 # same for the low half
+ pmovmskb %xmm1, %esi # one bit per byte of the high half
+ pmovmskb %xmm0, %ecx # one bit per byte of the low half
+ shl $16, %esi # high half covers chunk bytes 16..31
+ or %esi, %ecx # locations of matches
+ and %ecx, %eax # any match inside buffer?
+ jnz .Lprecisematchb # yes: eax = in-buffer matches, rdx = chunk base
+
+ cmp %rdx, %rdi # did the buffer begin here?
+ jae .Lnomatchb # if yes, we are done
+
+ /* main loop: step back one aligned 32-byte chunk per iteration */
+ ALIGN_TEXT
+0: movdqa -32(%rdx), %xmm0 # load previous string chunk (holds buffer byte rdx-1, as rdi < rdx here)
+ movdqa -16(%rdx), %xmm1 # high half of that chunk
+ sub $32, %rdx # beginning of string reached?
+ cmp %rdx, %rdi # buffer starts at or after the base of this chunk?
+ jae .Ltailb # then this is the first chunk: finish with the head mask
+
+ pcmpeqb %xmm2, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ por %xmm1, %xmm0 # match in either half?
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jz 0b # no match in this chunk: keep going
+
+.Lmatchb:
+ pcmpeqb (%rdx), %xmm2 # redo comparison of first 16 bytes (por merged both halves into xmm0)
+ pmovmskb %xmm1, %ecx # xmm1 still holds the high-half comparison
+ pmovmskb %xmm2, %eax
+ shl $16, %ecx
+ or %ecx, %eax # location of matches
+
+.Lprecisematchb:
+ bsr %eax, %eax # find location of match: highest set bit = last matching byte in chunk
+ add %rdx, %rax # point to matching byte
+ ret
+
+.Ltailb:
+ pcmpeqb %xmm2, %xmm1
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm1, %ecx
+ pmovmskb %xmm0, %eax
+ shl $16, %ecx
+ or %ecx, %eax # location of matches
+ and %r9d, %eax # mask out matches before buffer
+ bsr %eax, %edi # location of match; ZF is set when eax is zero
+ lea (%rdx, %rdi, 1), %rdx # pointer to match (if any); lea keeps the flags from bsr
+ cmovnz %rdx, %rax # point to match if present,
+ ret # else null pointer (rax is already 0 from the and)
+
+.Lnomatchb:
+ xor %eax, %eax # return null pointer
+ ret
+ARCHEND(memrchr, baseline)
+
+ .section .note.GNU-stack, "", %progbits # empty marker section: this object does not need an executable stack