author     Robert Clausecker <fuz@FreeBSD.org>  2023-08-24 16:43:40 +0000
committer  Robert Clausecker <fuz@FreeBSD.org>  2023-09-08 21:22:20 +0000
commit     de12a689fad271f5a2ba7c188b0b5fb5cabf48e7
tree       0123b8dc6bb2af86316a0299286ccd794ece028d
parent     a559ef1ac85947df1bff82b867bbfc07a8085092
lib/libc/amd64/string: add memchr(3) scalar, baseline implementation
This is conceptually similar to strchr(3), but there are slight changes
to account for the buffer having an explicit length.

Sponsored by:           The FreeBSD Foundation
Approved by:            mjg
MFC after:              1 week
MFC to:                 stable/14
Differential Revision:  https://reviews.freebsd.org/D41598
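For readers unfamiliar with the technique: the scalar variant below is built
around the classic SWAR (SIMD within a register) zero-byte test. Each 8-byte
word is XORed with the search character replicated into every byte, and the
expression (x - 0x01..01) & ~x & 0x80..80 then yields 0x80 in exactly those
bytes of x that are zero, i.e. the bytes that matched. The C sketch below is
illustrative only, not the committed code: the function name is made up, and
the head/tail masking the assembly applies to the first and last partial
words is replaced by plain byte loops.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static const void *
memchr_swar_sketch(const void *buf, int c, size_t len)
{
	const unsigned char *p = buf;
	const uint64_t ones = 0x0101010101010101ULL;
	const uint64_t highs = 0x8080808080808080ULL;
	uint64_t cc = (unsigned char)c * ones;	/* c in every byte */

	while (len >= 8) {
		uint64_t x, hit;

		memcpy(&x, p, 8);		/* unaligned 8-byte load */
		x ^= cc;			/* 0x00 where p[i] == c */
		hit = (x - ones) & ~x & highs;	/* 0x80 at zero bytes */
		if (hit != 0)			/* lowest set bit = first match */
			return (p + __builtin_ctzll(hit) / 8);
		p += 8;
		len -= 8;
	}
	for (; len > 0; p++, len--)		/* leftover tail bytes */
		if (*p == (unsigned char)c)
			return (p);
	return (NULL);
}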
-rw-r--r--  lib/libc/amd64/string/Makefile.inc    1
-rw-r--r--  lib/libc/amd64/string/memchr.S      204
2 files changed, 205 insertions(+), 0 deletions(-)
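The baseline variant relies on SSE2, which every amd64 CPU provides: pcmpeqb
compares 16 bytes at once against the replicated character and pmovmskb
condenses the comparison into a bit mask whose trailing-zero count locates
the first match. A rough C rendition using intrinsics follows; the name is
made up, the committed loop is additionally unrolled to 32 bytes per
iteration, and the assembly's aligned head/tail masking is simplified to
byte loops here.

#include <emmintrin.h>		/* SSE2 intrinsics */
#include <stddef.h>
#include <stdint.h>

static const void *
memchr_sse2_sketch(const void *buf, int c, size_t len)
{
	const unsigned char *p = buf;
	__m128i cc = _mm_set1_epi8((char)c);	/* c in all 16 lanes */

	/* byte loop until p is 16-byte aligned */
	while (len > 0 && ((uintptr_t)p & 15) != 0) {
		if (*p == (unsigned char)c)
			return (p);
		p++;
		len--;
	}
	while (len >= 16) {
		__m128i chunk = _mm_load_si128((const __m128i *)p);
		int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, cc));

		if (mask != 0)			/* bit i set: p[i] == c */
			return (p + __builtin_ctz(mask));
		p += 16;
		len -= 16;
	}
	for (; len > 0; p++, len--)		/* leftover tail bytes */
		if (*p == (unsigned char)c)
			return (p);
	return (NULL);
}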
diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index b03811f40062..7c6a1f429590 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -2,6 +2,7 @@
MDSRCS+= \
amd64_archlevel.c \
bcmp.S \
+ memchr.S \
memcmp.S \
memcpy.S \
memmove.S \
diff --git a/lib/libc/amd64/string/memchr.S b/lib/libc/amd64/string/memchr.S
new file mode 100644
index 000000000000..e10bd6c22f90
--- /dev/null
+++ b/lib/libc/amd64/string/memchr.S
@@ -0,0 +1,204 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+ .weak memchr
+ .set memchr, __memchr
+ARCHFUNCS(__memchr)
+ ARCHFUNC(__memchr, scalar)
+ ARCHFUNC(__memchr, baseline)
+ENDARCHFUNCS(__memchr)
+
+ARCHENTRY(__memchr, scalar)
+ test %rdx, %rdx # empty input?
+ je .Lnomatch
+
+ lea (, %rdi, 8), %ecx # misalignment of buffer in bits
+ add %rdi, %rdx # pointer to end of buffer
+ and $~7, %rdi # align to 8 bytes
+ mov (%rdi), %rax # load first word
+ movzbl %sil, %esi # clear stray high bits
+ movabs $0x0101010101010101, %r8
+ imul %r8, %rsi # replicate char 8 times
+
+ /* compute head and tail masks */
+ mov %r8, %r10
+ movabs $0x8080808080808080, %r9
+ shl %cl, %r10 # 0x01 where string head is
+ lea (, %rdx, 8), %ecx # end of buffer in bits
+ xor %r8, %r10 # 0x01 where it is not
+ neg %r8 # negate 01..01 so we can use lea
+ mov %r9, %r11
+ xor %rsi, %rax # str ^ c (0x00 where str[i] == c)
+ neg %ecx # shift count for tail mask
+ or %r10, %rax # except before the string
+ shr %cl, %r11 # 0x80 where string tail is
+
+ add $8, %rdi # advance to next 8 bytes
+ cmp %rdx, %rdi # end of buffer reached during head?
+ jae .Ltail # and go to tail-processing code
+
+ /* main loop, unrolled twice */
+ ALIGN_TEXT
+0: lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
+ not %rax # ~(str ^ c)
+ and %r9, %rax # ((str^c) - 0x01..01) & ~(str^c)
+ and %rcx, %rax # not including junk bytes
+ jnz .Lmatch
+
+ mov (%rdi), %rax
+ add $8, %rdi
+ xor %rsi, %rax # str ^ c
+ cmp %rdx, %rdi
+ jae .Ltail
+
+ lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
+ not %rax # ~(str ^ c)
+ and %r9, %rax # ((str^c) - 0x01..01) & ~(str^c)
+ and %rcx, %rax # not including junk bytes
+ jnz .Lmatch
+
+ mov (%rdi), %rax
+ add $8, %rdi
+ xor %rsi, %rax # str ^ c
+ cmp %rdx, %rdi
+ jb 0b
+
+.Ltail: lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
+ not %rax # ~(str ^ c)
+ and %r11, %rax # ((str^c) - 0x01..01) & ~(str^c)
+ and %rcx, %rax # not including junk bytes or bytes past buffer
+ jz .Lnomatch
+
+.Lmatch:
+ tzcnt %rax, %rax # first match
+ shr $3, %eax # scale from bit to byte index
+ lea -8(%rdi, %rax), %rax # pointer to found c
+ ret
+
+ /* no match found */
+.Lnomatch:
+ xor %eax, %eax # return null pointer
+ ret
+ARCHEND(__memchr, scalar)
+
+ARCHENTRY(__memchr, baseline)
+ test %rdx, %rdx # empty input?
+ je .Lnomatchb
+
+ movd %esi, %xmm2
+ mov %edi, %ecx # misalignment of buffer
+ add %rdi, %rdx # pointer to end of buffer
+ and $~0x1f, %rdi # align to 32 bytes
+ movdqa (%rdi), %xmm0 # load first 32 bytes
+ movdqa 16(%rdi), %xmm1
+
+ punpcklbw %xmm2, %xmm2 # c -> cc
+
+ mov $-1, %r9d
+ shl %cl, %r9d # mask with zeroes before the string
+
+ punpcklwd %xmm2, %xmm2 # cc -> cccc
+
+ mov $-1, %r8d
+ xor %ecx, %ecx
+ sub %edx, %ecx # ecx = -edx
+ shr %cl, %r8d # bytes in tail that are part of the buffer
+
+ pshufd $0, %xmm2, %xmm2 # cccc -> cccccccccccccccc
+
+ add $32, %rdi # advance to next 32 bytes
+ mov $-1, %eax
+ cmp %rdx, %rdi # end of buffer reached during head?
+ cmovae %r8d, %eax # if yes, do combined head/tail processing
+ and %r9d, %eax # mask of bytes in head part of string
+
+ /* process head */
+ pcmpeqb %xmm2, %xmm1
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm1, %esi
+ pmovmskb %xmm0, %ecx
+ shl $16, %esi
+ or %esi, %ecx # locations of matches
+ and %ecx, %eax # any match inside buffer?
+ jnz .Lprecisematchb
+
+ cmp %rdx, %rdi # did the buffer end here?
+ jae .Lnomatchb # if yes, we are done
+
+ /* main loop */
+ ALIGN_TEXT
+0: movdqa (%rdi), %xmm0 # load next string chunk
+ movdqa 16(%rdi), %xmm1
+ add $32, %rdi
+ cmp %rdx, %rdi # end of buffer reached?
+ jae .Ltailb # if yes, process tail
+
+ pcmpeqb %xmm2, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ por %xmm1, %xmm0 # match in either half?
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jz 0b
+
+.Lmatchb:
+ pcmpeqb -32(%rdi), %xmm2 # redo comparison of first 16 bytes
+ pmovmskb %xmm1, %ecx
+ pmovmskb %xmm2, %eax
+ shl $16, %ecx
+ or %ecx, %eax # location of matches
+
+.Lprecisematchb:
+ tzcnt %eax, %eax # find location of match
+ lea -32(%rdi, %rax, 1), %rax # point to matching byte
+ ret
+
+.Ltailb:
+ pcmpeqb %xmm2, %xmm1
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm1, %edx
+ pmovmskb %xmm0, %eax
+ shl $16, %edx
+ or %edx, %eax # location of matches
+ and %r8d, %eax # mask out matches beyond buffer
+ bsf %eax, %edx # location of match
+ lea -32(%rdi, %rdx, 1), %rdx # pointer to match (if any)
+ cmovnz %rdx, %rax # point to match if present,
+ ret # else null pointer
+
+.Lnomatchb:
+ xor %eax, %eax # return null pointer
+ ret
+ARCHEND(__memchr, baseline)
+
+ .section .note.GNU-stack,"",%progbits
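A note on the ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS block near the top of
memchr.S: it registers both implementations with libc's amd64 archlevel
machinery (amd64_archlevel.c, already listed in MDSRCS above), which picks
the best one for the running CPU when the symbol is bound, in the manner of
a GNU ifunc. The sketch below shows the general shape of such dispatch; all
names are hypothetical, and a real resolver would query CPU features rather
than hard-code a choice.

#include <stddef.h>
#include <stdint.h>

/* hypothetical stand-ins for the two implementations in the patch */
static void *
my_memchr_scalar(const void *b, int c, size_t n)
{
	const unsigned char *p = b;

	for (; n > 0; p++, n--)
		if (*p == (unsigned char)c)
			return ((void *)(uintptr_t)p);
	return (NULL);
}

static void *
my_memchr_baseline(const void *b, int c, size_t n)
{
	/* stand-in; the real baseline version uses SSE2 */
	return (my_memchr_scalar(b, c, n));
}

/* the resolver runs once, when the dynamic linker binds the symbol */
static void *
resolve_my_memchr(void)
{
	/* a real resolver would consult CPU features here; SSE2 is
	   part of the amd64 baseline, so it is always safe to pick */
	return (my_memchr_baseline);
}

void *my_memchr(const void *, int, size_t)
    __attribute__((ifunc("resolve_my_memchr")));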