Diffstat (limited to 'lib/libc/amd64')
-rw-r--r--  lib/libc/amd64/SYS.h | 53
-rw-r--r--  lib/libc/amd64/Symbol.map | 40
-rw-r--r--  lib/libc/amd64/gen/Makefile.inc | 4
-rw-r--r--  lib/libc/amd64/gen/_setjmp.S | 3
-rw-r--r--  lib/libc/amd64/gen/flt_rounds.c | 1
-rw-r--r--  lib/libc/amd64/gen/infinity.c | 1
-rw-r--r--  lib/libc/amd64/gen/makecontext.c | 1
-rw-r--r--  lib/libc/amd64/gen/rfork_thread.S | 93
-rw-r--r--  lib/libc/amd64/gen/setjmp.S | 3
-rw-r--r--  lib/libc/amd64/gen/signalcontext.c | 1
-rw-r--r--  lib/libc/amd64/gen/sigsetjmp.S | 2
-rw-r--r--  lib/libc/amd64/stdlib/Makefile.inc | 2
-rw-r--r--  lib/libc/amd64/string/Makefile.inc | 26
-rw-r--r--  lib/libc/amd64/string/bcopy.c | 3
-rw-r--r--  lib/libc/amd64/string/bzero.c | 3
-rw-r--r--  lib/libc/amd64/string/memccpy.S | 260
-rw-r--r--  lib/libc/amd64/string/memchr.S | 207
-rw-r--r--  lib/libc/amd64/string/memcmp.S | 17
-rw-r--r--  lib/libc/amd64/string/memrchr.S | 166
-rw-r--r--  lib/libc/amd64/string/stpncpy.S | 283
-rw-r--r--  lib/libc/amd64/string/strcat.S | 47
-rw-r--r--  lib/libc/amd64/string/strcmp.S | 299
-rw-r--r--  lib/libc/amd64/string/strcpy.c | 1
-rw-r--r--  lib/libc/amd64/string/strcspn.S | 396
-rw-r--r--  lib/libc/amd64/string/strlcat.c | 27
-rw-r--r--  lib/libc/amd64/string/strlcpy.S | 281
-rw-r--r--  lib/libc/amd64/string/strncat.c | 31
-rw-r--r--  lib/libc/amd64/string/strncmp.S | 488
-rw-r--r--  lib/libc/amd64/string/strncpy.c (renamed from lib/libc/amd64/static_tls.h) | 31
-rw-r--r--  lib/libc/amd64/string/strnlen.c (renamed from lib/libc/amd64/sys/amd64_set_gsbase.c) | 47
-rw-r--r--  lib/libc/amd64/string/strpbrk.c (renamed from lib/libc/amd64/sys/amd64_set_fsbase.c) | 45
-rw-r--r--  lib/libc/amd64/string/strrchr.S | 209
-rw-r--r--  lib/libc/amd64/string/strsep.c (renamed from lib/libc/amd64/sys/getcontext.S) | 53
-rw-r--r--  lib/libc/amd64/string/strspn.S | 358
-rw-r--r--  lib/libc/amd64/string/timingsafe_bcmp.S | 232
-rw-r--r--  lib/libc/amd64/string/timingsafe_memcmp.S | 145
-rw-r--r--  lib/libc/amd64/sys/Makefile.inc | 12
-rw-r--r--  lib/libc/amd64/sys/amd64_get_fsbase.c | 64
-rw-r--r--  lib/libc/amd64/sys/amd64_get_gsbase.c | 64
-rw-r--r--  lib/libc/amd64/sys/cerror.S | 58
-rw-r--r--  lib/libc/amd64/sys/vfork.S | 52
41 files changed, 3534 insertions(+), 575 deletions(-)
diff --git a/lib/libc/amd64/SYS.h b/lib/libc/amd64/SYS.h
deleted file mode 100644
index c2c8ef8a56ef..000000000000
--- a/lib/libc/amd64/SYS.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Copyright (c) 1990 The Regents of the University of California.
- * All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * William Jolitz.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)SYS.h 5.5 (Berkeley) 5/7/91
- */
-
-#include <sys/syscall.h>
-#include <machine/asm.h>
-
-#define RSYSCALL(name) ENTRY(__sys_##name); \
- WEAK_REFERENCE(__sys_##name, name); \
- WEAK_REFERENCE(__sys_##name, _##name); \
- mov $SYS_##name,%eax; KERNCALL; \
- jb HIDENAME(cerror); ret; \
- END(__sys_##name)
-
-#define PSEUDO(name) ENTRY(__sys_##name); \
- WEAK_REFERENCE(__sys_##name, _##name); \
- mov $SYS_##name,%eax; KERNCALL; \
- jb HIDENAME(cerror); ret; \
- END(__sys_##name)
-
-#define KERNCALL movq %rcx, %r10; syscall
diff --git a/lib/libc/amd64/Symbol.map b/lib/libc/amd64/Symbol.map
index 39a913bd5e84..36f54de24fbd 100644
--- a/lib/libc/amd64/Symbol.map
+++ b/lib/libc/amd64/Symbol.map
@@ -1,20 +1,12 @@
/*
- */
-
-/*
* This only needs to contain symbols that are not listed in
* symbol maps from other parts of libc (i.e., not found in
* stdlib/Symbol.map, string/Symbol.map, sys/Symbol.map, ...).
*/
FBSD_1.0 {
- /* PSEUDO syscalls */
- _exit;
-
.mcount;
- _setjmp;
- _longjmp;
- fabs;
__flt_rounds;
+ brk;
fpgetmask;
fpgetprec;
fpgetround;
@@ -22,32 +14,7 @@ FBSD_1.0 {
fpsetmask;
fpsetprec;
fpsetround;
- __infinity;
- __nan;
- makecontext;
- rfork_thread;
- setjmp;
- longjmp;
- sigsetjmp;
- siglongjmp;
- htonl;
- htons;
- ntohl;
- ntohs;
- amd64_get_fsbase;
- amd64_get_gsbase;
- amd64_set_fsbase;
- amd64_set_gsbase;
- brk;
sbrk;
- vfork;
-};
-
-FBSD_1.6 {
- x86_pkru_get_perm;
- x86_pkru_set_perm;
- x86_pkru_protect_range;
- x86_pkru_unprotect_range;
};
/*
@@ -56,15 +23,10 @@ FBSD_1.6 {
*
*/
FBSDprivate_1.0 {
- /* PSEUDO syscalls */
- _getlogin;
-
___longjmp;
- __makecontext;
__longjmp;
__signalcontext;
signalcontext;
__siglongjmp;
_brk;
- _vfork;
};
diff --git a/lib/libc/amd64/gen/Makefile.inc b/lib/libc/amd64/gen/Makefile.inc
index 4869973ca254..aaffcb0481f1 100644
--- a/lib/libc/amd64/gen/Makefile.inc
+++ b/lib/libc/amd64/gen/Makefile.inc
@@ -1,6 +1,4 @@
-# @(#)Makefile.inc 8.1 (Berkeley) 6/4/93
-
-SRCS+= _setjmp.S rfork_thread.S setjmp.S sigsetjmp.S \
+SRCS+= _setjmp.S setjmp.S sigsetjmp.S \
fabs.S \
infinity.c ldexp.c makecontext.c signalcontext.c \
flt_rounds.c fpgetmask.c fpsetmask.c fpgetprec.c fpsetprec.c \
diff --git a/lib/libc/amd64/gen/_setjmp.S b/lib/libc/amd64/gen/_setjmp.S
index 43af2b68b3f2..93b27de49ea0 100644
--- a/lib/libc/amd64/gen/_setjmp.S
+++ b/lib/libc/amd64/gen/_setjmp.S
@@ -30,9 +30,6 @@
* SUCH DAMAGE.
*/
-#if defined(LIBC_SCCS) && !defined(lint)
- .asciz "@(#)_setjmp.s 5.1 (Berkeley) 4/23/90"
-#endif /* LIBC_SCCS and not lint */
#include <machine/asm.h>
/*
* C library -- _setjmp, _longjmp
diff --git a/lib/libc/amd64/gen/flt_rounds.c b/lib/libc/amd64/gen/flt_rounds.c
index 018ea029ee3f..cd7e501af5af 100644
--- a/lib/libc/amd64/gen/flt_rounds.c
+++ b/lib/libc/amd64/gen/flt_rounds.c
@@ -3,7 +3,6 @@
* Public domain.
*/
-#include <sys/cdefs.h>
#include <float.h>
static const int map[] = {
diff --git a/lib/libc/amd64/gen/infinity.c b/lib/libc/amd64/gen/infinity.c
index b9db2fc84efa..bc05708abd2b 100644
--- a/lib/libc/amd64/gen/infinity.c
+++ b/lib/libc/amd64/gen/infinity.c
@@ -2,7 +2,6 @@
* infinity.c
*/
-#include <sys/cdefs.h>
#include <math.h>
/* bytes for +Infinity on a 387 */
diff --git a/lib/libc/amd64/gen/makecontext.c b/lib/libc/amd64/gen/makecontext.c
index dcc3b8ab9b45..c5767c9d5d75 100644
--- a/lib/libc/amd64/gen/makecontext.c
+++ b/lib/libc/amd64/gen/makecontext.c
@@ -26,7 +26,6 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
#include <sys/types.h>
#include <sys/ucontext.h>
#include <stdarg.h>
diff --git a/lib/libc/amd64/gen/rfork_thread.S b/lib/libc/amd64/gen/rfork_thread.S
deleted file mode 100644
index a3c64fad7994..000000000000
--- a/lib/libc/amd64/gen/rfork_thread.S
+++ /dev/null
@@ -1,93 +0,0 @@
-/*-
- * Copyright (c) 2000 Peter Wemm <peter@FreeBSD.org>
- * Copyright (c) 2003 Alan L. Cox <alc@cs.rice.edu>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <machine/asm.h>
-/*
- * With thanks to John Dyson for the original version of this.
- */
-
-#include <SYS.h>
-
-/*
- * %edi %rsi %rdx %rcx
- * rfork_thread(flags, stack_addr, start_fnc, start_arg);
- *
- * flags: Flags to rfork system call. See rfork(2).
- * stack_addr: Top of stack for thread.
- * start_fnc: Address of thread function to call in child.
- * start_arg: Argument to pass to the thread function in child.
- */
-
-ENTRY(rfork_thread)
- pushq %rbx
- pushq %r12
- movq %rdx, %rbx
- movq %rcx, %r12
-
- /*
- * Prepare and execute the thread creation syscall
- */
- movq $SYS_rfork, %rax
- KERNCALL
- jb 2f
-
- /*
- * Check to see if we are in the parent or child
- */
- cmpl $0, %edx
- jnz 1f
- popq %r12
- popq %rbx
- ret
-
- /*
- * If we are in the child (new thread), then
- * set-up the call to the internal subroutine. If it
- * returns, then call __exit.
- */
-1:
- movq %rsi, %rsp
- movq %r12, %rdi
- call *%rbx
- movl %eax, %edi
-
- /*
- * Exit system call
- */
- movq $SYS_exit, %rax
- KERNCALL
-
- /*
- * Branch here if the thread creation fails:
- */
-2:
- popq %r12
- popq %rbx
- jmp HIDENAME(cerror)
-END(rfork_thread)
-
- .section .note.GNU-stack,"",%progbits
diff --git a/lib/libc/amd64/gen/setjmp.S b/lib/libc/amd64/gen/setjmp.S
index 6f469c4c08e8..54939f123807 100644
--- a/lib/libc/amd64/gen/setjmp.S
+++ b/lib/libc/amd64/gen/setjmp.S
@@ -30,9 +30,6 @@
* SUCH DAMAGE.
*/
-#if defined(LIBC_SCCS) && !defined(lint)
- .asciz "@(#)setjmp.s 5.1 (Berkeley) 4/23/90"
-#endif /* LIBC_SCCS and not lint */
#include <machine/asm.h>
/*
* C library -- _setjmp, _longjmp
diff --git a/lib/libc/amd64/gen/signalcontext.c b/lib/libc/amd64/gen/signalcontext.c
index cc1c2523c754..a97dd158542a 100644
--- a/lib/libc/amd64/gen/signalcontext.c
+++ b/lib/libc/amd64/gen/signalcontext.c
@@ -26,7 +26,6 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
#include <sys/types.h>
#include <sys/ucontext.h>
#include <signal.h>
diff --git a/lib/libc/amd64/gen/sigsetjmp.S b/lib/libc/amd64/gen/sigsetjmp.S
index 757280159d82..c4775b1c2bea 100644
--- a/lib/libc/amd64/gen/sigsetjmp.S
+++ b/lib/libc/amd64/gen/sigsetjmp.S
@@ -28,8 +28,6 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
- *
- * @(#)setjmp.s 5.1 (Berkeley) 4/23/90"
*/
#if defined(LIBC_SCCS) && !defined(lint)
diff --git a/lib/libc/amd64/stdlib/Makefile.inc b/lib/libc/amd64/stdlib/Makefile.inc
index 8b9af2b3eab1..568f8eb4afa7 100644
--- a/lib/libc/amd64/stdlib/Makefile.inc
+++ b/lib/libc/amd64/stdlib/Makefile.inc
@@ -1,3 +1 @@
-# @(#)Makefile.inc 8.1 (Berkeley) 6/4/93
-
MDSRCS+=div.S ldiv.S lldiv.S
diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index 4df4ff8f1417..d5bb646c5c53 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -1,14 +1,36 @@
-
MDSRCS+= \
amd64_archlevel.c \
bcmp.S \
+ memchr.S \
memcmp.S \
+ memccpy.S \
memcpy.S \
memmove.S \
+ memrchr.S \
memset.S \
stpcpy.S \
+ stpncpy.S \
strcat.S \
strchrnul.S \
strcmp.S \
+ strcpy.c \
+ strcspn.S \
+ strlcat.c \
+ strlcpy.S \
strlen.S \
- strcpy.c
+ strncat.c \
+ strncmp.S \
+ strncpy.c \
+ strnlen.c \
+ strpbrk.c \
+ strrchr.S \
+ strsep.c \
+ strspn.S \
+ timingsafe_bcmp.S \
+ timingsafe_memcmp.S
+
+.if ${MK_ASAN} != "no"
+# Disable ASAN for amd64_archlevel.c since its code is executed before the
+# sanitizer runtime can initialize itself.
+CFLAGS.amd64_archlevel.c+= -fno-sanitize=address
+.endif
diff --git a/lib/libc/amd64/string/bcopy.c b/lib/libc/amd64/string/bcopy.c
index 406b28f0b97a..0dee529fb9df 100644
--- a/lib/libc/amd64/string/bcopy.c
+++ b/lib/libc/amd64/string/bcopy.c
@@ -2,9 +2,10 @@
* Public domain.
*/
-#include <sys/cdefs.h>
#include <string.h>
+#undef bcopy /* _FORTIFY_SOURCE */
+
void
bcopy(const void *src, void *dst, size_t len)
{
diff --git a/lib/libc/amd64/string/bzero.c b/lib/libc/amd64/string/bzero.c
index a4fdb74d6bb4..d82f3061865b 100644
--- a/lib/libc/amd64/string/bzero.c
+++ b/lib/libc/amd64/string/bzero.c
@@ -2,9 +2,10 @@
* Public domain.
*/
-#include <sys/cdefs.h>
#include <string.h>
+#undef bzero /* _FORTIFY_SOURCE */
+
void
bzero(void *b, size_t len)
{
diff --git a/lib/libc/amd64/string/memccpy.S b/lib/libc/amd64/string/memccpy.S
new file mode 100644
index 000000000000..69b650fffc33
--- /dev/null
+++ b/lib/libc/amd64/string/memccpy.S
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2023, 2024 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4, 0x90
+
+ .weak memccpy
+ .set memccpy, __memccpy
+ARCHFUNCS(__memccpy)
+ ARCHFUNC(__memccpy, scalar)
+ ARCHFUNC(__memccpy, baseline)
+ENDARCHFUNCS(__memccpy)
+
+ARCHENTRY(__memccpy, scalar)
+ push %rbp # establish stack frame
+ mov %rsp, %rbp
+ push %rax # dummy push for alignment
+ push %rbx
+ push %rdi
+ push %rsi
+
+ mov %rsi, %rdi
+ mov %edx, %esi
+ mov %rcx, %rdx
+ mov %rcx, %rbx
+ call CNAME(__memchr) # ptr = memchr(src, c, len)
+
+ pop %rsi
+ pop %rdi
+ lea 1(%rax), %rdx
+ sub %rsi, %rdx # size = ptr - src + 1
+ mov %rbx, %rcx
+ lea (%rdi, %rdx, 1), %rbx # res = dest + size
+ test %rax, %rax # if (ptr == NULL)
+ cmovz %rcx, %rdx # size = len
+ cmovz %rax, %rbx # res = NULL
+ call CNAME(memcpy)
+
+ mov %rbx, %rax # return (res)
+ pop %rbx
+ leave
+ ret
+ARCHEND(__memccpy, scalar)
+
+ARCHENTRY(__memccpy, baseline)
+ sub $1, %rcx # RCX refers to last character in buffer
+ jb .L0 # go to special code path if len was 0
+
+ movd %edx, %xmm4
+ mov %rcx, %rdx
+ punpcklbw %xmm4, %xmm4 # c -> cc
+ mov %esi, %ecx
+ punpcklwd %xmm4, %xmm4 # cc -> cccc
+ mov %rsi, %r9 # stash a copy of the source pointer for later
+ pshufd $0, %xmm4, %xmm4 # cccc -> cccccccccccccccc
+ and $~0xf, %rsi
+ movdqa %xmm4, %xmm1
+ pcmpeqb (%rsi), %xmm1 # c found in head?
+ and $0xf, %ecx
+ mov $-1, %eax
+ pmovmskb %xmm1, %r8d
+ lea -32(%rcx), %r11
+ shl %cl, %eax # mask of bytes in the string
+ add %rdx, %r11 # distance from alignment boundary - 32
+ jnc .Lrunt # jump if buffer length is 32 or less
+
+ and %r8d, %eax
+ jz 0f # match (or induced match) found?
+
+ /* match in first chunk */
+ tzcnt %eax, %edx # where is c?
+ sub %ecx, %edx # ... from the beginning of the string?
+ lea 1(%rdi, %rdx, 1), %rax # return value
+ jmp .L0116
+
+0: movdqa 16(%rsi), %xmm3 # load second string chunk
+ movdqu (%r9), %xmm2 # load unaligned string head
+ movdqa %xmm4, %xmm1
+ pcmpeqb %xmm3, %xmm1 # c found in second chunk?
+
+ /* process second chunk */
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jz 0f
+
+ /* match in second chunk */
+ tzcnt %eax, %edx # where is c?
+ sub $16, %ecx
+ sub %ecx, %edx # adjust for alignment offset
+ lea 1(%rdi, %rdx, 1), %rax # return value
+ jmp .L0132
+
+ /* c not found in second chunk: prepare for main loop */
+0: movdqa 32(%rsi), %xmm0 # load next string chunk
+ movdqa %xmm4, %xmm1
+ movdqu %xmm2, (%rdi) # deposit head into buffer
+ sub %rcx, %rdi # adjust RDI to correspond to RSI
+ mov %r11, %rdx
+ movdqu %xmm3, 16(%rdi) # deposit second chunk
+ sub %rsi, %rdi # express RDI as distance from RSI
+ add $32, %rsi # advance RSI past first two chunks
+ sub $16, %rdx # enough left for another round?
+ jb 1f
+
+ /* main loop unrolled twice */
+ ALIGN_TEXT
+0: pcmpeqb %xmm0, %xmm1 # c encountered?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz 3f
+
+ movdqu %xmm0, (%rsi, %rdi)
+ movdqa 16(%rsi), %xmm0 # load next string chunk
+ movdqa %xmm4, %xmm1
+ cmp $16, %rdx # more than a full chunk left?
+ jb 2f
+
+ add $32, %rsi # advance pointers to next chunk
+ pcmpeqb %xmm0, %xmm1 # c encountered?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz 4f
+
+ movdqu %xmm0, -16(%rsi, %rdi)
+ movdqa (%rsi), %xmm0 # load next string chunk
+ movdqa %xmm4, %xmm1
+ sub $32, %rdx
+ jae 0b
+
+1: sub $16, %rsi # undo second advancement
+ add $16, %edx
+
+ /* 1--16 bytes left in the buffer but string has not ended yet */
+2: pcmpeqb %xmm1, %xmm0 # c encountered?
+ pmovmskb %xmm0, %r8d
+ mov %r8d, %ecx
+ bts %edx, %r8d # treat end of buffer as end of string
+ tzcnt %r8d, %r8d # find tail length
+ add %rsi, %rdi # restore RDI
+ movdqu 1(%rsi, %r8, 1), %xmm0 # load string tail
+ movdqu %xmm0, 1(%rdi, %r8, 1) # store string tail
+ lea 17(%rdi, %r8, 1), %rsi # return value if terminator encountered
+ xor %eax, %eax # return value if no terminator encountered
+ bt %r8d, %ecx # terminator encountered inside buffer?
+ cmovc %rsi, %rax # if yes, return pointer, else NULL
+ ret
+
+4: sub $16, %rsi # undo second advancement
+
+ /* terminator found and buffer has not ended yet */
+3: tzcnt %eax, %eax # find length of string tail
+ movdqu -15(%rsi, %rax, 1), %xmm0 # load string tail (incl. c)
+ add %rsi, %rdi # restore destination pointer
+ movdqu %xmm0, -15(%rdi, %rax, 1) # store string tail (incl. c)
+ lea 1(%rdi, %rax, 1), %rax # compute return value
+ ret
+
+ /* buffer is 1--32 bytes in size */
+ ALIGN_TEXT
+.Lrunt: add $32, %r11d # undo earlier decrement
+ mov %r8d, %r10d # keep a copy of the original match mask
+ bts %r11d, %r8d # induce match at buffer end
+ and %ax, %r8w # is there a match in the first 16 bytes?
+ jnz 0f # if yes, skip looking at second chunk
+
+ pcmpeqb 16(%rsi), %xmm4 # check for match in second chunk
+ pmovmskb %xmm4, %r8d
+ shl $16, %r8d # place second chunk matches in bits 16--31
+ mov %r8d, %r10d # keep a copy of the original match mask
+ bts %r11d, %r8d # induce a match at buffer end
+
+0: xor %eax, %eax # return value if terminator not found
+ tzcnt %r8d, %edx # find string/buffer length from alignment boundary
+ lea 1(%rdi, %rdx, 1), %r8 # return value if terminator found + rcx
+ sub %rcx, %r8
+ bt %edx, %r10d # was the terminator present?
+ cmovc %r8, %rax # if yes, return pointer, else NULL
+ sub %ecx, %edx # find actual string/buffer length
+
+ ALIGN_TEXT
+.L0132: cmp $16, %rdx # at least 17 bytes to copy?
+ jb .L0116
+
+ /* copy 17--32 bytes */
+ movdqu (%r9), %xmm0 # load first 16 bytes
+ movdqu -15(%r9, %rdx, 1), %xmm1 # load last 16 bytes
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, -15(%rdi, %rdx, 1)
+ ret
+
+ /* process strings of 1--16 bytes (rdx: min(buflen, srclen), rax: srclen) */
+ ALIGN_TEXT
+.L0116: cmp $8, %rdx # at least 9 bytes to copy?
+ jae .L0916
+
+ cmp $4, %rdx # at least 5 bytes to copy?
+ jae .L0508
+
+ cmp $2, %rdx # at least 3 bytes to copy?
+ jae .L0304
+
+ /* copy one or two bytes */
+ movzbl (%r9), %ecx # load first byte from src
+ movzbl (%r9, %rdx, 1), %esi # load last byte from src
+ mov %cl, (%rdi) # deposit into destination
+ mov %sil, (%rdi, %rdx, 1)
+ ret
+
+.L0304: movzwl (%r9), %ecx
+ movzwl -1(%r9, %rdx, 1), %esi
+ mov %cx, (%rdi)
+ mov %si, -1(%rdi, %rdx, 1)
+ ret
+
+.L0508: mov (%r9), %ecx
+ mov -3(%r9, %rdx, 1), %esi
+ mov %ecx, (%rdi)
+ mov %esi, -3(%rdi, %rdx, 1)
+ ret
+
+.L0916: mov (%r9), %rcx
+ mov -7(%r9, %rdx, 1), %rsi
+ mov %rcx, (%rdi)
+ mov %rsi, -7(%rdi, %rdx, 1)
+ ret
+
+ /* length zero destination: return null pointer */
+.L0: xor %eax, %eax
+ ret
+ARCHEND(__memccpy, baseline)
+
+ .section .note.GNU-stack,"",%progbits
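Note: the scalar __memccpy path above simply composes the existing memchr() and memcpy() routines. A rough C equivalent of the same logic (an illustrative sketch with a made-up name, not part of the patch):

    #include <string.h>

    static void *
    memccpy_sketch(void *restrict dst, const void *restrict src, int c, size_t len)
    {
    	const char *stop = memchr(src, c, len);	/* first occurrence of c, if any */
    	size_t n = stop != NULL ? (size_t)(stop - (const char *)src) + 1 : len;

    	memcpy(dst, src, n);			/* copy up to and including c */
    	return (stop != NULL ? (char *)dst + n : NULL);
    }

The baseline path implements the same idea directly with SSE2 so the search for c and the copy share a single pass over the data.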
diff --git a/lib/libc/amd64/string/memchr.S b/lib/libc/amd64/string/memchr.S
new file mode 100644
index 000000000000..cfab9b1302de
--- /dev/null
+++ b/lib/libc/amd64/string/memchr.S
@@ -0,0 +1,207 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+ .weak memchr
+ .set memchr, __memchr
+ARCHFUNCS(__memchr)
+ ARCHFUNC(__memchr, scalar)
+ ARCHFUNC(__memchr, baseline)
+ENDARCHFUNCS(__memchr)
+
+ARCHENTRY(__memchr, scalar)
+ test %rdx, %rdx # empty input?
+ je .Lnomatch
+
+ lea (, %rdi, 8), %ecx
+ mov $-1, %rax
+ add %rdi, %rdx # pointer to end of buffer or to end of
+ cmovc %rax, %rdx # address space (whichever comes first)
+ and $~7, %rdi # align to 8 bytes
+ mov (%rdi), %rax # load first word
+ movzbl %sil, %esi # clear stray high bits
+ movabs $0x0101010101010101, %r8
+ imul %r8, %rsi # replicate char 8 times
+
+ /* compute head and tail masks */
+ mov %r8, %r10
+ movabs $0x8080808080808080, %r9
+ shl %cl, %r10 # 0x01 where string head is
+ lea (, %rdx, 8), %ecx
+ xor %r8, %r10 # 0x01 where it is not
+ neg %r8 # negate 01..01 so we can use lea
+ mov %r9, %r11
+ xor %rsi, %rax # str ^ c (0x00 where str[i] == c)
+ neg %ecx
+ or %r10, %rax # except before the string
+ shr %cl, %r11 # 0x80 where string tail is
+
+ add $8, %rdi # advance to next 8 bytes
+ cmp %rdx, %rdi # end of buffer reached during head?
+ jae .Ltail # and go to tail-processing code
+
+ /* main loop, unrolled twice */
+ ALIGN_TEXT
+0: lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
+ not %rax # ~(str ^ c)
+ and %r9, %rax # ((str^c) - 0x01..01) & ~(str^c)
+ and %rcx, %rax # not including junk bytes
+ jnz .Lmatch
+
+ mov (%rdi), %rax
+ add $8, %rdi
+ xor %rsi, %rax # str ^ c
+ cmp %rdx, %rdi
+ jae .Ltail
+
+ lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
+ not %rax # ~(str ^ c)
+ and %r9, %rax # ((str^c) - 0x01..01) & ~(str^c)
+ and %rcx, %rax # not including junk bytes
+ jnz .Lmatch
+
+ mov (%rdi), %rax
+ add $8, %rdi
+ xor %rsi, %rax # str ^ c
+ cmp %rdx, %rdi
+ jb 0b
+
+.Ltail: lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
+ not %rax # ~(str ^ c)
+ and %r11, %rax # ((str^c) - 0x01..01) & ~(str^c)
+ and %rcx, %rax # not including junk bytes or bytes past buffer
+ jz .Lnomatch
+
+.Lmatch:
+ tzcnt %rax, %rax # first match
+ shr $3, %eax # scale from bit to byte index
+ lea -8(%rdi, %rax), %rax # pointer to found c
+ ret
+
+ /* no match found */
+.Lnomatch:
+ xor %eax, %eax # return null pointer
+ ret
+ARCHEND(__memchr, scalar)
+
+ARCHENTRY(__memchr, baseline)
+ test %rdx, %rdx # empty input?
+ je .Lnomatchb
+
+ movd %esi, %xmm2
+ mov %edi, %ecx
+ mov $-1, %r9
+ add %rdi, %rdx # pointer to end of buffer or to end of
+ cmovc %r9, %rdx # address space (whichever comes first)
+ and $~0x1f, %rdi # align to 32 bytes
+ movdqa (%rdi), %xmm0 # load first 32 bytes
+ movdqa 16(%rdi), %xmm1
+
+ punpcklbw %xmm2, %xmm2 # c -> cc
+
+ shl %cl, %r9d # mask with zeroes before the string
+
+ punpcklwd %xmm2, %xmm2 # cc -> cccc
+
+ mov $-1, %r8d
+ xor %ecx, %ecx
+ sub %edx, %ecx # ecx = -edx
+ shr %cl, %r8d # bytes in tail that are part of the buffer
+
+ pshufd $0, %xmm2, %xmm2 # cccc -> cccccccccccccccc
+
+ add $32, %rdi # advance to next 32 bytes
+ mov $-1, %eax
+ cmp %rdx, %rdi # end of buffer reached during head?
+ cmovae %r8d, %eax # if yes, do combined head/tail processing
+ and %r9d, %eax # mask of bytes in head part of string
+
+ /* process head */
+ pcmpeqb %xmm2, %xmm1
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm1, %esi
+ pmovmskb %xmm0, %ecx
+ shl $16, %esi
+ or %esi, %ecx # locations of matches
+ and %ecx, %eax # any match inside buffer?
+ jnz .Lprecisematchb
+
+ cmp %rdx, %rdi # did the buffer end here?
+ jae .Lnomatchb # if yes we are done
+
+ /* main loop */
+ ALIGN_TEXT
+0: movdqa (%rdi), %xmm0 # load next string chunk
+ movdqa 16(%rdi), %xmm1
+ add $32, %rdi
+ cmp %rdx, %rdi # end of buffer reached?
+ jae .Ltailb
+
+ pcmpeqb %xmm2, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ por %xmm1, %xmm0 # match in either half?
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jz 0b
+
+.Lmatchb:
+ pcmpeqb -32(%rdi), %xmm2 # redo comparison of first 16 bytes
+ pmovmskb %xmm1, %ecx
+ pmovmskb %xmm2, %eax
+ shl $16, %ecx
+ or %ecx, %eax # location of matches
+
+.Lprecisematchb:
+ tzcnt %eax, %eax # find location of match
+ lea -32(%rdi, %rax, 1), %rax # point to matching byte
+ ret
+
+.Ltailb:
+ pcmpeqb %xmm2, %xmm1
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm1, %edx
+ pmovmskb %xmm0, %eax
+ shl $16, %edx
+ or %edx, %eax # location of matches
+ and %r8d, %eax # mask out matches beyond buffer
+ bsf %eax, %edx # location of match
+ lea -32(%rdi, %rdx, 1), %rdx # pointer to match (if any)
+ cmovnz %rdx, %rax # point to match if present,
+ ret # else null pointer
+
+.Lnomatchb:
+ xor %eax, %eax # return null pointer
+ ret
+ARCHEND(__memchr, baseline)
+
+ .section .note.GNU-stack,"",%progbits
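Note: the scalar path uses the classic word-at-a-time trick: each 8-byte word is XORed with the search character replicated into every byte lane, so lanes equal to c become zero, and the predicate below is nonzero exactly when such a lane exists, with its lowest set bit sitting in the first matching lane (which the tzcnt/shr $3 pair then converts to a byte index). A hedged C illustration of that predicate (not part of the patch):

    #include <stdint.h>

    /* nonzero iff the 8-byte word w contains a byte equal to c;
     * the lowest set bit marks the first such byte (sketch only) */
    static inline uint64_t
    has_byte(uint64_t w, unsigned char c)
    {
    	uint64_t x = w ^ (0x0101010101010101ULL * c);	/* matching lanes become 0 */

    	return ((x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL);
    }

The head mask (%r10) and tail mask (%r11) in the assembly only exist to suppress matches from bytes outside the buffer when the first and last words are loaded whole.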
diff --git a/lib/libc/amd64/string/memcmp.S b/lib/libc/amd64/string/memcmp.S
index d192229677b3..dc8bcff73cb9 100644
--- a/lib/libc/amd64/string/memcmp.S
+++ b/lib/libc/amd64/string/memcmp.S
@@ -328,13 +328,28 @@ ARCHENTRY(memcmp, baseline)
movdqu 16(%rsi, %rdi, 1), %xmm1
pcmpeqb 16(%rdi), %xmm1 # compare second half of this iteration
add %rcx, %rdx # pointer to last byte in buffer
- pcmpeqb %xmm2, %xmm0
+ jc .Loverflow # did this overflow?
+0: pcmpeqb %xmm2, %xmm0
pmovmskb %xmm0, %eax
xor $0xffff, %eax # any mismatch?
jne .Lmismatch_head
add $64, %rdi # advance to next iteration
jmp 1f # and get going with the loop
+ /*
+ * If we got here, a buffer length was passed to memcmp(a, b, len)
+ * such that a + len < a. While this sort of usage is illegal,
+ * it is plausible that a caller tries to do something like
+ * memcmp(a, b, SIZE_MAX) if a and b are known to differ, intending
+ * for memcmp() to stop comparing at the first mismatch. This
+ * behaviour is not guaranteed by any version of ISO/IEC 9899,
+ * but usually works out in practice. Let's try to make this
+ * case work by comparing until the end of the address space.
+ */
+.Loverflow:
+ mov $-1, %rdx # compare until the end of memory
+ jmp 0b
+
/* process buffer 32 bytes at a time */
ALIGN_TEXT
0: movdqu -32(%rsi, %rdi, 1), %xmm0
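In C terms, the new jc/.Loverflow pair amounts to clamping the end-of-buffer pointer when the length computation wraps around the address space (a sketch of the intent, not part of the patch):

    #include <stdint.h>
    #include <stddef.h>

    /* sketch: clamp an end-of-buffer pointer when buf + len overflows */
    static uintptr_t
    clamp_end(const void *buf, size_t len)
    {
    	uintptr_t end = (uintptr_t)buf + len;

    	if (end < (uintptr_t)buf)	/* wrapped past the end of the address space */
    		end = UINTPTR_MAX;	/* compare until the end of memory instead */
    	return (end);
    }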
diff --git a/lib/libc/amd64/string/memrchr.S b/lib/libc/amd64/string/memrchr.S
new file mode 100644
index 000000000000..4f6c5a238daa
--- /dev/null
+++ b/lib/libc/amd64/string/memrchr.S
@@ -0,0 +1,166 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Robert Clausecker
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4, 0x90
+
+ARCHFUNCS(memrchr)
+ ARCHFUNC(memrchr, scalar)
+ ARCHFUNC(memrchr, baseline)
+ENDARCHFUNCS(memrchr)
+
+ARCHENTRY(memrchr, scalar)
+ xor %eax, %eax # prospective return value
+ sub $4, %rdx # 4 bytes left to process?
+ jb 1f
+
+ ALIGN_TEXT
+0: xor %r8, %r8
+ lea 2(%rdi), %r10
+ cmp %sil, 2(%rdi)
+ cmovne %r8, %r10 # point to null if no match
+
+ cmp %sil, (%rdi)
+ cmove %rdi, %r8 # point to first char if match
+
+ lea 1(%rdi), %r9
+ cmp %sil, 1(%rdi)
+ cmovne %r8, %r9 # point to first result if no match in second
+
+ lea 3(%rdi), %r11
+ cmp %sil, 3(%rdi)
+ cmovne %r10, %r11
+
+ test %r11, %r11
+ cmovz %r9, %r11 # take first pair match if none in second
+
+ test %r11, %r11
+ cmovnz %r11, %rax # take match in current set if any
+
+ add $4, %rdi
+ sub $4, %rdx
+ jae 0b
+
+1: cmp $-3, %edx # at least one character left to process?
+ jb 2f
+
+ cmp %sil, (%rdi)
+ cmove %rdi, %rax
+
+ lea 1(%rdi), %rcx
+ cmp $-2, %edx # at least two characters left to process?
+ jb 2f
+
+ cmp %sil, 1(%rdi)
+ cmove %rcx, %rax
+
+ lea 2(%rdi), %rcx
+ cmp $-1, %edx # at least three characters left to process?
+ jb 2f
+
+ cmp %sil, 2(%rdi)
+ cmove %rcx, %rax
+
+2: ret
+ARCHEND(memrchr, scalar)
+
+ARCHENTRY(memrchr, baseline)
+ movd %esi, %xmm4
+ test %rdx, %rdx # empty buffer?
+ jz .L0 # if yes, return immediately
+
+ punpcklbw %xmm4, %xmm4 # c -> cc
+ mov %edi, %ecx
+ punpcklwd %xmm4, %xmm4 # cc -> cccc
+ and $~0xf, %rdi # align source pointer
+ pshufd $0, %xmm4, %xmm4 # cccc -> cccccccccccccccc
+ and $0xf, %ecx
+ movdqa %xmm4, %xmm0
+ mov $-1, %r8d
+ pcmpeqb (%rdi), %xmm0 # compare aligned head
+ shl %cl, %r8d # mask of bytes in the head of the buffer
+ pmovmskb %xmm0, %eax
+
+ sub $16, %rcx
+ and %r8d, %eax # match mask
+ add %rcx, %rdx # advance past head
+ cmc
+ jbe .Lrunt # did the string end in the buffer?
+
+ mov %rdi, %rsi # pointer to matching chunk
+ add $16, %rdi
+ sub $16, %rdx # enough left for another round?
+ jbe 1f
+
+ /* main loop unrolled twice */
+ ALIGN_TEXT
+0: movdqa %xmm4, %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %r8d
+
+ cmp $16, %rdx # enough left for second chunk?
+ jbe 2f
+
+ movdqa %xmm4, %xmm0
+ pcmpeqb 16(%rdi), %xmm0
+ pmovmskb %xmm0, %ecx
+
+ lea 16(%rdi), %r9
+ test %ecx, %ecx # match found in second chunk?
+ cmovz %r8d, %ecx # if not, use match data from first chunk
+ cmovz %rdi, %r9
+
+ test %ecx, %ecx # any match found?
+ cmovnz %ecx, %eax # if yes, overwrite previously found match
+ cmovnz %r9, %rsi
+
+ add $32, %rdi # advance to next iteration
+ sub $32, %rdx # advance to next chunks
+ ja 0b
+
+ /* process remaining 1--16 bytes */
+1: pcmpeqb (%rdi), %xmm4
+ mov $0xffff, %r8d
+ xor %ecx, %ecx
+ sub %edx, %ecx # number of bytes to be masked out
+ pmovmskb %xmm4, %r9d
+ shr %cl, %r8d # mask of bytes to be kept in the buffer
+ and %r9d, %r8d
+ cmovnz %r8d, %eax
+ cmovnz %rdi, %rsi
+ bsr %eax, %eax
+ lea (%rsi, %rax, 1), %rsi # pointer to match (or junk)
+ cmovnz %rsi, %rax # if any match was found, return it
+ ret
+
+ /* end of chunk reached within first half iteration */
+2: test %r8d, %r8d # match in previous chunk?
+ cmovnz %r8d, %eax # if yes, overwrite previous chunks
+ cmovnz %rdi, %rsi
+ add $16, %rdi # point to tail
+ sub $16, %edx
+ jmp 1b # handle tail the same otherwise
+
+ /* runt: string ends within head, edx has negated amount of invalid head bytes */
+.Lrunt: mov $0xffff, %r8d
+ xor %ecx, %ecx
+ sub %edx, %ecx
+ shr %cl, %r8d
+ and %r8d, %eax
+ bsr %eax, %eax
+ lea (%rdi, %rax, 1), %rdi
+ cmovnz %rdi, %rax
+ ret
+
+ /* empty buffer: return a null pointer */
+.L0: xor %eax, %eax
+ ret
+ARCHEND(memrchr, baseline)
+
+ .section .note.GNU-stack, "", %progbits
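Note: both memrchr variants walk the buffer forward and keep overwriting the candidate return value whenever a later match is found, so no backwards scan is needed. In plain C the logic is roughly (illustrative sketch, not part of the patch):

    #include <stddef.h>
    #include <stdint.h>

    static void *
    memrchr_sketch(const void *s, int c, size_t n)
    {
    	const unsigned char *p = s;
    	const unsigned char *last = NULL;	/* most recent match seen so far */

    	for (size_t i = 0; i < n; i++)
    		if (p[i] == (unsigned char)c)
    			last = p + i;
    	return ((void *)(uintptr_t)last);
    }

The scalar assembly processes four bytes per iteration with conditional moves so the loop body stays branch-free.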
diff --git a/lib/libc/amd64/string/stpncpy.S b/lib/libc/amd64/string/stpncpy.S
new file mode 100644
index 000000000000..5ce0dd093a9e
--- /dev/null
+++ b/lib/libc/amd64/string/stpncpy.S
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4, 0x90
+
+ .weak stpncpy
+ .set stpncpy, __stpncpy
+ARCHFUNCS(__stpncpy)
+ ARCHFUNC(__stpncpy, scalar)
+ ARCHFUNC(__stpncpy, baseline)
+ENDARCHFUNCS(__stpncpy)
+
+ARCHENTRY(__stpncpy, scalar)
+ push %rbp # establish stack frame
+ mov %rsp, %rbp
+
+ push %rdx
+ push %rdi
+ push %rsi
+ push %rax # dummy push for alignment
+
+ mov %rsi, %rdi
+ xor %esi, %esi
+ call CNAME(__memchr) # memchr(src, '\0', len)
+ pop %rcx # dummy pop
+ pop %rsi
+ mov -16(%rbp), %rdi
+
+ test %rax, %rax # NUL found?
+ jz .Lfullcopy
+
+ mov %rax, %rdx
+ sub %rsi, %rdx # copy until the NUL byte
+ add %rdx, -16(%rbp) # advance destination by string length
+ sub %rdx, -8(%rbp) # and shorten buffer size by string length
+ call CNAME(memcpy)
+
+ pop %rdi
+ pop %rdx
+ xor %esi, %esi
+ pop %rbp
+ jmp CNAME(memset) # clear remaining buffer
+
+.Lfullcopy:
+ mov -8(%rbp), %rdx
+ call CNAME(memcpy) # copy whole string
+ add -8(%rbp), %rax # point to dest[n]
+ leave
+ ret
+ARCHEND(__stpncpy, scalar)
+
+ /*
+ * this mask allows us to generate masks of 16-n 0xff bytes
+ * followed by n 0x00 bytes by loading from .Lmask+n.
+ */
+ .section .rodata
+.Lmask: .quad 0xffffffffffffffff
+ .quad 0xffffffffffffffff
+ .quad 0x0000000000000000
+ .quad 0x0000000000000000
+
+/* stpncpy(char *restrict rdi, const char *rsi, size_t rdx) */
+ARCHENTRY(__stpncpy, baseline)
+#define bounce (-3*16-8) /* location of on-stack bounce buffer */
+
+ test %rdx, %rdx # no bytes to copy?
+ jz .L0
+
+ mov %esi, %ecx
+ and $~0xf, %rsi # align source to 16 bytes
+ movdqa (%rsi), %xmm0 # load head
+ and $0xf, %ecx # offset from alignment
+ mov $-1, %r9d
+ lea -32(%rcx), %rax # set up overflow-proof comparison rdx+rcx<=32
+ shl %cl, %r9d # mask of bytes belonging to the string
+ sub %rcx, %rdi # adjust RDI to correspond to RSI
+ pxor %xmm1, %xmm1
+ movdqa %xmm0, bounce(%rsp) # stash copy of head on the stack
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %r8d
+
+ lea (%rdx, %rcx, 1), %r10 # buffer length from alignment boundary
+ add %rdx, %rax # less than 2 chunks (32 bytes) to play with?
+ jnc .Lrunt # if yes, use special runt processing
+
+ movdqu %xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination
+ and %r9d, %r8d # end of string within head?
+ jnz .Lheadnul
+
+ movdqu (%rsi, %rcx, 1), %xmm2 # load head from source buffer
+ movdqu %xmm2, (%rdi, %rcx, 1) # and deposit
+
+ add $16, %rsi
+ add $16, %rdi
+ sub $32, %r10
+
+ /* main loop unrolled twice */
+ ALIGN_TEXT
+0: movdqa (%rsi), %xmm0
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm0, %xmm1 # NUL byte encountered?
+ pmovmskb %xmm1, %r8d
+ test %r8d, %r8d
+ jnz 3f
+
+ movdqu %xmm0, (%rdi)
+ cmp $16, %r10 # more than a full chunk left?
+ jbe 1f
+
+ movdqa 16(%rsi), %xmm0
+ add $32, %rdi # advance pointers to next chunk
+ add $32, %rsi
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm0, %xmm1 # NUL byte encountered?
+ pmovmskb %xmm1, %r8d
+ test %r8d, %r8d
+ jnz 2f
+
+ movdqu %xmm0, -16(%rdi)
+ sub $32, %r10 # more than another full chunk left?
+ ja 0b
+
+ sub $16, %rdi # undo second advancement
+ sub $16, %rsi
+ add $16, %r10d # restore number of remaining bytes
+
+ /* 1--16 bytes left but string has not ended yet */
+1: pxor %xmm1, %xmm1
+ pcmpeqb 16(%rsi), %xmm1 # NUL byte in source tail?
+ pmovmskb %xmm1, %r8d
+ bts %r10d, %r8d # treat end of buffer as NUL
+ tzcnt %r8d, %r8d # where is the NUL byte?
+ movdqu (%rsi, %r8, 1), %xmm0 # load source tail before NUL
+ lea 16(%rdi, %r8, 1), %rax # point return value to NUL byte
+ # or end of buffer
+ movdqu %xmm0, (%rdi, %r8, 1) # store tail into the buffer
+ ret
+
+2: sub $16, %rdi # undo second advancement
+ sub $16, %rsi
+ sub $16, %r10
+
+ /* string has ended and buffer has not */
+3: tzcnt %r8d, %r8d # where did the string end?
+ lea .Lmask+16(%rip), %rcx
+ lea (%rdi, %r8, 1), %rax # where the NUL byte will be
+ neg %r8
+ movdqu (%rcx, %r8, 1), %xmm1 # mask with FF where the string is,
+ # 00 where it is not
+ pand %xmm1, %xmm0 # mask out bytes after the string
+ movdqu %xmm0, (%rdi) # store masked current chunk
+ pxor %xmm1, %xmm1
+ sub $16, %r10 # another full chunk left?
+ jbe 1f
+
+ /* clear remaining destination buffer (tail has been cleared earlier) */
+ ALIGN_TEXT
+0: movdqu %xmm1, 16(%rdi)
+ cmp $16, %r10
+ jbe 1f
+
+ movdqu %xmm1, 32(%rdi)
+ add $32, %rdi
+ sub $32, %r10
+ ja 0b
+
+1: ret
+
+ /* at least two chunks to play with and NUL while processing head */
+.Lheadnul:
+ movdqu bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack
+ tzcnt %r8d, %r8d # find location of NUL byte
+ movdqu %xmm0, (%rdi, %rcx, 1) # deposit head in the destination
+ movdqu %xmm1, (%rdi, %r8, 1) # clear out following bytes
+ movdqu %xmm1, 16(%rdi) # clear out second chunk
+ lea (%rdi, %r8, 1), %rax # make RAX point to the NUL byte
+
+ add $32, %rdi # advance past first two chunks
+ sub $32+16, %r10 # advance past first three chunks
+ jbe 1f # did we pass the end of the buffer?
+
+ /* clear remaining destination buffer (tail has been cleared earlier) */
+ ALIGN_TEXT
+0: movdqu %xmm1, (%rdi) # clear out buffer chunk
+ cmp $16, %r10
+ jbe 1f
+
+ movdqu %xmm1, 16(%rdi)
+ add $32, %rdi
+ sub $32, %r10
+ ja 0b
+
+1: ret
+
+ /* 1--32 bytes to copy, bounce through the stack */
+.Lrunt: movdqa %xmm1, bounce+16(%rsp) # clear out rest of on-stack copy
+ bts %r10d, %r8d # treat end of buffer as end of string
+ and %r9w, %r8w # end of string within first buffer?
+ jnz 0f # if yes, do not inspect second buffer
+
+ movdqa 16(%rsi), %xmm0 # load second chunk of input
+ movdqa %xmm0, bounce+16(%rsp) # stash copy on stack
+ pcmpeqb %xmm1, %xmm0 # NUL in second chunk?
+ pmovmskb %xmm0, %r9d
+ shl $16, %r9d
+ or %r9d, %r8d # merge found NUL bytes into NUL mask
+
+ /* end of string after one buffer */
+0: tzcnt %r8d, %r8d # location of last char in string
+ movdqu %xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string
+ lea bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack
+ lea (%rdi, %r8, 1), %rax # return pointer to NUL byte
+
+ cmp $16, %edx # at least 16 bytes to transfer?
+ jae .L1631
+
+ mov (%rsi), %r8 # load string head
+ cmp $8, %edx # at least 8 bytes to transfer?
+ jae .L0815
+
+ cmp $4, %edx # at least 4 bytes to transfer?
+ jae .L0407
+
+ movzwl -2(%rsi, %rdx, 1), %esi # load last two bytes of string
+ mov %r8b, (%rdi, %rcx, 1) # store first byte
+
+ cmp $2, %edx # at least 2 bytes to transfer?
+ jb .L1
+
+ mov %si, -2(%rdi, %r10, 1) # store last two bytes of string
+.L1: ret
+
+.L1631: movdqu (%rsi), %xmm0 # load first 16 bytes of string
+ movdqu -16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string
+ movdqu %xmm0, (%rdi, %rcx, 1)
+ movdqu %xmm1, -16(%rdi, %r10, 1)
+ ret
+
+.L0815: mov -8(%rsi, %rdx, 1), %rdx # load last 8 bytes of string
+ mov %r8, (%rdi, %rcx, 1)
+ mov %rdx, -8(%rdi, %r10, 1)
+ ret
+
+.L0407: mov -4(%rsi, %rdx, 1), %edx # load last four bytes of string
+ mov %r8d, (%rdi, %rcx, 1)
+ mov %edx, -4(%rdi, %r10, 1)
+ ret
+
+ /* length 0 buffer: just return dest */
+.L0: mov %rdi, %rax
+ ret
+ARCHEND(__stpncpy, baseline)
+
+ .section .note.GNU-stack,"",%progbits
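Note: the scalar __stpncpy path again delegates to memchr()/memcpy()/memset(); its behaviour matches the usual C formulation of stpncpy(3) (a sketch with a made-up name, not part of the patch):

    #include <string.h>

    static char *
    stpncpy_sketch(char *restrict dst, const char *restrict src, size_t n)
    {
    	const char *nul = memchr(src, '\0', n);

    	if (nul == NULL) {			/* string longer than the buffer */
    		memcpy(dst, src, n);
    		return (dst + n);
    	}

    	size_t len = (size_t)(nul - src);	/* bytes before the NUL */
    	memcpy(dst, src, len);
    	memset(dst + len, '\0', n - len);	/* zero-fill the remainder */
    	return (dst + len);			/* points at the first NUL written */
    }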
diff --git a/lib/libc/amd64/string/strcat.S b/lib/libc/amd64/string/strcat.S
index 0834408acfb7..081e98840cee 100644
--- a/lib/libc/amd64/string/strcat.S
+++ b/lib/libc/amd64/string/strcat.S
@@ -1,6 +1,14 @@
-/*
- * Written by J.T. Conklin <jtc@acorntoolworks.com>
- * Public domain.
+/*-
+ * Copyright (c) 2023, The FreeBSD Foundation
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
+ * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcat.S
+ * written by J.T. Conklin <jtc@acorntoolworks.com>
+ * that was originally dedicated to the public domain
*/
#include <machine/asm.h>
@@ -8,7 +16,14 @@
RCSID("$NetBSD: strcat.S,v 1.4 2004/07/26 18:51:21 drochner Exp $")
#endif
-ENTRY(strcat)
+#include "amd64_archlevel.h"
+
+ARCHFUNCS(strcat)
+ ARCHFUNC(strcat, scalar)
+ ARCHFUNC(strcat, baseline)
+ENDARCHFUNCS(strcat)
+
+ARCHENTRY(strcat, scalar)
movq %rdi,%rax
movabsq $0x0101010101010101,%r8
movabsq $0x8080808080808080,%r9
@@ -161,6 +176,28 @@ ENTRY(strcat)
.Ldone:
ret
-END(strcat)
+ARCHEND(strcat, scalar)
+
+/*
+ * Call into strlen + strcpy if we have any SIMD at all.
+ * The scalar implementation above is better for the scalar
+ * case as it avoids the function call overhead, but is pessimal
+ * if we could call SIMD routines instead.
+ */
+ARCHENTRY(strcat, baseline)
+ push %rbp
+ mov %rsp, %rbp
+ push %rsi
+ push %rbx
+ mov %rdi, %rbx # remember destination for later
+ call CNAME(strlen) # strlen(dest)
+ mov -8(%rbp), %rsi
+ lea (%rbx, %rax, 1), %rdi # dest + strlen(dest)
+ call CNAME(__stpcpy) # stpcpy(dest + strlen(dest), src)
+ mov %rbx, %rax # return dest
+ pop %rbx
+ leave
+ ret
+ARCHEND(strcat, baseline)
.section .note.GNU-stack,"",%progbits
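Note: the baseline strcat is exactly the composition described in its comment; in C it would read roughly (sketch, not part of the patch):

    #include <string.h>

    static char *
    strcat_sketch(char *restrict dst, const char *restrict src)
    {
    	(void)stpcpy(dst + strlen(dst), src);	/* append src after the existing string */
    	return (dst);				/* strcat() returns its first argument */
    }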
diff --git a/lib/libc/amd64/string/strcmp.S b/lib/libc/amd64/string/strcmp.S
index 437db7eca43a..eb354bd2af82 100644
--- a/lib/libc/amd64/string/strcmp.S
+++ b/lib/libc/amd64/string/strcmp.S
@@ -1,14 +1,33 @@
-/*
- * Written by J.T. Conklin <jtc@acorntoolworks.com>
- * Public domain.
+/*-
+ * Copyright (c) 2023, The FreeBSD Foundation
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
+ * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcmp.S
+ * written by J.T. Conklin <jtc@acorntoolworks.com> that was originally
+ * dedicated to the public domain.
*/
#include <machine/asm.h>
+#include <machine/param.h>
+
#if 0
RCSID("$NetBSD: strcmp.S,v 1.3 2004/07/19 20:04:41 drochner Exp $")
#endif
-ENTRY(strcmp)
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4, 0x90
+
+ARCHFUNCS(strcmp)
+ ARCHFUNC(strcmp, scalar)
+ ARCHFUNC(strcmp, baseline)
+ENDARCHFUNCS(strcmp)
+
+ARCHENTRY(strcmp, scalar)
/*
* Align s1 to word boundary.
* Consider unrolling loop?
@@ -39,7 +58,7 @@ ENTRY(strcmp)
movabsq $0x8080808080808080,%r9
subq $8,%rsi
- .align 4
+ ALIGN_TEXT
.Lword_loop:
movq 8(%rdi),%rax
addq $8,%rdi
@@ -53,7 +72,7 @@ ENTRY(strcmp)
testq %r9,%rdx
je .Lword_loop
- .align 4
+ ALIGN_TEXT
.Lbyte_loop:
movb (%rdi),%al
incq %rdi
@@ -69,6 +88,272 @@ ENTRY(strcmp)
movzbq %dl,%rdx
subq %rdx,%rax
ret
-END(strcmp)
+ARCHEND(strcmp, scalar)
+
+ARCHENTRY(strcmp, baseline)
+ /* check if either string crosses a page in the head */
+ lea 15(%rdi), %r8d # end of head
+ lea 15(%rsi), %r9d
+ mov %edi, %eax
+ mov %esi, %edx
+ xor %edi, %r8d # bits that changed between first and last byte
+ xor %esi, %r9d
+ and $~0xf, %rdi # align heads to 16 bytes
+ and $~0xf, %rsi
+ or %r8d, %r9d # in either RSI or RDI
+ and $0xf, %eax # offset from alignment
+ and $0xf, %edx
+ pxor %xmm1, %xmm1
+ test $PAGE_SIZE, %r9d # did the page change?
+ jz 0f # if not, take fast path
+
+ /* heads may cross page boundary, avoid unmapped loads */
+ movdqa (%rdi), %xmm0 # load aligned heads
+ movdqa (%rsi), %xmm2
+ mov $-1, %r8d
+ mov $-1, %r9d
+ mov %eax, %ecx
+ shl %cl, %r8d # string head in XMM0
+ mov %edx, %ecx
+ shl %cl, %r9d # string head in XMM2
+ movdqa %xmm0, -40(%rsp) # stash copies of the heads on the stack
+ movdqa %xmm2, -24(%rsp)
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm0, %r10d
+ pmovmskb %xmm2, %r11d
+ test %r8d, %r10d # NUL byte present in first string?
+ lea -40(%rsp), %r8
+ cmovz %rdi, %r8
+ test %r9d, %r11d # NUL byte present in second string?
+ lea -24(%rsp), %r9
+ cmovz %rsi, %r9
+ movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads
+ movdqu (%r9, %rdx, 1), %xmm4
+ jmp 1f
+
+0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads
+ movdqu (%rsi, %rdx, 1), %xmm4
+1: pxor %xmm2, %xmm2
+ pcmpeqb %xmm0, %xmm2 # NUL byte present?
+ pcmpeqb %xmm0, %xmm4 # which bytes match?
+ pandn %xmm4, %xmm2 # match and not NUL byte?
+ pmovmskb %xmm2, %r9d
+ xor $0xffff, %r9d # mismatch or NUL byte?
+ jnz .Lhead_mismatch
+
+ /* load head and second chunk */
+ movdqa 16(%rdi), %xmm2 # load second chunks
+ movdqa 16(%rsi), %xmm3
+ sub %rdx, %rax # is a&0xf >= b&0xf?
+ jb .Lswapped # if not, proceed with swapped operands
+
+ neg %rax
+ movdqu 16(%rsi, %rax, 1), %xmm0
+ sub %rdi, %rsi # express RSI as distance from RDI
+ lea (%rsi, %rax, 1), %rdx # point RDX to offset in second string
+ neg %rax
+ pcmpeqb %xmm3, %xmm1 # ... corresponding to RDI
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ add $16, %rdi
+ test %r8d, %r8d
+ jnz .Lnul_found
+ xor $0xffff, %r9d
+ jnz .Lmismatch
+ add $16, %rdi # advance aligned pointers
+
+ /*
+ * During the main loop, the layout of the two strings is something like:
+ *
+ * v ------1------ v ------2------ v
+ * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
+ * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
+ *
+ * where v indicates the alignment boundaries and corresponding chunks
+ * of the strings have the same letters. Chunk A has been checked in
+ * the previous iteration. This iteration, we first check that string
+ * RSI doesn't end within region 2, then we compare chunk B between the
+ * two strings. As RSI is known not to hold a NUL byte in regions 1
+ * and 2 at this point, this also ensures that RDI has not ended yet.
+ */
+ ALIGN_TEXT
+0: movdqu (%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
+ pxor %xmm1, %xmm1
+ pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI?
+ pcmpeqb (%rdi), %xmm0 # where do the chunks match?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ test %r8d, %r8d
+ jnz .Lnul_found
+ xor $0xffff, %r9d # any mismatches?
+ jnz .Lmismatch
+
+ /* main loop unrolled twice */
+ movdqu 16(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
+ pxor %xmm1, %xmm1
+ pcmpeqb 16(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
+ pcmpeqb 16(%rdi), %xmm0 # where do the chunks match?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ add $32, %rdi
+ test %r8d, %r8d
+ jnz .Lnul_found2
+ xor $0xffff, %r9d # any mismatches?
+ jz 0b
+
+ sub $16, %rdi # roll back second increment
+
+ /* a mismatch has been found between RDI and RDX */
+.Lmismatch:
+ tzcnt %r9d, %r9d # where is the mismatch?
+ add %rdi, %rdx # turn RDX from offset to pointer
+ movzbl (%rdx, %r9, 1), %ecx
+ movzbl (%rdi, %r9, 1), %eax
+ sub %ecx, %eax # difference of the mismatching chars
+ ret
+
+ /* mismatch in true heads */
+.Lhead_mismatch:
+ tzcnt %r9d, %r9d # where is the mismatch?
+ add %rax, %rdi # return to true heads
+ add %rdx, %rsi
+ movzbl (%rdi, %r9, 1), %eax # mismatching characters
+ movzbl (%rsi, %r9, 1), %ecx
+ sub %ecx, %eax
+ ret
+
+.Lnul_found2:
+ sub $16, %rdi # roll back second increment
+
+ /* a NUL has been found in RSI */
+.Lnul_found:
+ mov %eax, %ecx
+ mov %r8d, %r10d
+ shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX
+ xor $0xffff, %r9d # mask of mismatches
+ or %r8d, %r9d # NUL bytes also count as mismatches
+ jnz .Lmismatch
+
+ /*
+ * (RDI) == (RSI) and NUL is past the string.
+ * Compare (RSI) with the corresponding part
+ * of the other string until the NUL byte.
+ */
+ movdqu (%rdi, %rax, 1), %xmm0
+ pcmpeqb (%rdi, %rsi, 1), %xmm0
+ add %rdi, %rsi # restore RSI pointer
+ add %rax, %rdi # point RDI to chunk corresponding to (RSI)
+ pmovmskb %xmm0, %ecx # mask of matches
+ not %ecx # mask of mismatches
+ or %r10d, %ecx # mask of mismatches or NUL bytes
+ tzcnt %ecx, %ecx # location of first mismatch
+ movzbl (%rdi, %rcx, 1), %eax
+ movzbl (%rsi, %rcx, 1), %ecx
+ sub %ecx, %eax
+ ret
+
+ /*
+ * If (a&0xf) < (b&0xf), we do the same thing but with swapped
+ * operands. I found that this performs slightly better than
+ * using conditional moves to do the swap branchless.
+ */
+.Lswapped:
+ movdqu 16(%rdi, %rax, 1), %xmm0
+ sub %rsi, %rdi # express RDI as distance from RSI
+ lea (%rdi, %rax, 1), %rdx # point RDX to offset in RDI corresponding to RSI
+ neg %rax # make difference positive
+ pcmpeqb %xmm2, %xmm1
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ add $16, %rsi # advance aligned pointers
+ test %r8d, %r8d
+ jnz .Lnul_founds
+ xor $0xffff, %r9d
+ jnz .Lmismatchs
+ add $16, %rsi
+
+ /*
+ * During the main loop, the layout of the two strings is something like:
+ *
+ * v ------1------ v ------2------ v
+ * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
+ * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
+ *
+ * where v indicates the alignment boundaries and corresponding chunks
+ * of the strings have the same letters. Chunk A has been checked in
+ * the previous iteration. This iteration, we first check that string
+ * RSI doesn't end within region 2, then we compare chunk B between the
+ * two strings. As RSI is known not to hold a NUL byte in regions 1
+ * and 2 at this point, this also ensures that RDI has not ended yet.
+ */
+ ALIGN_TEXT
+0: movdqu (%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
+ pxor %xmm1, %xmm1
+ pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RSI?
+ pcmpeqb (%rsi), %xmm0 # where do the chunks match?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ test %r8d, %r8d
+ jnz .Lnul_founds
+ xor $0xffff, %r9d # any mismatches?
+ jnz .Lmismatchs
+
+ /* main loop unrolled twice */
+ movdqu 16(%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
+ pxor %xmm1, %xmm1
+ pcmpeqb 16(%rsi, %rdi, 1), %xmm1 # end of string in RSI?
+ pcmpeqb 16(%rsi), %xmm0 # where do the chunks match?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ add $32, %rsi
+ test %r8d, %r8d
+ jnz .Lnul_found2s
+ xor $0xffff, %r9d # any mismatches?
+ jz 0b
+
+ sub $16, %rsi # roll back second increment
+
+ /* a mismatch has been found between RSI and RDX */
+.Lmismatchs:
+ tzcnt %r9d, %r9d # where is the mismatch?
+ add %rsi, %rdx # turn RDX from offset to pointer
+ movzbl (%rdx, %r9, 1), %eax
+ movzbl (%rsi, %r9, 1), %ecx
+ sub %ecx, %eax # difference of the mismatching chars
+ ret
+
+.Lnul_found2s:
+ sub $16, %rsi # roll back second increment
+
+ /* a NUL has been found in RSI */
+.Lnul_founds:
+ mov %eax, %ecx
+ mov %r8d, %r10d
+ shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX
+ xor $0xffff, %r9d # mask of mismatches
+ or %r8d, %r9d # NUL bytes also count as mismatches
+ jnz .Lmismatchs
+
+ /*
+ * (RDI) == (RSI) and NUL is past the string.
+ * Compare (RSI) with the corresponding part
+ * of the other string until the NUL byte.
+ */
+ movdqu (%rsi, %rax, 1), %xmm0
+ pcmpeqb (%rsi, %rdi, 1), %xmm0
+ add %rsi, %rdi # restore RDI pointer
+ add %rax, %rsi # point RSI to chunk corresponding to (RDI)
+ pmovmskb %xmm0, %ecx # mask of matches
+ not %ecx # mask of mismatches
+ or %r10d, %ecx # mask of mismatches or NUL bytes
+ tzcnt %ecx, %ecx # location of first mismatch
+ movzbl (%rdi, %rcx, 1), %eax
+ movzbl (%rsi, %rcx, 1), %ecx
+ sub %ecx, %eax
+ ret
+ARCHEND(strcmp, baseline)
.section .note.GNU-stack,"",%progbits
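Note: the page-crossing test at the top of the strcmp baseline checks whether the first and last byte of a 16-byte head lie on different pages, i.e. whether an unaligned 16-byte load could touch an unmapped page. The same check in C (a sketch, not part of the patch; PAGE_SIZE assumed to be the amd64 base page size from <machine/param.h>):

    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096	/* assumption: amd64 base page size */

    /* true if a 16-byte load at p may touch the following page */
    static inline bool
    head_crosses_page(const void *p)
    {
    	uintptr_t first = (uintptr_t)p;
    	uintptr_t last = first + 15;	/* address of the last byte of the head */

    	return (((first ^ last) & PAGE_SIZE) != 0);
    }

The assembly ORs the two per-string results before testing, so if either head crosses a page the code falls back to aligned loads plus a bounce copy on the stack.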
diff --git a/lib/libc/amd64/string/strcpy.c b/lib/libc/amd64/string/strcpy.c
index fbc661462ff2..eb93b0defbaa 100644
--- a/lib/libc/amd64/string/strcpy.c
+++ b/lib/libc/amd64/string/strcpy.c
@@ -27,7 +27,6 @@
* SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
char *__stpcpy(char * __restrict, const char * __restrict);
char *
diff --git a/lib/libc/amd64/string/strcspn.S b/lib/libc/amd64/string/strcspn.S
new file mode 100644
index 000000000000..7ebd7a847d67
--- /dev/null
+++ b/lib/libc/amd64/string/strcspn.S
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+ .weak strcspn
+ .set strcspn, __strcspn
+ARCHFUNCS(__strcspn)
+ ARCHFUNC(__strcspn, scalar)
+ NOARCHFUNC
+ ARCHFUNC(__strcspn, x86_64_v2)
+ENDARCHFUNCS(__strcspn)
+
+ARCHENTRY(__strcspn, scalar)
+ push %rbp # align stack to enable function call
+ mov %rsp, %rbp
+ sub $256, %rsp # allocate space for lookup table
+
+ /* check for special cases */
+ movzbl (%rsi), %eax # first character in the set
+ test %eax, %eax
+ jz .Lstrlen
+
+ movzbl 1(%rsi), %edx # second character in the set
+ test %edx, %edx
+ jz .Lstrchr
+
+ /* no special case matches -- prepare lookup table */
+ xor %r8d, %r8d
+ mov $28, %ecx
+0: mov %r8, (%rsp, %rcx, 8)
+ mov %r8, 8(%rsp, %rcx, 8)
+ mov %r8, 16(%rsp, %rcx, 8)
+ mov %r8, 24(%rsp, %rcx, 8)
+ sub $4, %ecx
+ jnc 0b
+
+ add $2, %rsi
+ movb $1, (%rsp, %rax, 1) # register first chars in set
+ movb $1, (%rsp, %rdx, 1)
+ mov %rdi, %rax # a copy of the source to iterate over
+
+ /* process remaining chars in set */
+ ALIGN_TEXT
+0: movzbl (%rsi), %ecx
+ movb $1, (%rsp, %rcx, 1)
+ test %ecx, %ecx
+ jz 1f
+
+ movzbl 1(%rsi), %ecx
+ movb $1, (%rsp, %rcx, 1)
+ test %ecx, %ecx
+ jz 1f
+
+ add $2, %rsi
+ jmp 0b
+
+ /* find match */
+ ALIGN_TEXT
+1: movzbl (%rax), %ecx
+ cmpb $0, (%rsp, %rcx, 1)
+ jne 2f
+
+ movzbl 1(%rax), %ecx
+ cmpb $0, (%rsp, %rcx, 1)
+ jne 3f
+
+ movzbl 2(%rax), %ecx
+ cmpb $0, (%rsp, %rcx, 1)
+ jne 4f
+
+ movzbl 3(%rax), %ecx
+ add $4, %rax
+ cmpb $0, (%rsp, %rcx, 1)
+ je 1b
+
+ sub $3, %rax
+4: dec %rdi
+3: inc %rax
+2: sub %rdi, %rax # number of characters preceding match
+ leave
+ ret
+
+ /* set is empty, degrades to strlen */
+.Lstrlen:
+ leave
+ jmp CNAME(strlen)
+
+ /* just one character in set, degrades to strchr */
+.Lstrchr:
+ mov %rdi, (%rsp) # stash a copy of the string
+ mov %eax, %esi # find the character in the set
+ call CNAME(strchrnul)
+ sub (%rsp), %rax # length of prefix before match
+ leave
+ ret
+ARCHEND(__strcspn, scalar)
+
+ /*
+ * This kernel uses pcmpistri to do the heavy lifting.
+ * We provide five code paths, depending on set size:
+ *
+ * 0: call strlen()
+ * 1: call strchr()
+ * 2--16: one pcmpistri per 16 bytes of input
+ * 17--32: two pcmpistri per 16 bytes of input
+	 * >=33:   fall back to lookup table
+ */
+ARCHENTRY(__strcspn, x86_64_v2)
+ push %rbp
+ mov %rsp, %rbp
+ sub $256, %rsp
+
+ /* check for special cases */
+ movzbl (%rsi), %eax
+ test %eax, %eax # empty string?
+ jz .Lstrlenv2
+
+ cmpb $0, 1(%rsi) # single character string?
+ jz .Lstrchrv2
+
+ /* find set size and copy up to 32 bytes to (%rsp) */
+ mov %esi, %ecx
+ and $~0xf, %rsi # align set pointer
+ movdqa (%rsi), %xmm0
+ pxor %xmm1, %xmm1
+ and $0xf, %ecx # amount of bytes rsi is past alignment
+ xor %edx, %edx
+ pcmpeqb %xmm0, %xmm1 # end of string reached?
+ movdqa %xmm0, 32(%rsp) # transfer head of set to stack
+ pmovmskb %xmm1, %eax
+ shr %cl, %eax # clear out junk before string
+ test %eax, %eax # end of set reached?
+ jnz 0f
+
+ movdqa 16(%rsi), %xmm0 # second chunk of the set
+ mov $16, %edx
+ sub %ecx, %edx # length of set preceding xmm0
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm0, %xmm1
+ movdqa %xmm0, 48(%rsp)
+ movdqu 32(%rsp, %rcx, 1), %xmm2 # head of set
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz 1f
+
+ movdqa 32(%rsi), %xmm0 # third chunk
+ add $16, %edx
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm0, %xmm1
+ movdqa %xmm0, 64(%rsp)
+ pmovmskb %xmm1, %eax
+ test %eax, %eax # still not done?
+ jz .Lgt32v2
+
+0: movdqu 32(%rsp, %rcx, 1), %xmm2 # head of set
+1: tzcnt %eax, %eax
+ add %eax, %edx # length of set (excluding NUL byte)
+ cmp $32, %edx # above 32 bytes?
+ ja .Lgt32v2
+
+ /*
+ * At this point we know that we want to use pcmpistri.
+	 * One last problem remains: the head of the string is not
+	 * aligned and may cross a page boundary.  If this is the case,
+ * we take the part before the page boundary and repeat the
+ * last byte to fill up the xmm register.
+ */
+ mov %rdi, %rax # save original string pointer
+ lea 15(%rdi), %esi # last byte of the head
+ xor %edi, %esi
+ test $PAGE_SIZE, %esi # does the head cross a page?
+ jz 0f
+
+ /* head crosses page: copy to stack to fix up */
+ and $~0xf, %rax # align head pointer temporarily
+ movzbl 15(%rax), %esi # last head byte on the page
+ movdqa (%rax), %xmm0
+ movabs $0x0101010101010101, %r8
+ imul %r8, %rsi # repeated 8 times
+ movdqa %xmm0, (%rsp) # head word on stack
+ mov %rsi, 16(%rsp) # followed by filler (last byte x8)
+ mov %rsi, 24(%rsp)
+ mov %edi, %eax
+ and $0xf, %eax # offset of head from alignment
+ add %rsp, %rax # pointer to fake head
+
+0: movdqu (%rax), %xmm0 # load head (fake or real)
+ lea 16(%rdi), %rax
+ and $~0xf, %rax # second 16 bytes of string (aligned)
+1: cmp $16, %edx # 16--32 bytes?
+ ja .Lgt16v2
+
+
+ /* set is 2--16 bytes in size */
+
+ /* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT */
+ pcmpistri $0, %xmm0, %xmm2 # match in head?
+ jbe .Lheadmatchv2
+
+ ALIGN_TEXT
+0: pcmpistri $0, (%rax), %xmm2
+ jbe 1f # match or end of string?
+ pcmpistri $0, 16(%rax), %xmm2
+ lea 32(%rax), %rax
+ ja 0b # match or end of string?
+
+3: lea -16(%rax), %rax # go back to second half
+1: jc 2f # jump if match found
+ movdqa (%rax), %xmm0 # reload string piece
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm1, %xmm0 # where is the NUL byte?
+ pmovmskb %xmm0, %ecx
+ tzcnt %ecx, %ecx # location of NUL byte in (%rax)
+2: sub %rdi, %rax # offset of %xmm0 from beginning of string
+ add %rcx, %rax # prefix length before match/NUL
+ leave
+ ret
+
+.Lheadmatchv2:
+ jc 2f # jump if match found
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %ecx
+ tzcnt %ecx, %ecx # location of NUL byte
+2: mov %ecx, %eax # prefix length before match/NUL
+ leave
+ ret
+
+ /* match in first set half during head */
+.Lheadmatchv2first:
+ mov %ecx, %eax
+ pcmpistri $0, %xmm0, %xmm3 # match in second set half?
+ cmp %ecx, %eax # before the first half match?
+ cmova %ecx, %eax # use the earlier match
+ leave
+ ret
+
+.Lgt16v2:
+ movdqu 48(%rsp, %rcx, 1), %xmm3 # second part of set
+
+ /* set is 17--32 bytes in size */
+ pcmpistri $0, %xmm0, %xmm2 # match in first set half?
+ jb .Lheadmatchv2first
+ pcmpistri $0, %xmm0, %xmm3 # match in second set half or end of string?
+ jbe .Lheadmatchv2
+
+ ALIGN_TEXT
+0: movdqa (%rax), %xmm0
+ pcmpistri $0, %xmm0, %xmm2
+ jb 4f # match in first set half?
+ pcmpistri $0, %xmm0, %xmm3
+ jbe 1f # match in second set half or end of string?
+ movdqa 16(%rax), %xmm0
+ add $32, %rax
+ pcmpistri $0, %xmm0, %xmm2
+ jb 3f # match in first set half?
+ pcmpistri $0, %xmm0, %xmm3
+ ja 0b # neither match in 2nd half nor string end?
+
+ /* match in second half or NUL */
+ lea -16(%rax), %rax # go back to second half
+1: jc 2f # jump if match found
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm1, %xmm0 # where is the NUL byte?
+ pmovmskb %xmm0, %ecx
+ tzcnt %ecx, %ecx # location of NUL byte in (%rax)
+2: sub %rdi, %rax # offset of %xmm0 from beginning of string
+ add %rcx, %rax # prefix length before match/NUL
+ leave
+ ret
+
+ /* match in first half */
+3: sub $16, %rax # go back to second half
+4: sub %rdi, %rax # offset of %xmm0 from beginning of string
+ mov %ecx, %edx
+ pcmpistri $0, %xmm0, %xmm3 # match in second set half?
+ cmp %ecx, %edx # before the first half match?
+ cmova %ecx, %edx # use the earlier match
+	add	%rdx, %rax		# return full offset
+ leave
+ ret
+
+ /* set is empty, degrades to strlen */
+.Lstrlenv2:
+ leave
+ jmp CNAME(strlen)
+
+ /* just one character in set, degrades to strchr */
+.Lstrchrv2:
+ mov %rdi, (%rsp) # stash a copy of the string
+ mov %eax, %esi # find this character
+ call CNAME(strchrnul)
+ sub (%rsp), %rax # length of prefix before match
+ leave
+ ret
+
+ /* set is >=33 bytes in size */
+.Lgt32v2:
+ xorps %xmm0, %xmm0
+ mov $256-64, %edx
+
+ /* clear out look up table */
+0: movaps %xmm0, (%rsp, %rdx, 1)
+ movaps %xmm0, 16(%rsp, %rdx, 1)
+ movaps %xmm0, 32(%rsp, %rdx, 1)
+ movaps %xmm0, 48(%rsp, %rdx, 1)
+ sub $64, %edx
+ jnc 0b
+
+ add %rcx, %rsi # restore string pointer
+ mov %rdi, %rax # keep a copy of the string
+
+ /* initialise look up table */
+ ALIGN_TEXT
+0: movzbl (%rsi), %ecx
+ movb $1, (%rsp, %rcx, 1)
+ test %ecx, %ecx
+ jz 1f
+
+ movzbl 1(%rsi), %ecx
+ movb $1, (%rsp, %rcx, 1)
+ test %ecx, %ecx
+ jz 1f
+
+ movzbl 2(%rsi), %ecx
+ movb $1, (%rsp, %rcx, 1)
+ test %ecx, %ecx
+ jz 1f
+
+ movzbl 3(%rsi), %ecx
+ movb $1, (%rsp, %rcx, 1)
+ test %ecx, %ecx
+ jz 1f
+
+ add $4, %rsi
+ jmp 0b
+
+ /* find match */
+ ALIGN_TEXT
+1: movzbl (%rax), %ecx
+ cmpb $0, (%rsp, %rcx, 1)
+ jne 2f
+
+ movzbl 1(%rax), %ecx
+ cmpb $0, (%rsp, %rcx, 1)
+ jne 3f
+
+ movzbl 2(%rax), %ecx
+ cmpb $0, (%rsp, %rcx, 1)
+ jne 4f
+
+ movzbl 3(%rax), %ecx
+ add $4, %rax
+ cmpb $0, (%rsp, %rcx, 1)
+ je 1b
+
+ sub $3, %rax
+4: dec %rdi
+3: inc %rax
+2: sub %rdi, %rax # number of characters preceding match
+ leave
+ ret
+ARCHEND(__strcspn, x86_64_v2)
+
+ .section .note.GNU-stack,"",%progbits
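Editorial note: for reference, a portable C sketch of the table-driven strategy the scalar strcspn path above implements (illustrative only; the committed assembly additionally unrolls its loops, special-cases sets of size 0 and 1, and provides an SSE4.2 pcmpistri variant):

#include <stddef.h>

/*
 * Portable sketch: mark set members in a 256-entry table, then count
 * leading characters of s that are not members.  Marking NUL makes the
 * scan stop at the end of s.
 */
static size_t
strcspn_ref(const char *s, const char *set)
{
	unsigned char table[256] = { 0 };
	const unsigned char *p = (const unsigned char *)s;
	size_t i;

	table[0] = 1;
	while (*set != '\0')
		table[(unsigned char)*set++] = 1;

	for (i = 0; table[p[i]] == 0; i++)
		;

	return (i);
}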
diff --git a/lib/libc/amd64/string/strlcat.c b/lib/libc/amd64/string/strlcat.c
new file mode 100644
index 000000000000..94fdc0963dc3
--- /dev/null
+++ b/lib/libc/amd64/string/strlcat.c
@@ -0,0 +1,27 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Robert Clausecker
+ */
+
+#include <sys/cdefs.h>
+
+#include <string.h>
+
+#undef strlcat			/* _FORTIFY_SOURCE */
+
+void *__memchr(const void *, int, size_t);
+size_t __strlcpy(char *restrict, const char *restrict, size_t);
+
+size_t
+strlcat(char *restrict dst, const char *restrict src, size_t dstsize)
+{
+ char *loc = __memchr(dst, '\0', dstsize);
+
+ if (loc != NULL) {
+ size_t dstlen = (size_t)(loc - dst);
+
+ return (dstlen + __strlcpy(loc, src, dstsize - dstlen));
+ } else
+ return (dstsize + strlen(src));
+}
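Editorial note: a short usage example of the semantics this wrapper preserves: the return value is the length strlcat() tried to create, so a result >= dstsize indicates truncation (example code, not part of the commit):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[8] = "foo";
	size_t r = strlcat(buf, "barbaz", sizeof(buf));

	/* buf is "foobarb" (truncated, still NUL-terminated);
	 * r is 9 == strlen("foo") + strlen("barbaz"), so
	 * r >= sizeof(buf) signals that truncation occurred. */
	printf("%s %zu\n", buf, r);
	return (0);
}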
diff --git a/lib/libc/amd64/string/strlcpy.S b/lib/libc/amd64/string/strlcpy.S
new file mode 100644
index 000000000000..2b32c6c78047
--- /dev/null
+++ b/lib/libc/amd64/string/strlcpy.S
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4, 0x90
+
+ .weak strlcpy
+ .set strlcpy, __strlcpy
+ARCHFUNCS(__strlcpy)
+ ARCHFUNC(__strlcpy, scalar)
+ ARCHFUNC(__strlcpy, baseline)
+ENDARCHFUNCS(__strlcpy)
+
+ARCHENTRY(__strlcpy, scalar)
+ push %rbp # establish stack frame
+ mov %rsp, %rbp
+ push %rsi
+ push %rbx
+ push %rdi
+ push %rdx
+ mov %rsi, %rdi
+ call CNAME(strlen) # strlen(src)
+ pop %rdx
+ pop %rdi
+ mov -8(%rbp), %rsi
+ mov %rax, %rbx # remember string length for return value
+ sub $1, %rdx # do not copy into the final byte of the buffer
+ jc 0f # skip copying altogether if buffer was empty
+ cmp %rax, %rdx # is the buffer longer than the input?
+ cmova %rax, %rdx # if yes, only copy the part that fits
+ movb $0, (%rdi, %rdx, 1) # NUL-terminate output buffer
+ call CNAME(memcpy) # copy string to output
+0: mov %rbx, %rax # restore return value
+ pop %rbx
+ leave
+ ret
+ARCHEND(__strlcpy, scalar)
+
+ARCHENTRY(__strlcpy, baseline)
+ sub $1, %rdx # do not count NUL byte in buffer length
+ jb .L0 # go to special code path if len was 0
+
+ mov %esi, %ecx
+ pxor %xmm1, %xmm1
+ mov %rsi, %r9 # stash a copy of the source pointer for later
+ and $~0xf, %rsi
+ pcmpeqb (%rsi), %xmm1 # NUL found in head?
+ mov $-1, %r8d
+ and $0xf, %ecx
+ shl %cl, %r8d # mask of bytes in the string
+ pmovmskb %xmm1, %eax
+ and %r8d, %eax
+ jnz .Lhead_nul
+
+ movdqa 16(%rsi), %xmm3 # load second string chunk
+ movdqu (%r9), %xmm2 # load unaligned string head
+ mov $32, %r8d
+ sub %ecx, %r8d # head length + length of second chunk
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm3, %xmm1 # NUL found in second chunk?
+
+ sub %r8, %rdx # enough space left for the second chunk?
+ jbe .Lhead_buf_end
+
+ /* process second chunk */
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz .Lsecond_nul
+
+ /* string didn't end in second chunk and neither did buffer -- not a runt! */
+ movdqa 32(%rsi), %xmm0 # load next string chunk
+ pxor %xmm1, %xmm1
+ movdqu %xmm2, (%rdi) # deposit head into buffer
+ sub %rcx, %rdi # adjust RDI to correspond to RSI
+ movdqu %xmm3, 16(%rdi) # deposit second chunk
+ sub %rsi, %rdi # express RDI as distance from RSI
+ add $32, %rsi # advance RSI past first two chunks
+ sub $16, %rdx # enough left for another round?
+ jbe 1f
+
+ /* main loop unrolled twice */
+ ALIGN_TEXT
+0: pcmpeqb %xmm0, %xmm1 # NUL byte encountered?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz 3f
+
+ movdqu %xmm0, (%rsi, %rdi)
+ movdqa 16(%rsi), %xmm0 # load next string chunk
+ pxor %xmm1, %xmm1
+ cmp $16, %rdx # more than a full chunk left?
+ jbe 2f
+
+ add $32, %rsi # advance pointers to next chunk
+ pcmpeqb %xmm0, %xmm1 # NUL byte encountered?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz 4f
+
+ movdqu %xmm0, -16(%rsi, %rdi)
+ movdqa (%rsi), %xmm0 # load next string chunk
+ pxor %xmm1, %xmm1
+ sub $32, %rdx
+ ja 0b
+
+1: sub $16, %rsi # undo second advancement
+ add $16, %edx
+
+ /* 1--16 bytes left in the buffer but string has not ended yet */
+2: pcmpeqb %xmm1, %xmm0 # NUL byte encountered?
+ pmovmskb %xmm0, %r8d
+ mov %r8d, %eax
+ bts %edx, %r8d # treat end of buffer as end of string
+ tzcnt %r8d, %r8d # find tail length
+ add %rsi, %rdi # restore RDI
+ movdqu (%rsi, %r8, 1), %xmm0 # load string tail
+ movdqu %xmm0, (%rdi, %r8, 1) # store string tail
+ movb $0, 16(%rdi, %r8, 1) # NUL terminate
+
+ /* continue to find the end of the string */
+ test %eax, %eax # end of string already reached?
+ jnz 1f
+
+ ALIGN_TEXT
+0: pcmpeqb 32(%rsi), %xmm1
+ pmovmskb %xmm1, %eax
+ pxor %xmm1, %xmm1
+ test %eax, %eax
+ jnz 2f
+
+ pcmpeqb 48(%rsi), %xmm1
+ pmovmskb %xmm1, %eax
+ add $32, %rsi
+ pxor %xmm1, %xmm1
+ test %eax, %eax
+ jz 0b
+
+1: sub $16, %rsi # undo second advancement
+2: tzcnt %eax, %eax # where is the NUL byte?
+ sub %r9, %rsi
+ lea 32(%rsi, %rax, 1), %rax # return string length
+ ret
+
+4: sub $16, %rsi # undo second advancement
+ add $16, %rdx # restore number of remaining bytes
+
+ /* string has ended but buffer has not */
+3: tzcnt %eax, %eax # find length of string tail
+ movdqu -15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL)
+ add %rsi, %rdi # restore destination pointer
+ movdqu %xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL)
+ sub %r9, %rsi # string length to current chunk
+ add %rsi, %rax # plus length of current chunk
+ ret
+
+.Lhead_buf_end:
+ pmovmskb %xmm1, %r8d
+ add $32, %edx # restore edx to (len-1) + ecx
+ mov %r8d, %eax
+ shl $16, %r8d # place 2nd chunk NUL mask into bits 16--31
+ bts %rdx, %r8 # treat end of buffer as end of string
+	tzcnt	%r8, %rdx		# find string/buffer len from alignment boundary
+ sub %ecx, %edx # find actual string/buffer len
+ movb $0, (%rdi, %rdx, 1) # write NUL terminator
+
+ /* continue to find the end of the string */
+ test %eax, %eax # end of string already reached?
+ jnz 1f
+
+ ALIGN_TEXT
+0: pcmpeqb 32(%rsi), %xmm1
+ pmovmskb %xmm1, %eax
+ pxor %xmm1, %xmm1
+ test %eax, %eax
+ jnz 2f
+
+ pcmpeqb 48(%rsi), %xmm1
+ pmovmskb %xmm1, %eax
+ add $32, %rsi
+ pxor %xmm1, %xmm1
+ test %eax, %eax
+ jz 0b
+
+1: sub $16, %rsi
+2: tzcnt %eax, %eax
+ sub %r9, %rsi
+ lea 32(%rsi, %rax, 1), %rax # return string length
+ jmp .L0031
+
+.Lsecond_nul:
+ add %r8, %rdx # restore buffer length
+ tzcnt %eax, %eax # where is the NUL byte?
+ lea -16(%rcx), %r8d
+ sub %r8d, %eax # string length
+ cmp %rax, %rdx # is the string shorter than the buffer?
+ cmova %rax, %rdx # copy only min(buflen, srclen) bytes
+ movb $0, (%rdi, %rdx, 1) # write NUL terminator
+.L0031: cmp $16, %rdx # at least 16 bytes to copy (not incl NUL)?
+ jb .L0015
+
+ /* copy 16--31 bytes */
+ movdqu (%r9), %xmm0 # load first 16 bytes
+ movdqu -16(%r9, %rdx, 1), %xmm1 # load last 16 bytes
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, -16(%rdi, %rdx, 1)
+ ret
+
+.Lhead_nul:
+ tzcnt %eax, %eax # where is the NUL byte?
+ sub %ecx, %eax # ... from the beginning of the string?
+ cmp %rax, %rdx # is the string shorter than the buffer?
+ cmova %rax, %rdx # copy only min(buflen, srclen) bytes
+ movb $0, (%rdi, %rdx, 1) # write NUL terminator
+
+ /* process strings of 0--15 bytes (rdx: min(buflen, srclen), rax: srclen) */
+.L0015: cmp $8, %rdx # at least 8 bytes to copy?
+ jae .L0815
+
+ cmp $4, %rdx # at least 4 bytes to copy?
+ jae .L0407
+
+ cmp $2, %rdx # at least 2 bytes to copy?
+ jae .L0203
+
+ movzbl (%r9), %ecx # load first byte from src
+ mov %cl, (%rdi) # deposit into destination
+ movb $0, (%rdi, %rdx, 1) # add NUL terminator (again)
+ ret
+
+.L0203: movzwl (%r9), %ecx
+ movzwl -2(%r9, %rdx, 1), %esi
+ mov %cx, (%rdi)
+ mov %si, -2(%rdi, %rdx, 1)
+ ret
+
+.L0407: mov (%r9), %ecx
+ mov -4(%r9, %rdx, 1), %esi
+ mov %ecx, (%rdi)
+ mov %esi, -4(%rdi, %rdx, 1)
+ ret
+
+.L0815: mov (%r9), %rcx
+ mov -8(%r9, %rdx, 1), %rsi
+ mov %rcx, (%rdi)
+ mov %rsi, -8(%rdi, %rdx, 1)
+ ret
+
+ /* length zero destination: just return the string length */
+.L0: mov %rsi, %rdi
+ jmp CNAME(strlen)
+ARCHEND(__strlcpy, baseline)
+
+ .section .note.GNU-stack,"",%progbits
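Editorial note: the scalar path above amounts to the following C, shown here only as a reference for the algorithm (strlen first, since the full source length is the return value, then copy what fits and NUL-terminate); the baseline SSE variant instead fuses the length scan and the copy into a single pass:

#include <stddef.h>
#include <string.h>

/* Reference sketch of the scalar strlcpy path; illustrative only. */
static size_t
strlcpy_ref(char *dst, const char *src, size_t dstsize)
{
	size_t srclen = strlen(src);

	if (dstsize > 0) {
		size_t n = srclen < dstsize - 1 ? srclen : dstsize - 1;

		memcpy(dst, src, n);
		dst[n] = '\0';
	}

	return (srclen);
}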
diff --git a/lib/libc/amd64/string/strncat.c b/lib/libc/amd64/string/strncat.c
new file mode 100644
index 000000000000..2c63ab50b3c3
--- /dev/null
+++ b/lib/libc/amd64/string/strncat.c
@@ -0,0 +1,31 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Robert Clausecker
+ */
+
+#include <sys/cdefs.h>
+
+#include <string.h>
+
+#undef strncat /* _FORTIFY_SOURCE */
+
+void *__memccpy(void *restrict, const void *restrict, int, size_t);
+
+char *
+strncat(char *dest, const char *src, size_t n)
+{
+ size_t len;
+ char *endptr;
+
+ len = strlen(dest);
+ endptr = __memccpy(dest + len, src, '\0', n);
+
+ /* avoid an extra branch */
+ if (endptr == NULL)
+ endptr = dest + len + n + 1;
+
+ endptr[-1] = '\0';
+
+ return (dest);
+}
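Editorial note: a usage example of the behaviour the memccpy()-based implementation must preserve: at most n bytes of src are appended and the result is always NUL-terminated (example code, not part of the commit):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[16] = "abc";

	strncat(buf, "defgh", 3);	/* appends "def" plus a NUL */
	printf("%s\n", buf);		/* prints "abcdef" */
	return (0);
}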
diff --git a/lib/libc/amd64/string/strncmp.S b/lib/libc/amd64/string/strncmp.S
new file mode 100644
index 000000000000..932cf078bdfc
--- /dev/null
+++ b/lib/libc/amd64/string/strncmp.S
@@ -0,0 +1,488 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4, 0x90
+
+ARCHFUNCS(strncmp)
+ ARCHFUNC(strncmp, scalar)
+ ARCHFUNC(strncmp, baseline)
+ENDARCHFUNCS(strncmp)
+
+/*
+ * This is just the scalar loop unrolled a bunch of times.
+ */
+ARCHENTRY(strncmp, scalar)
+ xor %eax, %eax
+ sub $4, %rdx # 4 chars left to compare?
+ jbe 1f
+
+ ALIGN_TEXT
+0: movzbl (%rdi), %ecx
+ test %ecx, %ecx # NUL char in first string?
+ jz .L0
+ cmpb (%rsi), %cl # mismatch between strings?
+ jnz .L0
+
+ movzbl 1(%rdi), %ecx
+ test %ecx, %ecx
+ jz .L1
+ cmpb 1(%rsi), %cl
+ jnz .L1
+
+ movzbl 2(%rdi), %ecx
+ test %ecx, %ecx
+ jz .L2
+ cmpb 2(%rsi), %cl
+ jnz .L2
+
+ movzbl 3(%rdi), %ecx
+ test %ecx, %ecx
+ jz .L3
+ cmpb 3(%rsi), %cl
+ jnz .L3
+
+ add $4, %rdi # advance to next iteration
+ add $4, %rsi
+ sub $4, %rdx
+ ja 0b
+
+ /* end of string within the next 4 characters */
+1: cmp $-4, %edx # end of string reached immediately?
+ jz .Leq
+ movzbl (%rdi), %ecx
+ test %ecx, %ecx
+ jz .L0
+ cmpb (%rsi), %cl
+ jnz .L0
+
+ cmp $-3, %edx # end of string reached after 1 char?
+ jz .Leq
+ movzbl 1(%rdi), %ecx
+ test %ecx, %ecx
+ jz .L1
+ cmpb 1(%rsi), %cl
+ jnz .L1
+
+ cmp $-2, %edx
+ jz .Leq
+ movzbl 2(%rdi), %ecx
+ test %ecx, %ecx
+ jz .L2
+ cmpb 2(%rsi), %cl
+ jnz .L2
+
+ cmp $-1, %edx # either end of string after 3 chars,
+ jz .Leq # or it boils down to the last char
+
+.L3: inc %eax
+.L2: inc %eax
+.L1: inc %eax
+.L0: movzbl (%rsi, %rax, 1), %ecx
+ movzbl (%rdi, %rax, 1), %eax
+ sub %ecx, %eax
+.Leq: ret
+ARCHEND(strncmp, scalar)
+
+ARCHENTRY(strncmp, baseline)
+ push %rbx
+ sub $1, %rdx # RDX--, so RDX points to the last byte to compare
+	jb	.Lempty			# were there any bytes to compare at all?
+
+ lea 15(%rdi), %r8d # end of head
+ lea 15(%rsi), %r9d
+ mov %edi, %eax
+ mov %esi, %ebx
+ xor %edi, %r8d # bits that changed between first and last byte
+ xor %esi, %r9d
+ and $~0xf, %rdi # align heads to 16 bytes
+ and $~0xf, %rsi
+ or %r8d, %r9d
+ and $0xf, %eax # offset from alignment
+ and $0xf, %ebx
+ movdqa (%rdi), %xmm0 # load aligned heads
+ movdqa (%rsi), %xmm2
+ pxor %xmm1, %xmm1
+ cmp $16, %rdx # end of buffer within the first 32 bytes?
+ jb .Llt16
+
+ test $PAGE_SIZE, %r9d # did the page change?
+ jz 0f # if not, take fast path
+
+
+ /* heads may cross page boundary, avoid unmapped loads */
+ movdqa %xmm0, -32(%rsp) # stash copies of the heads on the stack
+ movdqa %xmm2, -16(%rsp)
+ mov $-1, %r8d
+ mov $-1, %r9d
+ mov %eax, %ecx
+ shl %cl, %r8d # string head in XMM0
+ mov %ebx, %ecx
+ shl %cl, %r9d # string head in XMM2
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm0, %r10d
+ pmovmskb %xmm2, %r11d
+ test %r8d, %r10d # NUL byte present in first string?
+ lea -32(%rsp), %r8
+ cmovz %rdi, %r8
+ test %r9d, %r11d # NUL byte present in second string?
+ lea -16(%rsp), %r9
+ cmovz %rsi, %r9
+ movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads
+ movdqu (%r9, %rbx, 1), %xmm4
+ jmp 1f
+
+ /* rdx == 0 */
+.Lempty:
+ xor %eax, %eax # zero-length buffers compare equal
+ pop %rbx
+ ret
+
+0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads
+ movdqu (%rsi, %rbx, 1), %xmm4
+1: pxor %xmm2, %xmm2
+ pcmpeqb %xmm0, %xmm2 # NUL byte present?
+ pcmpeqb %xmm0, %xmm4 # which bytes match?
+ pandn %xmm4, %xmm2 # match and not NUL byte?
+ pmovmskb %xmm2, %r9d
+ xor $0xffff, %r9d # mismatch or NUL byte?
+ jnz .Lhead_mismatch
+
+ /* load head and second chunk */
+ movdqa 16(%rdi), %xmm2 # load second chunks
+ movdqa 16(%rsi), %xmm3
+ lea -16(%rdx, %rbx, 1), %rdx # account for length of RSI chunk
+ sub %rbx, %rax # is a&0xf >= b&0xf?
+ jb .Lswapped # if not, proceed with swapped operands
+ jmp .Lnormal
+
+ /* buffer ends within the first 16 bytes */
+.Llt16: test $PAGE_SIZE, %r9d # did the page change?
+ jz 0f # if not, take fast path
+
+ /* heads may cross page boundary */
+ movdqa %xmm0, -32(%rsp) # stash copies of the heads on the stack
+ movdqa %xmm2, -16(%rsp)
+ mov $-1, %r8d
+ mov $-1, %r9d
+ mov %eax, %ecx
+ shl %cl, %r8d # string head in XMM0
+ mov %ebx, %ecx
+ shl %cl, %r9d # string head in XMM2
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm0, %r10d
+ pmovmskb %xmm2, %r11d
+ lea (%rdx, %rax, 1), %ecx # location of last buffer byte in xmm0
+ bts %ecx, %r10d # treat as if NUL byte present
+ lea (%rdx, %rbx, 1), %ecx
+ bts %ecx, %r11d
+ test %r8w, %r10w # NUL byte present in first string head?
+ lea -32(%rsp), %r8
+ cmovz %rdi, %r8
+ test %r9w, %r11w # NUL byte present in second string head?
+ lea -16(%rsp), %r9
+ cmovz %rsi, %r9
+ movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads
+ movdqu (%r9, %rbx, 1), %xmm4
+ jmp 1f
+
+0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads
+ movdqu (%rsi, %rbx, 1), %xmm4
+1: pxor %xmm2, %xmm2
+ pcmpeqb %xmm0, %xmm2 # NUL byte present?
+ pcmpeqb %xmm0, %xmm4 # which bytes match?
+ pandn %xmm4, %xmm2 # match and not NUL byte?
+ pmovmskb %xmm2, %r9d
+ btr %edx, %r9d # induce mismatch in last byte of buffer
+ not %r9d # mismatch or NUL byte?
+
+ /* mismatch in true heads */
+ ALIGN_TEXT
+.Lhead_mismatch:
+ tzcnt %r9d, %r9d # where is the mismatch?
+ add %rax, %rdi # return to true heads
+ add %rbx, %rsi
+ movzbl (%rdi, %r9, 1), %eax # mismatching characters
+ movzbl (%rsi, %r9, 1), %ecx
+ sub %ecx, %eax
+ pop %rbx
+ ret
+
+ /* rax >= 0 */
+ ALIGN_TEXT
+.Lnormal:
+ neg %rax
+ movdqu 16(%rsi, %rax, 1), %xmm0
+ sub %rdi, %rsi # express RSI as distance from RDI
+ lea (%rsi, %rax, 1), %rbx # point RBX to offset in second string
+ neg %rax # ... corresponding to RDI
+ pcmpeqb %xmm3, %xmm1 # NUL present?
+ pcmpeqb %xmm2, %xmm0 # Mismatch between chunks?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ mov $16, %ecx
+ cmp %rcx, %rdx # does the buffer end within (RDI,RSI,1)?
+ cmovb %edx, %ecx # ECX = min(16, RDX)
+ add $32, %rdi # advance to next iteration
+ bts %ecx, %r8d # mark end-of-buffer as if there was a NUL byte
+ test %r8w, %r8w # NUL or end of buffer found?
+ jnz .Lnul_found2
+ xor $0xffff, %r9d
+ jnz .Lmismatch2
+ sub $48, %rdx # end of buffer within first main loop iteration?
+ jb .Ltail # if yes, process tail
+
+ /*
+ * During the main loop, the layout of the two strings is something like:
+ *
+ * v ------1------ v ------2------ v
+ * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
+ * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
+ *
+ * where v indicates the alignment boundaries and corresponding chunks
+ * of the strings have the same letters. Chunk A has been checked in
+ * the previous iteration. This iteration, we first check that string
+ * RSI doesn't end within region 2, then we compare chunk B between the
+	 * two strings.  As RSI is known not to hold a NUL byte in regions 1
+ * and 2 at this point, this also ensures that RDI has not ended yet.
+ */
+ ALIGN_TEXT
+0: movdqu (%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI
+ pxor %xmm1, %xmm1
+ pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI?
+ pcmpeqb (%rdi), %xmm0 # where do the chunks match?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ test %r8d, %r8d
+ jnz .Lnul_found
+ xor $0xffff, %r9d # any mismatches?
+ jnz .Lmismatch
+
+ /* main loop unrolled twice */
+ movdqu 16(%rdi, %rbx, 1), %xmm0
+ pxor %xmm1, %xmm1
+ pcmpeqb 16(%rdi, %rsi, 1), %xmm1
+ pcmpeqb 16(%rdi), %xmm0
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ add $32, %rdi
+ test %r8d, %r8d
+ jnz .Lnul_found2
+ xor $0xffff, %r9d
+ jnz .Lmismatch2
+ sub $32, %rdx # end of buffer within next iteration?
+ jae 0b
+
+ /* end of buffer will occur in next 32 bytes */
+.Ltail: movdqu (%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI
+ pxor %xmm1, %xmm1
+ pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI?
+ pcmpeqb (%rdi), %xmm0 # where do the chunks match?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ bts %edx, %r8d # indicate NUL byte at last byte in buffer
+ test %r8w, %r8w # NUL byte in first chunk?
+ jnz .Lnul_found
+ xor $0xffff, %r9d # any mismatches?
+ jnz .Lmismatch
+
+ /* main loop unrolled twice */
+ movdqu 16(%rdi, %rbx, 1), %xmm0
+ pxor %xmm1, %xmm1
+ pcmpeqb 16(%rdi, %rsi, 1), %xmm1
+ pcmpeqb 16(%rdi), %xmm0
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ sub $16, %edx # take first half into account
+ bts %edx, %r8d # indicate NUL byte at last byte in buffer
+ add $32, %rdi
+
+.Lnul_found2:
+ sub $16, %rdi
+
+.Lnul_found:
+ mov %eax, %ecx
+ mov %r8d, %r10d
+ shl %cl, %r8d # adjust NUL mask to positions in RDI/RBX
+ not %r9d # mask of mismatches
+	or	%r8w, %r9w		# NUL bytes also count as mismatches
+ jnz .Lmismatch
+
+ /*
+ * (RDI) == (RSI) and NUL is past the string.
+ * compare (RSI) with the corresponding part
+ * of the other string until the NUL byte.
+ */
+ movdqu (%rdi, %rax, 1), %xmm0
+ pcmpeqb (%rdi, %rsi, 1), %xmm0
+ add %rdi, %rsi # restore RSI pointer
+ add %rax, %rdi # point RDI to chunk corresponding to (RSI)
+ pmovmskb %xmm0, %ecx # mask of matches
+ not %ecx # mask of mismatches
+ or %r10d, %ecx # mask of mismatches or NUL bytes
+ tzcnt %ecx, %ecx # location of first mismatch
+ movzbl (%rdi, %rcx, 1), %eax
+ movzbl (%rsi, %rcx, 1), %ecx
+ sub %ecx, %eax
+ pop %rbx
+ ret
+
+.Lmismatch2:
+ sub $16, %rdi
+
+ /* a mismatch has been found between RBX and RSI */
+.Lmismatch:
+ tzcnt %r9d, %r9d # where is the mismatch?
+ add %rdi, %rbx # turn RBX from offset into pointer
+ movzbl (%rbx, %r9, 1), %ecx
+ movzbl (%rdi, %r9, 1), %eax
+ sub %ecx, %eax
+ pop %rbx
+ ret
+
+ /* rax < 0 */
+ ALIGN_TEXT
+.Lswapped:
+ movdqu 16(%rdi, %rax, 1), %xmm0
+	sub	%rsi, %rdi		# express RDI as distance from RSI
+ lea (%rdi, %rax, 1), %rbx # point RBX to offset in first string
+ pcmpeqb %xmm2, %xmm1 # NUL present?
+ pcmpeqb %xmm3, %xmm0 # mismatch between chunks?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ add %rax, %rdx # RDX points to buffer end in RSI
+ neg %rax # ... corresponding to RSI
+ mov $16, %ecx
+ cmp %rcx, %rdx # does the buffer end within (RSI,RDI,1)?
+ cmovb %edx, %ecx # ECX = min(16, RDX)
+ add $32, %rsi
+ bts %ecx, %r8d # mark end-of-buffer as if there was a NUL byte
+ test %r8w, %r8w # NUL or end of buffer found?
+ jnz .Lnul_found2s
+ xor $0xffff, %r9d
+ jnz .Lmismatch2s
+ sub $48, %rdx # end of buffer within first main loop iteration?
+ jb .Ltails # if yes, process tail
+
+ ALIGN_TEXT
+0: movdqu (%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI
+ pxor %xmm1, %xmm1
+ pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RDI?
+ pcmpeqb (%rsi), %xmm0 # where do the chunks match?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ test %r8d, %r8d
+ jnz .Lnul_founds
+ xor $0xffff, %r9d # any mismatches?
+ jnz .Lmismatchs
+
+ /* main loop unrolled twice */
+ movdqu 16(%rsi, %rbx, 1), %xmm0
+ pxor %xmm1, %xmm1
+ pcmpeqb 16(%rsi, %rdi, 1), %xmm1
+ pcmpeqb 16(%rsi), %xmm0
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ add $32, %rsi
+ test %r8d, %r8d
+ jnz .Lnul_found2s
+ xor $0xffff, %r9d
+ jnz .Lmismatch2s
+ sub $32, %rdx # end of buffer within next iteration?
+ jae 0b
+
+ /* end of buffer will occur in next 32 bytes */
+.Ltails:
+ movdqu (%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI
+ pxor %xmm1, %xmm1
+ pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RDI?
+ pcmpeqb (%rsi), %xmm0 # where do the chunks match?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+	bts	%edx, %r8d		# indicate NUL byte at last byte in buffer
+ test %r8w, %r8w # NUL byte in first chunk?
+ jnz .Lnul_founds
+ xor $0xffff, %r9d # any mismatches?
+ jnz .Lmismatchs
+
+ /* main loop unrolled twice */
+ movdqu 16(%rsi, %rbx, 1), %xmm0
+ pxor %xmm1, %xmm1
+ pcmpeqb 16(%rsi, %rdi, 1), %xmm1
+ pcmpeqb 16(%rsi), %xmm0
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ sub $16, %edx # take first half into account
+	bts	%edx, %r8d		# indicate NUL byte at last byte in buffer
+ add $32, %rsi
+
+.Lnul_found2s:
+ sub $16, %rsi
+
+.Lnul_founds:
+ mov %eax, %ecx
+ mov %r8d, %r10d
+ shl %cl, %r8d # adjust NUL mask to positions in RSI/RBX
+ not %r9d # mask of mismatches
+ or %r8w, %r9w # NUL bytes also count as mismatches
+ jnz .Lmismatchs
+
+ movdqu (%rsi, %rax, 1), %xmm0
+ pcmpeqb (%rsi, %rdi, 1), %xmm0
+ add %rsi, %rdi # restore RDI pointer
+ add %rax, %rsi # point RSI to chunk corresponding to (RDI)
+ pmovmskb %xmm0, %ecx # mask of matches
+ not %ecx # mask of mismatches
+ or %r10d, %ecx # mask of mismatches or NUL bytes
+ tzcnt %ecx, %ecx # location of first mismatch
+ movzbl (%rdi, %rcx, 1), %eax
+ movzbl (%rsi, %rcx, 1), %ecx
+ sub %ecx, %eax
+ pop %rbx
+ ret
+
+.Lmismatch2s:
+ sub $16, %rsi
+
+.Lmismatchs:
+ tzcnt %r9d, %r9d # where is the mismatch?
+ add %rsi, %rbx # turn RBX from offset into pointer
+ movzbl (%rbx, %r9, 1), %eax
+ movzbl (%rsi, %r9, 1), %ecx
+ sub %ecx, %eax
+ pop %rbx
+ ret
+ARCHEND(strncmp, baseline)
+
+ .section .note.GNU-stack,"",%progbits
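Editorial note: for orientation, the scalar path above is the four-way unrolled form of this portable loop (reference sketch only; the baseline variant works on 16-byte chunks and treats the end of the buffer like a NUL byte):

#include <stddef.h>

static int
strncmp_ref(const char *s1, const char *s2, size_t n)
{
	const unsigned char *a = (const unsigned char *)s1;
	const unsigned char *b = (const unsigned char *)s2;
	size_t i;

	for (i = 0; i < n; i++)
		if (a[i] != b[i] || a[i] == '\0')
			return (a[i] - b[i]);

	return (0);
}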
diff --git a/lib/libc/amd64/static_tls.h b/lib/libc/amd64/string/strncpy.c
index 1ee738b231c7..0e7a58222aa8 100644
--- a/lib/libc/amd64/static_tls.h
+++ b/lib/libc/amd64/string/strncpy.c
@@ -1,9 +1,7 @@
/*-
- * SPDX-License-Identifier: BSD-2-Clause
+ * Copyright (c) 2023 The FreeBSD Foundation
*
- * Copyright (c) 2019 The FreeBSD Foundation
- *
- * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
* under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
@@ -15,7 +13,7 @@
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
@@ -25,20 +23,21 @@
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * SUCH DAMAGE
*/
-#ifndef _LIBC_AMD64_STATIC_TLS_H
-#define _LIBC_AMD64_STATIC_TLS_H
+#include <sys/cdefs.h>
+#include <string.h>
+
+#undef strncpy /* _FORTIFY_SOURCE */
+
+char *__stpncpy(char *restrict, const char *restrict, size_t);
-static __inline uintptr_t
-_libc_get_static_tls_base(size_t offset)
+char *
+strncpy(char *restrict dst, const char *restrict src, size_t len)
{
- uintptr_t tlsbase;
- __asm __volatile("movq %%fs:0, %0" : "=r" (tlsbase));
- tlsbase -= offset;
- return (tlsbase);
-}
+ __stpncpy(dst, src, len);
-#endif
+ return (dst);
+}
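Editorial note: a reference sketch of the semantics __stpncpy() provides here: copy at most len bytes of src and pad the rest of dst with NUL bytes; dst is not NUL-terminated when src does not fit (illustrative only, not the committed code):

#include <stddef.h>

static char *
strncpy_ref(char *dst, const char *src, size_t len)
{
	size_t i;

	for (i = 0; i < len && src[i] != '\0'; i++)
		dst[i] = src[i];
	for (; i < len; i++)		/* pad with NUL bytes */
		dst[i] = '\0';

	return (dst);
}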
diff --git a/lib/libc/amd64/sys/amd64_set_gsbase.c b/lib/libc/amd64/string/strnlen.c
index 10004afe8234..74020f1b1c65 100644
--- a/lib/libc/amd64/sys/amd64_set_gsbase.c
+++ b/lib/libc/amd64/string/strnlen.c
@@ -1,11 +1,7 @@
/*-
- * SPDX-License-Identifier: BSD-2-Clause
+ * Copyright (c) 2023 The FreeBSD Foundation
*
- * Copyright (c) 2003 Peter Wemm
- * Copyright (c) 2017, 2018 The FreeBSD Foundation
- * All rights reserved.
- *
- * Portions of this software were developed by Konstantin Belousov
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
* under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
@@ -17,48 +13,29 @@
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * SUCH DAMAGE
*/
-#include <sys/cdefs.h>
-#define _WANT_P_OSREL
-#include <sys/param.h>
-#include <machine/cpufunc.h>
-#include <machine/specialreg.h>
-#include <machine/sysarch.h>
-#include <x86/ifunc.h>
-#include "libc_private.h"
-
-static int
-amd64_set_gsbase_cpu(void *addr)
-{
+#include <string.h>
- wrgsbase((uintptr_t)addr);
- return (0);
-}
+char *__memchr(const void *, int, size_t);
-static int
-amd64_set_gsbase_syscall(void *addr)
+size_t
+strnlen(const char *s, size_t maxlen)
{
+ const char *loc;
- return (sysarch(AMD64_SET_GSBASE, &addr));
-}
-
-DEFINE_UIFUNC(, int, amd64_set_gsbase, (void *))
-{
+ loc = __memchr(s, '\0', maxlen);
- if (__getosreldate() >= P_OSREL_WRFSBASE &&
- (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0)
- return (amd64_set_gsbase_cpu);
- return (amd64_set_gsbase_syscall);
+ return (loc == NULL ? maxlen : (size_t)(loc - s));
}
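Editorial note: strnlen() never reads past maxlen bytes, which is what makes the memchr()-based implementation safe on buffers that may not be NUL-terminated. A small usage example (not part of the commit):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[8] = "abcdef";

	printf("%zu %zu\n", strnlen(buf, sizeof(buf)),	/* 6 */
	    strnlen(buf, 3));				/* 3 */
	return (0);
}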
diff --git a/lib/libc/amd64/sys/amd64_set_fsbase.c b/lib/libc/amd64/string/strpbrk.c
index 24dddcad48f8..87f587789991 100644
--- a/lib/libc/amd64/sys/amd64_set_fsbase.c
+++ b/lib/libc/amd64/string/strpbrk.c
@@ -1,11 +1,7 @@
/*-
- * SPDX-License-Identifier: BSD-2-Clause
+ * Copyright (c) 2023 The FreeBSD Foundation
*
- * Copyright (c) 2003 Peter Wemm
- * Copyright (c) 2017, 2018 The FreeBSD Foundation
- * All rights reserved.
- *
- * Portions of this software were developed by Konstantin Belousov
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
* under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
@@ -17,48 +13,31 @@
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * SUCH DAMAGE
*/
#include <sys/cdefs.h>
-#define _WANT_P_OSREL
-#include <sys/param.h>
-#include <machine/cpufunc.h>
-#include <machine/specialreg.h>
-#include <machine/sysarch.h>
-#include <x86/ifunc.h>
-#include "libc_private.h"
-static int
-amd64_set_fsbase_cpu(void *addr)
-{
+#include <string.h>
- wrfsbase((uintptr_t)addr);
- return (0);
-}
+size_t __strcspn(const char *, const char *);
-static int
-amd64_set_fsbase_syscall(void *addr)
+char *
+strpbrk(const char *s, const char *charset)
{
+ size_t loc;
- return (sysarch(AMD64_SET_FSBASE, &addr));
-}
-
-DEFINE_UIFUNC(, int, amd64_set_fsbase, (void *))
-{
+ loc = __strcspn(s, charset);
- if (__getosreldate() >= P_OSREL_WRFSBASE &&
- (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0)
- return (amd64_set_fsbase_cpu);
- return (amd64_set_fsbase_syscall);
+ return (s[loc] == '\0' ? NULL : (char *)&s[loc]);
}
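Editorial note: a usage example of the strcspn()-based strpbrk(): the prefix length returned by __strcspn() either lands on a set member (return a pointer to it) or on the terminating NUL (return NULL). Example code, not part of the commit:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *path = "usr/local/bin";
	char *sep = strpbrk(path, "/:");

	printf("%s\n", sep != NULL ? sep : "(none)");	/* "/local/bin" */
	return (0);
}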
diff --git a/lib/libc/amd64/string/strrchr.S b/lib/libc/amd64/string/strrchr.S
new file mode 100644
index 000000000000..e397bbcd3478
--- /dev/null
+++ b/lib/libc/amd64/string/strrchr.S
@@ -0,0 +1,209 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4,0x90 # 16-byte alignment, nop-filled
+
+ .weak rindex
+ .set rindex, strrchr
+
+ARCHFUNCS(strrchr)
+ ARCHFUNC(strrchr, scalar)
+ ARCHFUNC(strrchr, baseline)
+ENDARCHFUNCS(strrchr)
+
+ARCHENTRY(strrchr, scalar)
+ mov %edi, %ecx
+	and	$~7, %rdi		# align to 8 bytes
+ movzbl %sil, %esi # clear stray high bits
+ movabs $0x0101010101010101, %r8
+ mov (%rdi), %rax # load first word
+ imul %r8, %rsi # replicate char 8 times
+
+ /*
+ * Unaligned input: align to 8 bytes. Then proceed the same
+ * way as with aligned input, but prevent matches before the
+	 * beginning of the string.  This is achieved by ORing 0x01
+	 * into each byte of the buffer before the string.
+ */
+ shl $3, %ecx
+ mov %r8, %r10
+ shl %cl, %r10 # 0x01 where the string is
+ xor %r8, %r10 # 0x01 where it is not
+ neg %r8 # negate 01..01 so we can use lea
+ movabs $0x8080808080808080, %r9
+
+ mov %rsi, %rcx
+ xor %rax, %rcx # str ^ c
+ or %r10, %rax # ensure str != 0 before string
+ or %r10, %rcx # ensure str^c != 0 before string
+ bswap %rcx # in reverse order, to find last match
+ mov %rdi, %r10 # location of initial mismatch (if any)
+ xor %r11, %r11 # initial mismatch (none)
+ add $8, %rdi # advance to next iteration
+ lea (%rax, %r8, 1), %rdx # str - 0x01..01
+ not %rax # ~str
+ and %rdx, %rax # (str - 0x01..01) & ~str
+ and %r9, %rax # not including junk bits
+ jnz 1f # end of string?
+
+ lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01
+ not %rcx # ~(str ^ c)
+ and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c)
+ and %r9, %rcx # not including junk bits
+ mov %rcx, %r11 # remember mismatch in head
+ jmp 0f
+
+ /* main loop unrolled twice */
+ ALIGN_TEXT
+3: lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01
+ not %rcx # ~(str ^ c)
+ and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c)
+ and %r9, %rcx # not including junk bits
+ lea -8(%rdi), %rdx
+ cmovnz %rdx, %r10 # remember location of current mismatch
+ cmovnz %rcx, %r11
+
+0: mov (%rdi), %rax # str
+ mov %rsi, %rcx
+ xor %rax, %rcx # str ^ c
+ bswap %rcx # in reverse order, to find last match
+ lea (%rax, %r8, 1), %rdx # str - 0x01..01
+ not %rax # ~str
+ and %rdx, %rax # (str - 0x01..01) & ~str
+ and %r9, %rax # not including junk bits
+ jnz 2f # end of string?
+
+ lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01
+ not %rcx # ~(str ^ c)
+ and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c)
+ and %r9, %rcx # not including junk bits
+ cmovnz %rdi, %r10 # remember location of current mismatch
+ cmovnz %rcx, %r11
+
+ mov 8(%rdi), %rax # str
+ add $16, %rdi
+ mov %rsi, %rcx
+ xor %rax, %rcx # str ^ c
+ bswap %rcx
+ lea (%rax, %r8, 1), %rdx # str - 0x01..01
+ not %rax # ~str
+ and %rdx, %rax # (str - 0x01..01) & ~str
+ and %r9, %rax # not including junk bits
+ jz 3b # end of string?
+
+ /* NUL found */
+1: sub $8, %rdi # undo advance past buffer
+2: lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01
+ not %rcx # ~(str ^ c)
+ and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c)
+ and %r9, %rcx # not including junk bits
+ lea -1(%rax), %rdx
+ xor %rdx, %rax # mask of bytes in the string
+ bswap %rdx # in reverse order
+ and %rdx, %rcx # c found in the tail?
+ cmovnz %rdi, %r10
+ cmovnz %rcx, %r11
+ bswap %r11 # unreverse byte order
+ bsr %r11, %rcx # last location of c in (R10)
+ shr $3, %rcx # as byte offset
+ lea (%r10, %rcx, 1), %rax # pointer to match
+ test %r11, %r11 # was there actually a match?
+ cmovz %r11, %rax # if not, return null pointer
+ ret
+ARCHEND(strrchr, scalar)
+
+ARCHENTRY(strrchr, baseline)
+ mov %edi, %ecx
+ and $~0xf, %rdi # align to 16 bytes
+ movdqa (%rdi), %xmm1
+ movd %esi, %xmm0
+ and $0xf, %ecx # offset from alignment
+ pxor %xmm2, %xmm2
+ mov $-1, %edx
+ punpcklbw %xmm0, %xmm0 # c -> cc
+ shl %cl, %edx # bits corresponding to bytes in the string
+ punpcklwd %xmm0, %xmm0 # cc -> cccc
+ xor %r8, %r8 # address of latest match
+ mov $1, %esi # bit mask of latest match
+ mov %rdi, %r9 # candidate location for next match
+ add $16, %rdi # advance to next chunk
+
+ /* check for match in head */
+ pcmpeqb %xmm1, %xmm2 # NUL byte present?
+ pshufd $0, %xmm0, %xmm0 # cccc -> cccccccccccccccc
+ pcmpeqb %xmm0, %xmm1 # c present?
+ pmovmskb %xmm2, %eax
+ pmovmskb %xmm1, %ecx
+ and %edx, %ecx # c present in the string?
+ and %edx, %eax # NUL present in the string?
+ jnz .Lend2
+
+ /* main loop unrolled twice */
+ ALIGN_TEXT
+0: movdqa (%rdi), %xmm1
+ test %ecx, %ecx # was there a match in the last iter.?
+ cmovnz %r9, %r8 # remember match if any
+ cmovnz %ecx, %esi
+ pxor %xmm2, %xmm2
+ pcmpeqb %xmm1, %xmm2 # NUL byte present?
+ pcmpeqb %xmm0, %xmm1 # c present?
+ pmovmskb %xmm2, %eax
+ pmovmskb %xmm1, %ecx
+ test %eax, %eax # end of string in first half?
+ jnz .Lend
+
+ movdqa 16(%rdi), %xmm1
+ test %ecx, %ecx # was there a match in the last iter.?
+ cmovnz %rdi, %r8 # remember match if any
+ cmovnz %ecx, %esi
+ pxor %xmm2, %xmm2
+ pcmpeqb %xmm1, %xmm2 # NUL byte present?
+ pcmpeqb %xmm0, %xmm1 # c present?
+ pmovmskb %xmm2, %eax
+ pmovmskb %xmm1, %ecx
+ lea 16(%rdi), %r9
+ add $32, %rdi
+ test %eax, %eax # end of string in second half?
+ jz 0b
+
+ ALIGN_TEXT
+.Lend2: sub $16, %rdi
+.Lend: lea -1(%rax), %edx
+ xor %edx, %eax # mask of bytes in the string
+ and %eax, %ecx # c found in the tail?
+ cmovnz %rdi, %r8
+ cmovnz %ecx, %esi
+ bsr %esi, %esi # last location of c in (R8)
+ lea (%r8, %rsi, 1), %rax # pointer to match
+ ret
+ARCHEND(strrchr, baseline)
+ .section .note.GNU-stack,"",%progbits
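Editorial note: the scalar path above rests on the classic SWAR zero-byte test; a C sketch of the two masks it computes per 8-byte word (the helper names are hypothetical, for illustration only):

#include <stdint.h>

/* (x - 0x01..01) & ~x & 0x80..80 has the high bit set in every byte
 * of x that is zero. */
static inline uint64_t
zero_byte_mask(uint64_t x)
{
	return ((x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL);
}

/* XORing with c replicated into every byte turns "find c" into
 * "find a zero byte". */
static inline uint64_t
char_byte_mask(uint64_t word, unsigned char c)
{
	return (zero_byte_mask(word ^ (0x0101010101010101ULL * c)));
}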
diff --git a/lib/libc/amd64/sys/getcontext.S b/lib/libc/amd64/string/strsep.c
index 6860a3cf9bef..9fda56d7e135 100644
--- a/lib/libc/amd64/sys/getcontext.S
+++ b/lib/libc/amd64/string/strsep.c
@@ -1,6 +1,8 @@
/*-
- * Copyright (c) 2003 Peter Wemm <peter@FreeBSD.org>
- * All rights reserved.
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -11,36 +13,45 @@
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * SUCH DAMAGE
*/
-#include <machine/asm.h>
-#include <SYS.h>
+#include <sys/cdefs.h>
+#include <string.h>
+
+size_t __strcspn(const char *, const char *);
/*
- * This has to be magic to handle the multiple returns.
- * Otherwise, the setcontext() syscall will return here and we'll
- * pop off the return address and go to the *setcontext* call.
+ * We have a fast strcspn() on amd64. Use it over a direct
+ * implementation of strsep for better performance.
*/
- WEAK_REFERENCE(__sys_getcontext, _getcontext)
- WEAK_REFERENCE(__sys_getcontext, getcontext)
-ENTRY(__sys_getcontext)
- movq (%rsp),%rsi /* save getcontext return address */
- mov $SYS_getcontext,%rax
- KERNCALL
- jb HIDENAME(cerror)
- addq $8,%rsp /* remove stale (setcontext) return address */
- jmp *%rsi /* restore return address */
-END(__sys_getcontext)
+char *
+strsep(char **stringp, const char *delim)
+{
+ size_t n;
+ char *s;
+
+ s = *stringp;
+ if (s == NULL)
+ return (NULL);
+
+ n = __strcspn(s, delim);
+ if (s[n] == '\0')
+ *stringp = NULL;
+ else {
+ s[n] = '\0';
+ *stringp = s + n + 1;
+ }
- .section .note.GNU-stack,"",%progbits
+ return (s);
+}
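Editorial note: a usage example showing the behaviour this wrapper relies on: unlike strtok(), strsep() reports empty fields and keeps no hidden state; *stringp becomes NULL once the string is consumed (example code, not part of the commit):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char line[] = "one,two,,three";
	char *p = line, *tok;

	while ((tok = strsep(&p, ",")) != NULL)
		printf("[%s]\n", tok);	/* [one] [two] [] [three] */
	return (0);
}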
diff --git a/lib/libc/amd64/string/strspn.S b/lib/libc/amd64/string/strspn.S
new file mode 100644
index 000000000000..565330f0c385
--- /dev/null
+++ b/lib/libc/amd64/string/strspn.S
@@ -0,0 +1,358 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+ARCHFUNCS(strspn)
+ ARCHFUNC(strspn, scalar)
+ NOARCHFUNC
+ ARCHFUNC(strspn, x86_64_v2)
+ENDARCHFUNCS(strspn)
+
+ARCHENTRY(strspn, scalar)
+ push %rbp # align stack to enable function call
+ mov %rsp, %rbp
+ sub $256, %rsp # allocate space for lookup table
+
+ /* check for special cases */
+ movzbl (%rsi), %edx # first character in the set
+ test %edx, %edx
+ jz .Lzero # empty set always returns 0
+
+ movzbl 1(%rsi), %eax # second character in the set
+ test %eax, %eax
+ jz .Lsingle
+
+ /* no special case matches -- prepare lookup table */
+ xor %r8d, %r8d
+ mov $28, %ecx
+0: mov %r8, (%rsp, %rcx, 8)
+ mov %r8, 8(%rsp, %rcx, 8)
+ mov %r8, 16(%rsp, %rcx, 8)
+ mov %r8, 24(%rsp, %rcx, 8)
+ sub $4, %ecx
+ jnc 0b
+
+ movb $1, (%rsp, %rdx, 1) # register first char in set
+ add $2, %rsi
+
+ /* process remaining chars in set */
+ ALIGN_TEXT
+0: movb $1, (%rsp, %rax, 1) # register previous char
+ movzbl (%rsi), %eax # next char in set
+ test %eax, %eax # end of string?
+ jz 1f
+
+ movb $1, (%rsp, %rax, 1)
+ add $2, %rsi
+ movzbl -1(%rsi), %eax
+ test %eax, %eax
+ jnz 0b
+
+1: mov %rdi, %rax # a copy of the source to iterate over
+
+ /* find mismatch */
+ ALIGN_TEXT
+0: movzbl (%rax), %ecx
+ cmpb $0, (%rsp, %rcx, 1)
+ je 2f
+
+ movzbl 1(%rax), %ecx
+ cmpb $0, (%rsp, %rcx, 1)
+ je 3f
+
+ movzbl 2(%rax), %ecx
+ cmpb $0, (%rsp, %rcx, 1)
+ je 4f
+
+ movzbl 3(%rax), %ecx
+ add $4, %rax
+ cmpb $0, (%rsp, %rcx, 1)
+ jne 0b
+
+ sub $3, %rax
+4: dec %rdi
+3: inc %rax
+2: sub %rdi, %rax # number of characters preceding match
+ leave
+ ret
+
+ /* empty set never matches */
+.Lzero: xor %eax, %eax
+ leave
+ ret
+
+ /* find repeated single character */
+ ALIGN_TEXT
+.Lsingle:
+ cmpb %dl, (%rdi, %rax, 1)
+ jne 1f
+
+ cmpb %dl, 1(%rdi, %rax, 1)
+ jne 2f
+
+ cmpb %dl, 2(%rdi, %rax, 1)
+ jne 3f
+
+ cmpb %dl, 3(%rdi, %rax, 1)
+ lea 4(%rax), %rax
+ je .Lsingle
+
+ sub $3, %rax
+3: inc %rax
+2: inc %rax
+1: leave
+ ret
+ARCHEND(strspn, scalar)
+
+ /*
+ * This kernel uses pcmpistri to do the heavy lifting.
+ * We provide three code paths, depending on set size:
+ *
+ * 0--16: one pcmpistri per 16 bytes of input
+ * 17--32: two pcmpistri per 16 bytes of input
+	 * >=33:   fall back to lookup table
+ */
+ARCHENTRY(strspn, x86_64_v2)
+ push %rbp
+ mov %rsp, %rbp
+ sub $256, %rsp
+
+ /* find set size and copy up to 32 bytes to (%rsp) */
+ mov %esi, %ecx
+ and $~0xf, %rsi # align set pointer
+ movdqa (%rsi), %xmm0
+ pxor %xmm1, %xmm1
+ and $0xf, %ecx # amount of bytes rsi is past alignment
+ xor %edx, %edx
+ pcmpeqb %xmm0, %xmm1 # end of string reached?
+ movdqa %xmm0, 32(%rsp) # transfer head of set to stack
+ pmovmskb %xmm1, %eax
+ shr %cl, %eax # clear out junk before string
+ test %eax, %eax # end of set reached?
+ jnz 0f
+
+ movdqa 16(%rsi), %xmm0 # second chunk of the set
+ mov $16, %edx
+ sub %ecx, %edx # length of set preceding xmm0
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm0, %xmm1
+ movdqa %xmm0, 48(%rsp)
+ movdqu 32(%rsp, %rcx, 1), %xmm2 # head of set
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz 1f
+
+ movdqa 32(%rsi), %xmm0 # third chunk
+ add $16, %edx
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm0, %xmm1
+ movdqa %xmm0, 64(%rsp)
+ pmovmskb %xmm1, %eax
+ test %eax, %eax # still not done?
+ jz .Lgt32v2
+
+0: movdqu 32(%rsp, %rcx, 1), %xmm2 # head of set
+1: tzcnt %eax, %eax
+ add %eax, %edx # length of set (excluding NUL byte)
+ cmp $32, %edx # above 32 bytes?
+ ja .Lgt32v2
+
+ /*
+ * At this point we know that we want to use pcmpistri.
+	 * One last problem remains: the head of the string is not
+	 * aligned and may cross a page boundary.  If this is the case,
+ * we take the part before the page boundary and repeat the
+ * last byte to fill up the xmm register.
+ */
+ mov %rdi, %rax # save original string pointer
+ lea 15(%rdi), %esi # last byte of the head
+ xor %edi, %esi
+ test $PAGE_SIZE, %esi # does the head cross a page?
+ jz 0f
+
+ /* head crosses page: copy to stack to fix up */
+ and $~0xf, %rax # align head pointer temporarily
+ movzbl 15(%rax), %esi # last head byte on the page
+ movdqa (%rax), %xmm0
+ movabs $0x0101010101010101, %r8
+ imul %r8, %rsi # repeated 8 times
+ movdqa %xmm0, (%rsp) # head word on stack
+ mov %rsi, 16(%rsp) # followed by filler (last byte x8)
+ mov %rsi, 24(%rsp)
+ mov %edi, %eax
+ and $0xf, %eax # offset of head from alignment
+ add %rsp, %rax # pointer to fake head
+
+0: movdqu (%rax), %xmm1 # load head (fake or real)
+ lea 16(%rdi), %rax
+ and $~0xf, %rax # second 16 bytes of string (aligned)
+1: cmp $16, %edx # 16--32 bytes?
+ ja .Lgt16v2
+
+
+ /* set is 2--16 bytes in size */
+
+ /* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT|_SIDD_NEGATIVE_POLARITY */
+ pcmpistri $0x10, %xmm1, %xmm2 # match in head?
+ jc .Lheadmismatchv2
+
+ ALIGN_TEXT
+0: pcmpistri $0x10, (%rax), %xmm2
+ jc 1f # match or end of string?
+ pcmpistri $0x10, 16(%rax), %xmm2
+ lea 32(%rax), %rax
+ jnc 0b # match or end of string?
+
+ sub $16, %rax # go back to second half
+1: sub %rdi, %rax # offset of (%rax) from beginning of string
+ add %rcx, %rax # prefix length before match/NUL
+ leave
+ ret
+
+.Lheadmismatchv2:
+ mov %ecx, %eax # prefix length before mismatch/NUL
+ leave
+ ret
+
+ /* set is 17--32 bytes in size */
+.Lgt16v2:
+ movdqu 48(%rsp, %rcx, 1), %xmm3 # second part of set
+
+ /* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_BIT_MASK|_SIDD_NEGATIVE_POLARITY */
+ pcmpistrm $0x10, %xmm1, %xmm2 # any mismatch in first half?
+ movdqa %xmm0, %xmm4
+ pcmpistrm $0x10, %xmm1, %xmm3 # any mismatch in the second half?
+ ptest %xmm0, %xmm4 # any entry that doesn't match either?
+ jnz 2f
+
+ ALIGN_TEXT
+0: movdqa (%rax), %xmm1
+ pcmpistrm $0x10, %xmm1, %xmm2
+ movdqa %xmm0, %xmm4
+ pcmpistrm $0x10, %xmm1, %xmm3
+ ptest %xmm0, %xmm4
+ jnz 1f
+ movdqa 16(%rax), %xmm1
+ add $32, %rax
+ pcmpistrm $0x10, %xmm1, %xmm2
+ movdqa %xmm0, %xmm4
+ pcmpistrm $0x10, %xmm1, %xmm3
+ ptest %xmm0, %xmm4
+ jz 0b
+
+ sub $16, %rax
+1: pand %xmm4, %xmm0
+ movd %xmm0, %ecx
+ sub %rdi, %rax # offset of %xmm1 from beginning of string
+ tzcnt %ecx, %ecx
+ add %rcx, %rax # prefix length before match/NUL
+ leave
+ ret
+
+ /* mismatch or string end in head */
+2: pand %xmm4, %xmm0 # bit mask of mismatches (end of string counts)
+ movd %xmm0, %eax
+ tzcnt %eax, %eax # prefix length before mismatch/NUL
+ leave
+ ret
+
+ /* set is >=33 bytes in size */
+.Lgt32v2:
+ xorps %xmm0, %xmm0
+ mov $256-64, %edx
+
+ /* clear out lookup table */
+0: movaps %xmm0, (%rsp, %rdx, 1)
+ movaps %xmm0, 16(%rsp, %rdx, 1)
+ movaps %xmm0, 32(%rsp, %rdx, 1)
+ movaps %xmm0, 48(%rsp, %rdx, 1)
+ sub $64, %edx
+ jnc 0b
+
+ add %rcx, %rsi # restore set pointer
+ mov %rdi, %rax # keep a copy of the string
+
+ /* initialise lookup table */
+ movzbl (%rsi), %ecx # set is known not to be empty
+
+ ALIGN_TEXT
+0: movb $1, (%rsp, %rcx, 1)
+ movzbl 1(%rsi), %ecx
+ test %ecx, %ecx
+ jz 1f
+
+ movb $1, (%rsp, %rcx, 1)
+ movzbl 2(%rsi), %ecx
+ test %ecx, %ecx
+ jz 1f
+
+ movb $1, (%rsp, %rcx, 1)
+ movzbl 3(%rsi), %ecx
+ add $4, %rsi
+ test %ecx, %ecx
+ jz 1f
+
+ movb $1, (%rsp, %rcx, 1)
+ movzbl (%rsi), %ecx
+ test %ecx, %ecx
+ jnz 0b
+
+ /* find match */
+ ALIGN_TEXT
+1: movzbl (%rax), %ecx
+ cmpb $0, (%rsp, %rcx, 1)
+ je 2f
+
+ movzbl 1(%rax), %ecx
+ cmpb $0, (%rsp, %rcx, 1)
+ je 3f
+
+ movzbl 2(%rax), %ecx
+ cmpb $0, (%rsp, %rcx, 1)
+ je 4f
+
+ movzbl 3(%rax), %ecx
+ add $4, %rax
+ cmpb $0, (%rsp, %rcx, 1)
+ jne 1b
+
+ sub $3, %rax
+4: dec %rdi
+3: inc %rax
+2: sub %rdi, %rax # number of characters preceding match
+ leave
+ ret
+ARCHEND(strspn, x86_64_v2)
+
+ .section .note.GNU-stack,"",%progbits
diff --git a/lib/libc/amd64/string/timingsafe_bcmp.S b/lib/libc/amd64/string/timingsafe_bcmp.S
new file mode 100644
index 000000000000..c003da2ea9a7
--- /dev/null
+++ b/lib/libc/amd64/string/timingsafe_bcmp.S
@@ -0,0 +1,232 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+ARCHFUNCS(timingsafe_bcmp)
+ ARCHFUNC(timingsafe_bcmp, scalar)
+ ARCHFUNC(timingsafe_bcmp, baseline)
+ENDARCHFUNCS(timingsafe_bcmp)
+
+ARCHENTRY(timingsafe_bcmp, scalar)
+ cmp $16, %rdx # at least 17 bytes to process?
+ ja .Lgt16
+
+ cmp $8, %edx # at least 9 bytes to process?
+ ja .L0916
+
+ cmp $4, %edx # at least 5 bytes to process?
+ ja .L0508
+
+ cmp $2, %edx # at least 3 bytes to process?
+ ja .L0304
+
+ test %edx, %edx # buffer empty?
+ jnz .L0102
+
+ xor %eax, %eax # empty buffer always matches
+ ret
+
+.L0102: movzbl (%rdi), %eax # load 1--2 bytes from first buffer
+ movzbl -1(%rdi, %rdx, 1), %ecx
+ xor (%rsi), %al # xor in second buffer
+ xor -1(%rsi, %rdx, 1), %cl
+ or %ecx, %eax # mismatch in any of the two?
+ ret
+
+.L0304: movzwl (%rdi), %eax
+ movzwl -2(%rdi, %rdx, 1), %ecx
+ xor (%rsi), %ax
+ xor -2(%rsi, %rdx, 1), %cx
+ or %ecx, %eax
+ ret
+
+.L0508: mov (%rdi), %eax
+ mov -4(%rdi, %rdx, 1), %ecx
+ xor (%rsi), %eax
+ xor -4(%rsi, %rdx, 1), %ecx
+ or %ecx, %eax
+ ret
+
+.L0916: mov (%rdi), %rax
+ mov -8(%rdi, %rdx, 1), %rcx
+ xor (%rsi), %rax
+ xor -8(%rsi, %rdx, 1), %rcx
+ or %rcx, %rax
+ setnz %al # ensure EAX nonzero even if only
+ ret # high bits of RAX were set
+
+ /* more than 16 bytes: process buffer in a loop */
+.Lgt16: mov (%rdi), %rax # process first 16 bytes
+ mov 8(%rdi), %r9
+ mov $32, %ecx
+ xor (%rsi), %rax
+ xor 8(%rsi), %r9
+ or %r9, %rax
+
+ cmp %rdx, %rcx # enough left for a full iteration?
+ jae .Ltail
+
+ /* main loop processing 16 bytes per iteration */
+ ALIGN_TEXT
+0: mov -16(%rdi, %rcx, 1), %r8
+ mov -8(%rdi, %rcx, 1), %r9
+ xor -16(%rsi, %rcx, 1), %r8
+ xor -8(%rsi, %rcx, 1), %r9
+ add $16, %rcx
+ or %r9, %r8
+ or %r8, %rax
+
+ cmp %rdx, %rcx
+ jb 0b
+
+ /* process last 16 bytes */
+.Ltail: mov -16(%rdi, %rdx, 1), %r8
+ mov -8(%rdi, %rdx, 1), %r9
+ xor -16(%rsi, %rdx, 1), %r8
+ xor -8(%rsi, %rdx, 1), %r9
+ or %r9, %r8
+ or %r8, %rax
+ setnz %al
+ ret
+ARCHEND(timingsafe_bcmp, scalar)
+
+ARCHENTRY(timingsafe_bcmp, baseline)
+ cmp $32, %rdx # at least 33 bytes to process?
+ ja .Lgt32b
+
+ cmp $16, %edx # at least 17 bytes to process?
+ ja .L1732b
+
+ cmp $8, %edx # at least 9 bytes to process?
+ ja .L0916b
+
+ cmp $4, %edx # at least 5 bytes to process?
+ ja .L0508b
+
+ cmp $2, %edx # at least 3 bytes to process?
+ ja .L0304b
+
+ test %edx, %edx # buffer empty?
+ jnz .L0102b
+
+ xor %eax, %eax # empty buffer always matches
+ ret
+
+.L0102b:
+ movzbl (%rdi), %eax # load 1--2 bytes from first buffer
+ movzbl -1(%rdi, %rdx, 1), %ecx
+ xor (%rsi), %al # xor in second buffer
+ xor -1(%rsi, %rdx, 1), %cl
+ or %ecx, %eax # mismatch in any of the two?
+ ret
+
+.L0304b:
+ movzwl (%rdi), %eax
+ movzwl -2(%rdi, %rdx, 1), %ecx
+ xor (%rsi), %ax
+ xor -2(%rsi, %rdx, 1), %cx
+ or %ecx, %eax
+ ret
+
+.L0508b:
+ mov (%rdi), %eax
+ mov -4(%rdi, %rdx, 1), %ecx
+ xor (%rsi), %eax
+ xor -4(%rsi, %rdx, 1), %ecx
+ or %ecx, %eax
+ ret
+
+.L0916b:
+ mov (%rdi), %rax
+ mov -8(%rdi, %rdx, 1), %rcx
+ xor (%rsi), %rax
+ xor -8(%rsi, %rdx, 1), %rcx
+ or %rcx, %rax
+ setnz %al # ensure EAX nonzero even if only
+ ret # high bits of RAX were set
+
+.L1732b:
+ movdqu (%rdi), %xmm0
+ movdqu (%rsi), %xmm2
+ movdqu -16(%rdi, %rdx, 1), %xmm1
+ movdqu -16(%rsi, %rdx, 1), %xmm3
+ pcmpeqb %xmm2, %xmm0
+ pcmpeqb %xmm3, %xmm1
+ pand %xmm1, %xmm0
+ pmovmskb %xmm0, %eax # 1 where equal
+ xor $0xffff, %eax # 1 where not equal
+ ret
+
+ /* more than 32 bytes: process buffer in a loop */
+.Lgt32b:
+ movdqu (%rdi), %xmm4
+ movdqu (%rsi), %xmm2
+ movdqu 16(%rdi), %xmm1
+ movdqu 16(%rsi), %xmm3
+ mov $64, %ecx
+ pcmpeqb %xmm2, %xmm4
+ pcmpeqb %xmm3, %xmm1
+ pand %xmm1, %xmm4
+ cmp %rdx, %rcx # enough left for a full iteration?
+ jae .Ltailb
+
+ /* main loop processing 32 bytes per iteration */
+ ALIGN_TEXT
+0: movdqu -32(%rdi, %rcx, 1), %xmm0
+ movdqu -32(%rsi, %rcx, 1), %xmm2
+ movdqu -16(%rdi, %rcx, 1), %xmm1
+ movdqu -16(%rsi, %rcx, 1), %xmm3
+ add $32, %rcx
+ pcmpeqb %xmm2, %xmm0
+ pcmpeqb %xmm3, %xmm1
+ pand %xmm1, %xmm0
+ pand %xmm0, %xmm4
+ cmp %rdx, %rcx
+ jb 0b
+
+ /* process last 32 bytes */
+.Ltailb:
+ movdqu -32(%rdi, %rdx, 1), %xmm0
+ movdqu -32(%rsi, %rdx, 1), %xmm2
+ movdqu -16(%rdi, %rdx, 1), %xmm1
+ movdqu -16(%rsi, %rdx, 1), %xmm3
+ pcmpeqb %xmm2, %xmm0
+ pcmpeqb %xmm3, %xmm1
+ pand %xmm1, %xmm0
+ pand %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ xor $0xffff, %eax
+ ret
+ARCHEND(timingsafe_bcmp, baseline)
+
+ .section .note.GNU-stack,"",%progbits
diff --git a/lib/libc/amd64/string/timingsafe_memcmp.S b/lib/libc/amd64/string/timingsafe_memcmp.S
new file mode 100644
index 000000000000..3f1eccdbd640
--- /dev/null
+++ b/lib/libc/amd64/string/timingsafe_memcmp.S
@@ -0,0 +1,145 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+/* int timingsafe_memcmp(const void *rdi, const void *rsi, size_t rdx) */
+ENTRY(timingsafe_memcmp)
+ cmp $16, %rdx # at least 17 bytes to process?
+ ja .Lgt16
+
+ cmp $8, %edx # at least 9 bytes to process?
+ ja .L0916
+
+ cmp $4, %edx # at least 5 bytes to process?
+ ja .L0508
+
+ cmp $2, %edx # at least 3 bytes to process?
+ ja .L0304
+
+ test %edx, %edx # buffer empty?
+ jnz .L0102
+
+ xor %eax, %eax # empty buffer always matches
+ ret
+
+.L0102: movzbl -1(%rdi, %rdx, 1), %eax # load 1--2 bytes from first buffer
+ movzbl -1(%rsi, %rdx, 1), %ecx
+ mov (%rdi), %ah # in big endian
+ mov (%rsi), %ch
+ sub %ecx, %eax
+ ret
+
+.L0304: movzwl -2(%rdi, %rdx, 1), %ecx
+ movzwl -2(%rsi, %rdx, 1), %edx
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %esi
+ bswap %ecx # convert to big endian
+ bswap %edx # ditto for edx, (e)ax, and (e)si
+ rol $8, %ax # ROLW is used here so the upper two
+ rol $8, %si # bytes stay clear, allowing us to
+ sub %edx, %ecx # save a SBB compared to .L0508
+ sbb %esi, %eax
+ or %eax, %ecx # nonzero if not equal
+ setnz %al
+ ret
+
+.L0508: mov -4(%rdi, %rdx, 1), %ecx
+ mov -4(%rsi, %rdx, 1), %edx
+ mov (%rdi), %edi
+ mov (%rsi), %esi
+ bswap %ecx # compare in big endian
+ bswap %edx
+ bswap %edi
+ bswap %esi
+ sub %edx, %ecx
+ sbb %esi, %edi
+ sbb %eax, %eax # -1 if less, 0 if greater or equal
+ or %edi, %ecx # nonzero if not equal
+ setnz %al # negative if <, 0 if =, 1 if >
+ ret
+
+.L0916: mov -8(%rdi, %rdx, 1), %rcx
+ mov -8(%rsi, %rdx, 1), %rdx
+ mov (%rdi), %rdi
+ mov (%rsi), %rsi
+ bswap %rcx # compare in big endian
+ bswap %rdx
+ bswap %rdi
+ bswap %rsi
+ sub %rdx, %rcx
+ sbb %rsi, %rdi
+ sbb %eax, %eax # -1 if less, 0 if greater or equal
+ or %rdi, %rcx # nonzero if not equal
+ setnz %al # negative if <, 0 if =, 1 if >
+ ret
+
+ /* compare 17+ bytes */
+.Lgt16: mov (%rdi), %r8 # process first 16 bytes
+ mov (%rsi), %r9
+ mov $32, %ecx
+ cmp %r8, %r9 # mismatch in head?
+ cmove 8(%rdi), %r8 # if not, try second pair
+ cmove 8(%rsi), %r9
+ cmp %rdx, %rcx
+ jae .Ltail
+
+ /* main loop processing 16 bytes per iteration */
+ ALIGN_TEXT
+0: mov -16(%rdi, %rcx, 1), %r10
+ mov -16(%rsi, %rcx, 1), %r11
+ cmp %r10, %r11 # mismatch in first pair?
+ cmove -8(%rdi, %rcx, 1), %r10 # if not, try second pair
+ cmove -8(%rsi, %rcx, 1), %r11
+ cmp %r8, %r9 # was there a mismatch previously?
+ cmove %r10, %r8 # apply new pair if there was not
+ cmove %r11, %r9
+ add $16, %rcx
+ cmp %rdx, %rcx
+ jb 0b
+
+.Ltail: mov -8(%rdi, %rdx, 1), %r10
+ mov -8(%rsi, %rdx, 1), %r11
+ cmp %r8, %r9
+ cmove -16(%rdi, %rdx, 1), %r8
+ cmove -16(%rsi, %rdx, 1), %r9
+ bswap %r10 # compare in big endian
+ bswap %r11
+ bswap %r8
+ bswap %r9
+ sub %r11, %r10
+ sbb %r9, %r8
+ sbb %eax, %eax # -1 if less, 0 if greater or equal
+ or %r10, %r8 # nonzero if not equal
+ setnz %al # negative if <, 0 if =, 1 if >
+ ret
+END(timingsafe_memcmp)
+
+ .section .note.GNU-stack,"",%progbits
diff --git a/lib/libc/amd64/sys/Makefile.inc b/lib/libc/amd64/sys/Makefile.inc
deleted file mode 100644
index 43af1d2a85a2..000000000000
--- a/lib/libc/amd64/sys/Makefile.inc
+++ /dev/null
@@ -1,12 +0,0 @@
-# from: Makefile.inc,v 1.1 1993/09/03 19:04:23 jtc Exp
-
-SRCS+= \
- amd64_get_fsbase.c \
- amd64_get_gsbase.c \
- amd64_set_fsbase.c \
- amd64_set_gsbase.c
-
-MDASM= vfork.S cerror.S getcontext.S
-
-# Don't generate default code for these syscalls:
-NOASM+= sbrk.o vfork.o
diff --git a/lib/libc/amd64/sys/amd64_get_fsbase.c b/lib/libc/amd64/sys/amd64_get_fsbase.c
deleted file mode 100644
index 4784bb0baf42..000000000000
--- a/lib/libc/amd64/sys/amd64_get_fsbase.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause
- *
- * Copyright (c) 2003 Peter Wemm
- * Copyright (c) 2017, 2018 The FreeBSD Foundation
- * All rights reserved.
- *
- * Portions of this software were developed by Konstantin Belousov
- * under sponsorship from the FreeBSD Foundation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-#define _WANT_P_OSREL
-#include <sys/param.h>
-#include <machine/cpufunc.h>
-#include <machine/specialreg.h>
-#include <machine/sysarch.h>
-#include <x86/ifunc.h>
-#include "libc_private.h"
-
-static int
-amd64_get_fsbase_cpu(void **addr)
-{
-
- *addr = (void *)rdfsbase();
- return (0);
-}
-
-static int
-amd64_get_fsbase_syscall(void **addr)
-{
-
- return (sysarch(AMD64_GET_FSBASE, addr));
-}
-
-DEFINE_UIFUNC(, int, amd64_get_fsbase, (void **))
-{
-
- if (__getosreldate() >= P_OSREL_WRFSBASE &&
- (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0)
- return (amd64_get_fsbase_cpu);
- return (amd64_get_fsbase_syscall);
-}
diff --git a/lib/libc/amd64/sys/amd64_get_gsbase.c b/lib/libc/amd64/sys/amd64_get_gsbase.c
deleted file mode 100644
index c81773c4b78c..000000000000
--- a/lib/libc/amd64/sys/amd64_get_gsbase.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause
- *
- * Copyright (c) 2003 Peter Wemm
- * Copyright (c) 2017, 2018 The FreeBSD Foundation
- * All rights reserved.
- *
- * Portions of this software were developed by Konstantin Belousov
- * under sponsorship from the FreeBSD Foundation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-#define _WANT_P_OSREL
-#include <sys/param.h>
-#include <machine/cpufunc.h>
-#include <machine/specialreg.h>
-#include <machine/sysarch.h>
-#include <x86/ifunc.h>
-#include "libc_private.h"
-
-static int
-amd64_get_gsbase_cpu(void **addr)
-{
-
- *addr = (void *)rdgsbase();
- return (0);
-}
-
-static int
-amd64_get_gsbase_syscall(void **addr)
-{
-
- return (sysarch(AMD64_GET_GSBASE, addr));
-}
-
-DEFINE_UIFUNC(, int, amd64_get_gsbase, (void **))
-{
-
- if (__getosreldate() >= P_OSREL_WRFSBASE &&
- (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0)
- return (amd64_get_gsbase_cpu);
- return (amd64_get_gsbase_syscall);
-}
diff --git a/lib/libc/amd64/sys/cerror.S b/lib/libc/amd64/sys/cerror.S
deleted file mode 100644
index d0b11888562e..000000000000
--- a/lib/libc/amd64/sys/cerror.S
+++ /dev/null
@@ -1,58 +0,0 @@
-/*-
- * Copyright (c) 1990 The Regents of the University of California.
- * All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * William Jolitz.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#if defined(SYSLIBC_SCCS) && !defined(lint)
- .asciz "@(#)cerror.s 5.1 (Berkeley) 4/23/90"
-#endif /* SYSLIBC_SCCS and not lint */
-#include <machine/asm.h>
-#include "SYS.h"
-
- .globl HIDENAME(cerror)
- .hidden HIDENAME(cerror)
-
- /*
- * The __error() function is thread aware. For non-threaded
- * programs and the initial thread in threaded programs,
- * it returns a pointer to the global errno variable.
- */
- .globl CNAME(__error)
- .type CNAME(__error),@function
-HIDENAME(cerror):
- pushq %rax
- call PIC_PLT(CNAME(__error))
- popq %rcx
- movl %ecx,(%rax)
- movq $-1,%rax
- movq $-1,%rdx
- ret
-
- .section .note.GNU-stack,"",%progbits
diff --git a/lib/libc/amd64/sys/vfork.S b/lib/libc/amd64/sys/vfork.S
deleted file mode 100644
index 11faadc6b310..000000000000
--- a/lib/libc/amd64/sys/vfork.S
+++ /dev/null
@@ -1,52 +0,0 @@
-/*-
- * Copyright (c) 1990 The Regents of the University of California.
- * All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * William Jolitz.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#if defined(SYSLIBC_SCCS) && !defined(lint)
- .asciz "@(#)Ovfork.s 5.1 (Berkeley) 4/23/90"
-#endif /* SYSLIBC_SCCS and not lint */
-#include <machine/asm.h>
-#include "SYS.h"
-
- WEAK_REFERENCE(__sys_vfork, _vfork)
- WEAK_REFERENCE(__sys_vfork, vfork)
-ENTRY(__sys_vfork)
- popq %rsi /* fetch return address (%rsi preserved) */
- mov $SYS_vfork,%rax
- KERNCALL
- jb 1f
- jmp *%rsi
-1:
- pushq %rsi
- jmp HIDENAME(cerror)
-END(__sys_vfork)
-
- .section .note.GNU-stack,"",%progbits