aboutsummaryrefslogtreecommitdiff
path: root/lib/libc/amd64/string/strlen.S
diff options
context:
space:
mode:
Diffstat (limited to 'lib/libc/amd64/string/strlen.S')
-rw-r--r-- lib/libc/amd64/string/strlen.S 131
1 files changed, 131 insertions, 0 deletions
diff --git a/lib/libc/amd64/string/strlen.S b/lib/libc/amd64/string/strlen.S
new file mode 100644
index 000000000000..cc248af001ac
--- /dev/null
+++ b/lib/libc/amd64/string/strlen.S
@@ -0,0 +1,131 @@
+/*-
+ * Written by Mateusz Guzik <mjg@freebsd.org>
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include "amd64_archlevel.h"
+
+/*
+ * Note: the scalar routine was written with kernel use in mind (read: no
+ * simd); it is only present in userspace as a temporary measure until
+ * something better gets imported.
+ */
+
+#define ALIGN_TEXT .p2align 4,0x90 /* align to a 16-byte boundary, padding with 0x90 (nop) bytes */
+
+ARCHFUNCS(strlen) /* table of strlen variants; macros presumably come from amd64_archlevel.h - confirm */
+ ARCHFUNC(strlen, scalar) /* integer-only implementation defined below */
+ ARCHFUNC(strlen, baseline) /* SSE implementation (pxor/pcmpeqb/pmovmskb) defined below */
+ENDARCHFUNCS(strlen) /* NOTE(review): presumably one variant is selected per CPU at run time - verify against amd64_archlevel.h */
+
+/*
+ * strlen(string)
+ * %rdi
+ *
+ * Uses the ((x - 0x01....01) & ~x & 0x80....80) trick.
+ *
+ * 0x01....01 is replaced with 0x0 - 0x01....01 so that it can be added
+ * with leaq.
+ *
+ * For a description see either:
+ * - "Hacker's Delight" by Henry S. Warren, Jr.
+ * - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms"
+ * by Agner Fog
+ *
+ * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
+ */
+ARCHENTRY(strlen, scalar) /* SIMD-free variant: examines one 8-byte word per step */
+ movabsq $0xfefefefefefefeff,%r8 /* r8 = 0 - 0x01....01, so adding it subtracts 1 from every byte */
+ movabsq $0x8080808080808080,%r9 /* r9 = top bit of every byte */
+
+ movq %rdi,%r10 /* r10 = start of string, kept for the final subtraction */
+ movq %rdi,%rcx /* rcx = pointer copy, source of the shift count below */
+ testb $7,%dil /* any of the low three address bits set? */
+ jz 2f /* no: already 8-byte aligned, enter the loop directly */
+
+ /*
+ * Handle misaligned reads: align down to 8 and set the bytes
+ * preceding the string to 0xff so they cannot match as NUL.
+ */
+ andq $~7,%rdi /* round the pointer down to an 8-byte boundary */
+ movq (%rdi),%r11 /* load the word that contains the string start */
+ shlq $3,%rcx /* 8 * pointer; the shift below only uses the low 6 bits of %cl, i.e. 8 * (pointer & 7) */
+ movq $-1,%rdx /* rdx = all ones */
+ shlq %cl,%rdx /* clear the bits of the bytes that precede the string */
+ notq %rdx /* rdx = ones in exactly those preceding bytes */
+ orq %rdx,%r11 /* force the preceding bytes to 0xff */
+
+ leaq (%r11,%r8),%rcx /* rcx = x - 0x01....01 */
+ notq %r11 /* r11 = ~x */
+ andq %r11,%rcx /* rcx = (x - 0x01....01) & ~x */
+ andq %r9,%rcx /* keep the per-byte top bits: nonzero iff the word holds a NUL */
+ jnz 3f /* NUL is already in the first word */
+
+ /*
+ * Main loop: test one aligned 8-byte word per iteration.
+ */
+ ALIGN_TEXT
+1:
+ leaq 8(%rdi),%rdi /* advance to the next 8-byte word */
+2: /* loop entry for a string that was aligned from the start */
+ movq (%rdi),%r11 /* x = next word of the string */
+ leaq (%r11,%r8),%rcx /* rcx = x - 0x01....01 */
+ notq %r11 /* r11 = ~x */
+ andq %r11,%rcx /* rcx = (x - 0x01....01) & ~x */
+ andq %r9,%rcx /* nonzero iff the word holds a NUL */
+ jz 1b /* no NUL in this word: keep scanning */
+3: /* rcx has a flag bit set for the first NUL byte; rdi points at its word */
+ bsfq %rcx,%rcx /* bit index of the lowest flag = first NUL byte */
+ shrq $3,%rcx /* bit index -> byte index within the word */
+ leaq (%rcx,%rdi),%rax /* rax = address of the NUL byte */
+ subq %r10,%rax /* length = NUL address - start of string */
+ ret
+ARCHEND(strlen, scalar)
+
+ARCHENTRY(strlen, baseline) /* SSE variant: 16 bytes per compare, 32 bytes per loop iteration */
+ mov %rdi, %rcx # save the unaligned string pointer
+ pxor %xmm1, %xmm1 # xmm1 = 0 in every byte lane
+ and $~0xf, %rdi # align string pointer down to 16 bytes
+ pcmpeqb (%rdi), %xmm1 # compare head (with junk before string)
+ mov %rcx, %rsi # string pointer copy for later
+ and $0xf, %ecx # amount of bytes rdi is past 16 byte alignment
+ pmovmskb %xmm1, %eax # one mask bit per byte, set where the byte was NUL
+ add $32, %rdi # advance to next iteration
+ shr %cl, %eax # clear out matches in junk bytes
+ test %eax, %eax # any match? (can't use ZF from SHR as CL=0 is possible)
+ jnz 2f # NUL found within the head
+
+ ALIGN_TEXT
+1: pxor %xmm1, %xmm1 # zero the compare register again
+ pcmpeqb -16(%rdi), %xmm1 # find NUL bytes
+ pmovmskb %xmm1, %eax # collect the compare results as a bit mask
+ test %eax, %eax # were any NUL bytes present?
+ jnz 3f # yes: the matching chunk starts at rdi - 16
+
+ /* the same unrolled once more */
+ pxor %xmm1, %xmm1
+ pcmpeqb (%rdi), %xmm1 # find NUL bytes in the following 16 bytes
+ pmovmskb %xmm1, %eax
+ add $32, %rdi # advance to next iteration
+ test %eax, %eax # were any NUL bytes present?
+ jz 1b # no: continue with the next 32 bytes
+
+ /* match found in loop body */
+ sub $16, %rdi # undo half the advancement, so the matching chunk starts at rdi - 16
+3: tzcnt %eax, %eax # find the first NUL byte (eax is nonzero here)
+ sub %rsi, %rdi # string length until beginning of (%rdi)
+ lea -16(%rdi, %rax, 1), %rax # that plus loc. of NUL byte: full string length
+ ret
+
+ /* match found in head */
+2: tzcnt %eax, %eax # compute string length (mask was shifted so bit 0 is the first string byte)
+ ret
+ARCHEND(strlen, baseline)
+
+ .section .note.GNU-stack,"",%progbits /* empty marker section: this object does not need an executable stack */