diff options
Diffstat (limited to 'lib/libc/amd64/string/strlen.S')
-rw-r--r-- | lib/libc/amd64/string/strlen.S | 131 |
1 files changed, 131 insertions, 0 deletions
diff --git a/lib/libc/amd64/string/strlen.S b/lib/libc/amd64/string/strlen.S new file mode 100644 index 000000000000..cc248af001ac --- /dev/null +++ b/lib/libc/amd64/string/strlen.S @@ -0,0 +1,131 @@ +/*- + * Written by Mateusz Guzik <mjg@freebsd.org> + * Copyright (c) 2023 The FreeBSD Foundation + * + * Portions of this software were developed by Robert Clausecker + * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation. + * + * Public domain. + */ + +#include <machine/asm.h> +#include "amd64_archlevel.h" + +/* + * Note: this routine was written with kernel use in mind (read: no simd), + * it is only present in userspace as a temporary measure until something + * better gets imported. + */ + +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ + +ARCHFUNCS(strlen) + ARCHFUNC(strlen, scalar) + ARCHFUNC(strlen, baseline) +ENDARCHFUNCS(strlen) + +/* + * strlen(string) + * %rdi + * + * Uses the ((x - 0x01....01) & ~x & 0x80....80) trick. + * + * 0x01....01 is replaced with 0x0 - 0x01....01 so that it can be added + * with leaq. + * + * For a description see either: + * - "Hacker's Delight" by Henry S. Warren, Jr. + * - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms" + * by Agner Fog + * + * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386. + */ +ARCHENTRY(strlen, scalar) + movabsq $0xfefefefefefefeff,%r8 + movabsq $0x8080808080808080,%r9 + + movq %rdi,%r10 + movq %rdi,%rcx + testb $7,%dil + jz 2f + + /* + * Handle misaligned reads: align to 8 and fill + * the spurious bytes. + */ + andq $~7,%rdi + movq (%rdi),%r11 + shlq $3,%rcx + movq $-1,%rdx + shlq %cl,%rdx + notq %rdx + orq %rdx,%r11 + + leaq (%r11,%r8),%rcx + notq %r11 + andq %r11,%rcx + andq %r9,%rcx + jnz 3f + + /* + * Main loop. + */ + ALIGN_TEXT +1: + leaq 8(%rdi),%rdi +2: + movq (%rdi),%r11 + leaq (%r11,%r8),%rcx + notq %r11 + andq %r11,%rcx + andq %r9,%rcx + jz 1b +3: + bsfq %rcx,%rcx + shrq $3,%rcx + leaq (%rcx,%rdi),%rax + subq %r10,%rax + ret +ARCHEND(strlen, scalar) + +ARCHENTRY(strlen, baseline) + mov %rdi, %rcx + pxor %xmm1, %xmm1 + and $~0xf, %rdi # align string + pcmpeqb (%rdi), %xmm1 # compare head (with junk before string) + mov %rcx, %rsi # string pointer copy for later + and $0xf, %ecx # amount of bytes rdi is past 16 byte alignment + pmovmskb %xmm1, %eax + add $32, %rdi # advance to next iteration + shr %cl, %eax # clear out matches in junk bytes + test %eax, %eax # any match? (can't use ZF from SHR as CL=0 is possible) + jnz 2f + + ALIGN_TEXT +1: pxor %xmm1, %xmm1 + pcmpeqb -16(%rdi), %xmm1 # find NUL bytes + pmovmskb %xmm1, %eax + test %eax, %eax # were any NUL bytes present? + jnz 3f + + /* the same unrolled once more */ + pxor %xmm1, %xmm1 + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %eax + add $32, %rdi # advance to next iteration + test %eax, %eax + jz 1b + + /* match found in loop body */ + sub $16, %rdi # undo half the advancement +3: tzcnt %eax, %eax # find the first NUL byte + sub %rsi, %rdi # string length until beginning of (%rdi) + lea -16(%rdi, %rax, 1), %rax # that plus loc. of NUL byte: full string length + ret + + /* match found in head */ +2: tzcnt %eax, %eax # compute string length + ret +ARCHEND(strlen, baseline) + + .section .note.GNU-stack,"",%progbits |