aboutsummaryrefslogtreecommitdiff
path: root/lib/libc/amd64/string/stpncpy.S
blob: 5ce0dd093a9e78dcd4c087c122f983cbe4b49a1d (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

	/*
	 * Export the public name stpncpy as a weak alias of __stpncpy, so a
	 * strong definition of stpncpy elsewhere takes precedence at link
	 * time.
	 *
	 * The ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS macros come from
	 * amd64_archlevel.h, which is not visible from this file.  They list
	 * the implementations of __stpncpy provided below (scalar and
	 * baseline); presumably one of them is selected according to the CPU
	 * the program runs on -- confirm against amd64_archlevel.h.
	 * NOTE(review): the order of the ARCHFUNC lines may be significant
	 * to the dispatcher; do not reorder without checking.
	 */
	.weak stpncpy
	.set stpncpy, __stpncpy
ARCHFUNCS(__stpncpy)
	ARCHFUNC(__stpncpy, scalar)
	ARCHFUNC(__stpncpy, baseline)
ENDARCHFUNCS(__stpncpy)

/*
 * char *stpncpy(char *restrict rdi, const char *rsi, size_t rdx)
 *
 * Scalar variant, built from other libc routines:
 *
 *	end = memchr(src, '\0', len);
 *	if (end == NULL) {
 *		memcpy(dst, src, len);
 *		return (dst + len);
 *	}
 *	memcpy(dst, src, end - src);
 *	return (memset(dst + (end - src), 0, len - (end - src)));
 *
 * Frame layout (relative to %rbp):
 *	 -8	len, later the number of bytes left to clear
 *	-16	dst, later the address where the NUL padding starts
 *	-24	src
 *	-32	padding, keeps %rsp 16-byte aligned at every call
 */
ARCHENTRY(__stpncpy, scalar)
	push	%rbp			# conventional frame
	mov	%rsp, %rbp
	sub	$32, %rsp		# three argument slots plus padding

	mov	%rdx, -8(%rbp)		# save len
	mov	%rdi, -16(%rbp)		# save dst
	mov	%rsi, -24(%rbp)		# save src

	mov	%rsi, %rdi		# first argument: src
	xor	%esi, %esi		# second argument: '\0'
	call	CNAME(__memchr)		# rax = memchr(src, '\0', len)

	mov	-24(%rbp), %rsi		# memcpy source
	mov	-16(%rbp), %rdi		# memcpy destination
	test	%rax, %rax		# no NUL within len bytes?
	jz	.Lfullcopy

	sub	%rsi, %rax		# rax = length of the string
	mov	%rax, %rdx		# copy exactly that many bytes
	add	%rax, -16(%rbp)		# padding starts right behind the string
	sub	%rax, -8(%rbp)		# and covers the rest of the buffer
	call	CNAME(memcpy)

	mov	-16(%rbp), %rdi		# where the padding starts
	mov	-8(%rbp), %rdx		# how much padding is needed
	xor	%esi, %esi		# pad with NUL bytes
	leave				# tear down the frame before the tail call
	jmp	CNAME(memset)		# memset returns its first argument, which
					# is the address of the terminating NUL

	/* source fills the whole buffer: plain copy, no padding */
.Lfullcopy:
	mov	-8(%rbp), %rdx		# rdx = len
	call	CNAME(memcpy)		# rax = dst
	mov	-8(%rbp), %rdx
	lea	(%rax, %rdx, 1), %rax	# return dst + len
	leave
	ret
ARCHEND(__stpncpy, scalar)

	/*
	 * Sliding mask table: 16 bytes of 0xff followed by 16 bytes of
	 * 0x00.  A 16-byte load from .Lmask+n (0 <= n <= 16) yields 16-n
	 * bytes of 0xff followed by n bytes of 0x00.
	 */
	.section	.rodata
.Lmask:
	.fill		16, 1, 0xff
	.fill		16, 1, 0x00

/*
 * char *stpncpy(char *restrict rdi, const char *rsi, size_t rdx)
 *
 * SSE2 variant.  Copies the string at rsi into the rdx byte buffer at
 * rdi, fills the remainder of the buffer with NUL bytes and returns a
 * pointer to the first NUL written, or to rdi + rdx if the source did
 * not end within rdx bytes.
 *
 * The source is read in 16-byte chunks aligned to 16 bytes.  A chunk is
 * only ever loaded if at least one of its bytes belongs to the first
 * rdx bytes of the source or precedes the string's NUL byte, so the
 * source is never read beyond the 16-byte block holding the last byte
 * we are entitled to inspect.
 *
 * Register roles after the prologue:
 *	rsi	source, rounded down to a multiple of 16
 *	rcx	misalignment of the original source pointer (0--15)
 *	rdi	destination minus rcx, so rdi and rsi use the same offsets
 *	r10	rdx + rcx: buffer length counted from the alignment
 *		boundary; later the number of buffer bytes behind the
 *		chunk being processed
 *	r8d	bit mask of NUL bytes in the chunk under inspection
 *	r9d	bit mask of chunk bytes at or behind the string's start
 *	xmm1	all zeroes (re-established where clobbered)
 *
 * This routine makes no calls and keeps its scratch buffer below %rsp
 * without moving the stack pointer.
 */
ARCHENTRY(__stpncpy, baseline)
	/*
	 * 32 byte scratch area below the stack pointer.  With %rsp = 8
	 * (mod 16) on entry, bounce(%rsp) is 16-byte aligned.
	 */
#define bounce		(-3*16-8)		/* location of on-stack bounce buffer */

	test		%rdx, %rdx		# no bytes to copy?
	jz		.L0

	mov		%esi, %ecx
	and		$~0xf, %rsi		# align source to 16 bytes
	movdqa		(%rsi), %xmm0		# load head
	and		$0xf, %ecx		# offset from alignment
	mov		$-1, %r9d
	lea		-32(%rcx), %rax		# set up overflow-proof comparison rdx+rcx<32
	shl		%cl, %r9d		# mask of bytes belonging to the string
	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
	pxor		%xmm1, %xmm1
	movdqa		%xmm0, bounce(%rsp)	# stash copy of head on the stack
	pcmpeqb		%xmm1, %xmm0
	pmovmskb	%xmm0, %r8d

	lea		(%rdx, %rcx, 1), %r10	# buffer length from alignment boundary
	add		%rdx, %rax		# less than 2 chunks (32 bytes) to play with?
	jnc		.Lrunt			# if yes, use special runt processing

	/* from here on rdx + rcx >= 32, hence rdx >= 17 */
	movdqu		%xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination
	and		%r9d, %r8d		# end of string within head?
	jnz		.Lheadnul

	movdqu		(%rsi, %rcx, 1), %xmm2	# load head from source buffer
	movdqu		%xmm2, (%rdi, %rcx, 1)	# and deposit

	add		$16, %rsi
	add		$16, %rdi
	sub		$32, %r10		# r10 = buffer bytes behind chunk at (%rsi)

	/*
	 * main loop unrolled twice
	 *
	 * On entry the chunk at (%rsi) lies entirely within the buffer and
	 * r10 holds the number of buffer bytes following it.  r10 can be 0
	 * on the very first iteration (rdx + rcx == 32), it is nonzero on
	 * all later ones.
	 */
	ALIGN_TEXT
0:	movdqa		(%rsi), %xmm0
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		3f

	movdqu		%xmm0, (%rdi)
	cmp		$16, %r10		# more than a full chunk left?
	jbe		1f

	movdqa		16(%rsi), %xmm0
	add		$32, %rdi		# advance pointers to next chunk
	add		$32, %rsi
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		2f

	movdqu		%xmm0, -16(%rdi)
	sub		$32, %r10		# more than another full chunk left?
	ja		0b

	sub		$16, %rdi		# undo second advancement
	sub		$16, %rsi
	add		$16, %r10d		# restore number of remaining bytes

	/*
	 * 0--16 bytes left but string has not ended yet
	 *
	 * The chunk at (%rsi) has been copied in full and r10 bytes of
	 * buffer follow it.  If r10 is 0, the buffer ended together with
	 * that chunk: the chunk at 16(%rsi) is then completely outside of
	 * the bytes we may read (and possibly on an unmapped page), so
	 * return without looking at it.
	 */
1:	lea		16(%rdi), %rax		# return value if buffer is exhausted
	test		%r10, %r10		# nothing left to copy?
	jz		4f			# if so, must not load the next chunk

	pxor		%xmm1, %xmm1
	pcmpeqb		16(%rsi), %xmm1		# NUL byte in source tail?
	pmovmskb	%xmm1, %r8d
	bts		%r10d, %r8d		# treat end of buffer as NUL
	tzcnt		%r8d, %r8d		# where is the NUL byte?
	movdqu		(%rsi, %r8, 1), %xmm0	# load source tail before NUL
	lea		16(%rdi, %r8, 1), %rax	# point return value to NUL byte
						# or end of buffer
	movdqu		%xmm0, (%rdi, %r8, 1)	# store tail into the buffer
4:	ret

2:	sub		$16, %rdi		# undo second advancement
	sub		$16, %rsi
	sub		$16, %r10

	/*
	 * string has ended and buffer has not
	 *
	 * xmm0 holds the chunk with the NUL byte, r8d its NUL mask.  The
	 * chunk lies entirely within the buffer.
	 */
3:	tzcnt		%r8d, %r8d		# where did the string end?
	lea		.Lmask+16(%rip), %rcx
	lea		(%rdi, %r8, 1), %rax 	# where the NUL byte will be
	neg		%r8
	movdqu		(%rcx, %r8, 1), %xmm1	# mask with FF where the string is,
						# 00 where it is not
	pand		%xmm1, %xmm0		# mask out bytes after the string
	movdqu		%xmm0, (%rdi)	 	# store masked current chunk
	pxor		%xmm1, %xmm1
	sub		$16, %r10		# another full chunk left?
	jbe		1f

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu		%xmm1, 16(%rdi)
	cmp		$16, %r10
	jbe		1f

	movdqu		%xmm1, 32(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		0b

1:	ret

	/*
	 * at least two chunks to play with and NUL while processing head
	 *
	 * The 16 bytes loaded from the bounce buffer may include stale
	 * stack contents behind the head chunk; everything from the NUL
	 * byte onwards is overwritten with zeroes right afterwards.
	 */
.Lheadnul:
	movdqu		bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack
	tzcnt		%r8d, %r8d		# find location of NUL byte
	movdqu		%xmm0, (%rdi, %rcx, 1)	# deposit head in the destination
	movdqu		%xmm1, (%rdi, %r8, 1)	# clear out following bytes
	movdqu		%xmm1, 16(%rdi)		# clear out second chunk
	lea		(%rdi, %r8, 1), %rax	# make RAX point to the NUL byte

	add		$32, %rdi		# advance past first two chunks
	sub		$32+16, %r10		# advance past first three chunks
	jbe		1f			# did we pass the end of the buffer?

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu		%xmm1, (%rdi)		# clear out buffer chunk
	cmp		$16, %r10
	jbe		1f

	movdqu		%xmm1, 16(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		0b

1:	ret

	/*
	 * 1--31 bytes from the alignment boundary to the end of the buffer
	 * (r10 = rdx + rcx < 32): bounce through the stack
	 *
	 * A NUL-padded image of the relevant source bytes is assembled in
	 * the bounce buffer, then rdx bytes of it are copied out using two
	 * possibly overlapping moves.
	 */
.Lrunt:	movdqa		%xmm1, bounce+16(%rsp)	# clear out rest of on-stack copy
	bts		%r10d, %r8d		# treat end of buffer as end of string
	and		%r9w, %r8w		# end of string within first buffer?
	jnz		0f			# if yes, do not inspect second buffer

	/*
	 * If the buffer ends exactly at the end of the first chunk, the
	 * second chunk holds none of our bytes and may be on an unmapped
	 * page.  Bit 16 of r8d already marks the end of the buffer and the
	 * second half of the bounce buffer is already clear.
	 */
	cmp		$16, %r10d		# buffer ends with the first chunk?
	je		0f			# if yes, second chunk is off limits

	movdqa		16(%rsi), %xmm0		# load second chunk of input
	movdqa		%xmm0, bounce+16(%rsp)	# stash copy on stack
	pcmpeqb		%xmm1, %xmm0		# NUL in second chunk?
	pmovmskb	%xmm0, %r9d
	shl		$16, %r9d
	or		%r9d, %r8d		# merge found NUL bytes into NUL mask

	/* end of string after one buffer */
0:	tzcnt		%r8d, %r8d		# location of last char in string
	movdqu		%xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string
	lea		bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack
	lea		(%rdi, %r8, 1), %rax	# return pointer to NUL byte

	cmp		$16, %edx		# at least 16 bytes to transfer?
	jae		.L1631

	mov		(%rsi), %r8		# load string head
	cmp		$8, %edx		# at least 8 bytes to transfer?
	jae		.L0815

	cmp		$4, %edx		# at least 4 bytes to transfer?
	jae		.L0407

	/* 1--3 bytes: first byte, then the last two if there are that many */
	movzwl		-2(%rsi, %rdx, 1), %esi	# load last two bytes of string
	mov		%r8b, (%rdi, %rcx, 1)	# store first byte

	cmp		$2, %edx		# at least 2 bytes to transfer?
	jb		.L1

	mov		%si, -2(%rdi, %r10, 1)	# store last two bytes of string
.L1:	ret

	/* 16--31 bytes: first and last 16 bytes, overlapping in the middle */
.L1631:	movdqu		(%rsi), %xmm0		# load first 16 bytes of string
	movdqu		-16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string
	movdqu		%xmm0, (%rdi, %rcx, 1)
	movdqu		%xmm1, -16(%rdi, %r10, 1)
	ret

	/* 8--15 bytes: first and last 8 bytes, overlapping in the middle */
.L0815:	mov		-8(%rsi, %rdx, 1), %rdx	# load last 8 bytes of string
	mov		%r8, (%rdi, %rcx, 1)
	mov		%rdx, -8(%rdi, %r10, 1)
	ret

	/* 4--7 bytes: first and last 4 bytes, overlapping in the middle */
.L0407:	mov		-4(%rsi, %rdx, 1), %edx	# load last four bytes of string
	mov		%r8d, (%rdi, %rcx, 1)
	mov		%edx, -4(%rdi, %r10, 1)
	ret

	/* length 0 buffer: just return dest */
.L0:	mov		%rdi, %rax
	ret
ARCHEND(__stpncpy, baseline)

	.section .note.GNU-stack,"",%progbits