/* xref: /freebsd/lib/libc/amd64/string/strlcpy.S (revision f552d7ad) */
/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

/* Align the next instruction to 16 bytes (2^4), padding with 0x90 (NOP). */
#define ALIGN_TEXT	.p2align 4, 0x90

	/*
	 * Export strlcpy as a weak alias of __strlcpy.
	 *
	 * The table below lists the two variants implemented in this file.
	 * How one of them gets bound to __strlcpy is defined by the ARCHFUNCS
	 * macros in amd64_archlevel.h (not visible here).
	 */
	.weak strlcpy
	.set strlcpy, __strlcpy
ARCHFUNCS(__strlcpy)
	ARCHFUNC(__strlcpy, scalar)
	ARCHFUNC(__strlcpy, baseline)
ENDARCHFUNCS(__strlcpy)

/*
 * __strlcpy, scalar variant: measure the source with strlen(), then let
 * memcpy() move min(strlen(src), len - 1) bytes and terminate the result.
 *
 * In:   %rdi = destination buffer
 *       %rsi = source string
 *       %rdx = size of the destination buffer (len)
 * Out:  %rax = strlen(src)
 *
 * %rbx (callee-saved, spilled below) carries the string length across
 * the memcpy call.  The source pointer lives in the stack slot at
 * -8(%rbp) while strlen runs.  Given the usual 16-byte stack alignment
 * at call sites, the push sequence keeps %rsp 16-byte aligned at both
 * calls (five pushes before strlen, three remaining before memcpy).
 */
ARCHENTRY(__strlcpy, scalar)
	push	%rbp		# establish stack frame
	mov	%rsp, %rbp
	push	%rsi		# src, reloaded from -8(%rbp) below
	push	%rbx		# callee-saved scratch for the return value
	push	%rdi		# dst
	push	%rdx		# len
	mov	%rsi, %rdi
	call	CNAME(strlen)	# strlen(src)
	pop	%rdx
	pop	%rdi
	mov	-8(%rbp), %rsi
	mov	%rax, %rbx	# remember string length for return value
	sub	$1, %rdx	# do not copy into the final byte of the buffer
	jc	0f		# skip copying altogether if buffer was empty
	cmp	%rax, %rdx	# is the buffer longer than the input?
	cmova	%rax, %rdx	# if yes, only copy the part that fits
	movb	$0, (%rdi, %rdx, 1) # NUL-terminate output buffer
	call	CNAME(memcpy)	# copy string to output
0:	mov	%rbx, %rax	# restore return value
	pop	%rbx
	leave			# drops the saved src slot and restores RBP
	ret
ARCHEND(__strlcpy, scalar)

/*
 * __strlcpy, baseline variant: copies in 16-byte chunks using XMM
 * registers and looks for the terminating NUL with pcmpeqb/pmovmskb.
 *
 * In:   %rdi = destination buffer
 *       %rsi = source string
 *       %rdx = size of the destination buffer (len)
 * Out:  %rax = strlen(src)
 *
 * Register roles after the entry check:
 *   %r9    original (possibly unaligned) source pointer
 *   %rsi   source pointer rounded down to 16 bytes, advanced chunkwise
 *   %ecx   misalignment of the source (src & 0xf)
 *   %rdx   bytes that may still be copied; the byte reserved for the
 *          NUL terminator is never counted
 *   %rdi   destination pointer; inside the main loop and its exits it
 *          holds the distance dst - src so that (%rsi, %rdi) addresses
 *          the destination byte matching (%rsi)
 *   %xmm1  zero operand for pcmpeqb / NUL match result
 *
 * The first load is an aligned load from the rounded-down pointer, so it
 * may cover bytes in front of the string; matches there are masked off.
 * Every tzcnt below operates on a mask known to be nonzero.
 *
 * Layout:
 *   entry            head chunk and second chunk ("runt" detection)
 *   main loop        unrolled twice, one 16-byte chunk per half
 *   1:/2:            buffer ends before the string does
 *   4:/3:            string ends before the buffer does
 *   .Lhead_buf_end   buffer ends within the first two chunks
 *   .Lsecond_nul     string ends in the second chunk
 *   .Lhead_nul       string ends in the head chunk
 *   .L0031/.L0015    short copies of 16--32 and 0--15 bytes
 *   .L0              len == 0: plain strlen
 */
ARCHENTRY(__strlcpy, baseline)
	sub		$1, %rdx		# do not count NUL byte in buffer length
	jb		.L0			# go to special code path if len was 0

	mov		%esi, %ecx		# low bits of src, masked to src & 0xf below
	pxor		%xmm1, %xmm1
	mov		%rsi, %r9		# stash a copy of the source pointer for later
	and		$~0xf, %rsi		# round source down to a 16 byte boundary
	pcmpeqb		(%rsi), %xmm1		# NUL found in head?
	mov		$-1, %r8d
	and		$0xf, %ecx
	shl		%cl, %r8d		# mask of bytes in the string
	pmovmskb	%xmm1, %eax
	and		%r8d, %eax		# discard matches in front of the string
	jnz		.Lhead_nul

	movdqa		16(%rsi), %xmm3		# load second string chunk
	movdqu		(%r9), %xmm2		# load unaligned string head
	mov		$32, %r8d
	sub		%ecx, %r8d		# head length + length of second chunk
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm3, %xmm1		# NUL found in second chunk?

	sub		%r8, %rdx		# enough space left for the second chunk?
	jbe		.Lhead_buf_end

	/* process second chunk */
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		.Lsecond_nul

	/* string didn't end in second chunk and neither did buffer -- not a runt! */
	movdqa		32(%rsi), %xmm0		# load next string chunk
	pxor		%xmm1, %xmm1
	movdqu		%xmm2, (%rdi)		# deposit head into buffer
	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
	movdqu		%xmm3, 16(%rdi)		# deposit second chunk
	sub		%rsi, %rdi		# express RDI as distance from RSI
	add		$32, %rsi		# advance RSI past first two chunks
	sub		$16, %rdx		# enough left for another round?
	jbe		1f

	/*
	 * main loop unrolled twice
	 *
	 * Invariant at 0: XMM0 holds the chunk at (%rsi), XMM1 is zero and
	 * RDX > 0 is the room left in the buffer after that chunk.
	 */
	ALIGN_TEXT
0:	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		3f

	movdqu		%xmm0, (%rsi, %rdi)
	movdqa		16(%rsi), %xmm0		# load next string chunk
	pxor		%xmm1, %xmm1
	cmp		$16, %rdx		# more than a full chunk left?
	jbe		2f

	add		$32, %rsi		# advance pointers to next chunk
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		4f

	movdqu		%xmm0, -16(%rsi, %rdi)
	movdqa		(%rsi), %xmm0		# load next string chunk
	pxor		%xmm1, %xmm1
	sub		$32, %rdx
	ja		0b

1:	sub		$16, %rsi		# undo second advancement
	add		$16, %edx		# back to 1--16 bytes of room

	/*
	 * 1--16 bytes left in the buffer but string has not ended yet
	 *
	 * Here XMM0 holds the chunk at 16(%rsi) and EDX the room left.
	 * The tail is copied as one 16 byte block that ends at the last
	 * byte to be copied, overlapping data already stored.
	 */
2:	pcmpeqb		%xmm1, %xmm0		# NUL byte encountered?
	pmovmskb	%xmm0, %r8d
	mov		%r8d, %eax		# keep the unmodified NUL mask
	bts		%edx, %r8d		# treat end of buffer as end of string
	tzcnt		%r8d, %r8d		# find tail length
	add		%rsi, %rdi		# restore RDI
	movdqu		(%rsi, %r8, 1), %xmm0	# load string tail
	movdqu		%xmm0, (%rdi, %r8, 1)	# store string tail
	movb		$0, 16(%rdi, %r8, 1)	# NUL terminate

	/*
	 * continue to find the end of the string
	 *
	 * The destination is complete; only the return value is missing.
	 */
	test		%eax, %eax		# end of string already reached?
	jnz		1f

	ALIGN_TEXT
0:	pcmpeqb		32(%rsi), %xmm1
	pmovmskb	%xmm1, %eax
	pxor		%xmm1, %xmm1
	test		%eax, %eax
	jnz		2f

	pcmpeqb		48(%rsi), %xmm1
	pmovmskb	%xmm1, %eax
	add		$32, %rsi
	pxor		%xmm1, %xmm1
	test		%eax, %eax
	jz		0b

1:	sub		$16, %rsi		# undo second advancement
2:	tzcnt		%eax, %eax		# where is the NUL byte?
	sub		%r9, %rsi
	lea		32(%rsi, %rax, 1), %rax	# return string length
	ret

4:	sub		$16, %rsi		# undo second advancement
	add		$16, %rdx		# restore number of remaining bytes

	/* string has ended but buffer has not */
3:	tzcnt		%eax, %eax		# find length of string tail
	movdqu		-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL)
	add		%rsi, %rdi		# restore destination pointer
	movdqu		%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL)
	sub		%r9, %rsi		# string length to current chunk
	add		%rsi, %rax		# plus length of current chunk
	ret

	/*
	 * Buffer ends within the head or the second chunk.  Only the NUL
	 * terminator is written here; the data is copied at .L0031 once
	 * the string length for the return value is known.
	 */
.Lhead_buf_end:
	pmovmskb	%xmm1, %r8d
	add		$32, %edx		# restore edx to (len-1) + ecx
	mov		%r8d, %eax		# keep the unmodified NUL mask
	shl		$16, %r8d		# place 2nd chunk NUL mask into bits 16--31
	bts		%rdx, %r8		# treat end of buffer as end of string
	tzcnt		%r8, %rdx		# find string/buffer len from alignment boundary
	sub		%ecx, %edx		# find actual string/buffer len
	movb		$0, (%rdi, %rdx, 1)	# write NUL terminator

	/* continue to find the end of the string */
	test		%eax, %eax		# end of string already reached?
	jnz		1f

	ALIGN_TEXT
0:	pcmpeqb		32(%rsi), %xmm1
	pmovmskb	%xmm1, %eax
	pxor		%xmm1, %xmm1
	test		%eax, %eax
	jnz		2f

	pcmpeqb		48(%rsi), %xmm1
	pmovmskb	%xmm1, %eax
	add		$32, %rsi
	pxor		%xmm1, %xmm1
	test		%eax, %eax
	jz		0b

1:	sub		$16, %rsi		# undo second advancement
2:	tzcnt		%eax, %eax		# where is the NUL byte?
	sub		%r9, %rsi
	lea		32(%rsi, %rax, 1), %rax	# return string length
	jmp		.L0031			# RDX: bytes to copy, RAX: return value

.Lsecond_nul:
	add		%r8, %rdx		# restore buffer length
	tzcnt		%eax, %eax		# where is the NUL byte?
	lea		-16(%rcx), %r8d
	sub		%r8d, %eax		# string length
	cmp		%rax, %rdx		# is the string shorter than the buffer?
	cmova		%rax, %rdx		# copy only min(buflen, srclen) bytes
	movb		$0, (%rdi, %rdx, 1)	# write NUL terminator
.L0031:	cmp		$16, %rdx		# at least 16 bytes to copy (not incl NUL)?
	jb		.L0015

	/* copy 16--32 bytes with two possibly overlapping 16 byte moves */
	movdqu		(%r9), %xmm0		# load first 16 bytes
	movdqu		-16(%r9, %rdx, 1), %xmm1 # load last 16 bytes
	movdqu		%xmm0, (%rdi)
	movdqu		%xmm1, -16(%rdi, %rdx, 1)
	ret

.Lhead_nul:
	tzcnt		%eax, %eax		# where is the NUL byte?
	sub		%ecx, %eax		# ... from the beginning of the string?
	cmp		%rax, %rdx		# is the string shorter than the buffer?
	cmova		%rax, %rdx		# copy only min(buflen, srclen) bytes
	movb		$0, (%rdi, %rdx, 1)	# write NUL terminator

	/*
	 * process strings of 0--15 bytes (rdx: min(buflen, srclen), rax: srclen)
	 *
	 * Each size class copies the first and the last N bytes of the
	 * range; the two moves overlap unless the length is exactly 2N.
	 */
.L0015:	cmp		$8, %rdx		# at least 8 bytes to copy?
	jae		.L0815

	cmp		$4, %rdx		# at least 4 bytes to copy?
	jae		.L0407

	cmp		$2, %rdx		# at least 2 bytes to copy?
	jae		.L0203

	/* 0 or 1 bytes: the byte store may hit the terminator slot, so redo it */
	movzbl		(%r9), %ecx		# load first byte from src
	mov		%cl, (%rdi)		# deposit into destination
	movb		$0, (%rdi, %rdx, 1)	# add NUL terminator (again)
	ret

.L0203:	movzwl		(%r9), %ecx
	movzwl		-2(%r9, %rdx, 1), %esi
	mov		%cx, (%rdi)
	mov		%si, -2(%rdi, %rdx, 1)
	ret

.L0407:	mov		(%r9), %ecx
	mov		-4(%r9, %rdx, 1), %esi
	mov		%ecx, (%rdi)
	mov		%esi, -4(%rdi, %rdx, 1)
	ret

.L0815:	mov		(%r9), %rcx
	mov		-8(%r9, %rdx, 1), %rsi
	mov		%rcx, (%rdi)
	mov		%rsi, -8(%rdi, %rdx, 1)
	ret

	/* length zero destination: just return the string length */
.L0:	mov		%rsi, %rdi
	jmp		CNAME(strlen)		# tail call, strlen returns to our caller
ARCHEND(__strlcpy, baseline)

281	.section .note.GNU-stack,"",%progbits
282