/*-
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>
#include <machine/param.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4,0x90 /* 16-byte alignment, nop filled */

ARCHFUNCS(strspn)
	ARCHFUNC(strspn, scalar)
	NOARCHFUNC
	ARCHFUNC(strspn, x86_64_v2)
ENDARCHFUNCS(strspn)

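/*
 * A rough C sketch of the algorithm the scalar kernel below
 * implements (a sketch only, not the exact code): a 256 byte table
 * on the stack marks the bytes present in the set, and the string
 * is then scanned for the first byte not marked in it.  The kernel
 * additionally special-cases empty and single-character sets.
 *
 *	size_t
 *	strspn(const char *s, const char *set)
 *	{
 *		char table[256] = { 0 };
 *		size_t i;
 *
 *		while (*set != '\0')
 *			table[(unsigned char)*set++] = 1;
 *
 *		for (i = 0; table[(unsigned char)s[i]] != 0; i++)
 *			;
 *
 *		return (i);
 *	}
 */
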
ARCHENTRY(strspn, scalar)
	push	%rbp			# align stack to enable function call
	mov	%rsp, %rbp
	sub	$256, %rsp		# allocate space for lookup table

	/* check for special cases */
	movzbl	(%rsi), %edx		# first character in the set
	test	%edx, %edx
	jz	.Lzero			# empty set always returns 0

	movzbl	1(%rsi), %eax		# second character in the set
	test	%eax, %eax
	jz	.Lsingle

	/* no special case matches -- prepare lookup table */
	xor	%r8d, %r8d
	mov	$28, %ecx
0:	mov	%r8, (%rsp, %rcx, 8)
	mov	%r8, 8(%rsp, %rcx, 8)
	mov	%r8, 16(%rsp, %rcx, 8)
	mov	%r8, 24(%rsp, %rcx, 8)
	sub	$4, %ecx
	jnc	0b

	movb	$1, (%rsp, %rdx, 1)	# register first char in set
	add	$2, %rsi

	/* process remaining chars in set */
	ALIGN_TEXT
0:	movb	$1, (%rsp, %rax, 1)	# register previous char
	movzbl	(%rsi), %eax		# next char in set
	test	%eax, %eax		# end of string?
	jz	1f

	movb	$1, (%rsp, %rax, 1)
	add	$2, %rsi
	movzbl	-1(%rsi), %eax
	test	%eax, %eax
	jnz	0b

1:	mov	%rdi, %rax		# a copy of the source to iterate over

	/* find mismatch */
	ALIGN_TEXT
0:	movzbl	(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	2f

	movzbl	1(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	3f

	movzbl	2(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	4f

	movzbl	3(%rax), %ecx
	add	$4, %rax
	cmpb	$0, (%rsp, %rcx, 1)
	jne	0b

	sub	$3, %rax
4:	dec	%rdi
3:	inc	%rax
2:	sub	%rdi, %rax		# number of characters preceding the mismatch
	leave
	ret

	/* empty set never matches */
.Lzero:	xor	%eax, %eax
	leave
	ret

	/* find repeated single character; %rax is zero on entry */
	ALIGN_TEXT
.Lsingle:
	cmpb	%dl, (%rdi, %rax, 1)
	jne	1f

	cmpb	%dl, 1(%rdi, %rax, 1)
	jne	2f

	cmpb	%dl, 2(%rdi, %rax, 1)
	jne	3f

	cmpb	%dl, 3(%rdi, %rax, 1)
	lea	4(%rax), %rax
	je	.Lsingle

	sub	$3, %rax
3:	inc	%rax
2:	inc	%rax
1:	leave
	ret
ARCHEND(strspn, scalar)

	/*
	 * This kernel uses pcmpistri to do the heavy lifting.
	 * We provide three code paths, depending on set size:
	 *
	 *  0--16: one pcmpistri per 16 bytes of input
	 * 17--32: two pcmpistrm per 16 bytes of input
	 *   >=33: fall back to a lookup table
	 */
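	/*
	 * A rough C model of what pcmpistri $0x10 computes for our
	 * operands (unsigned bytes, "equal any" aggregation, negative
	 * polarity, least significant index): the index of the first
	 * chunk byte that is a NUL or does not occur in the set is
	 * returned in %ecx, with CF set iff that index is below 16.
	 *
	 *	int
	 *	pcmpistri_0x10(const uint8_t set[16], const uint8_t chunk[16])
	 *	{
	 *		int i, j, in_set;
	 *
	 *		for (i = 0; i < 16; i++) {
	 *			if (chunk[i] == '\0')
	 *				return (i);
	 *			in_set = 0;
	 *			for (j = 0; j < 16 && set[j] != '\0'; j++)
	 *				in_set |= set[j] == chunk[i];
	 *			if (!in_set)
	 *				return (i);
	 *		}
	 *		return (16);
	 *	}
	 */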
ARCHENTRY(strspn, x86_64_v2)
	push		%rbp
	mov		%rsp, %rbp
	sub		$256, %rsp

	/* find set size and copy up to 32 bytes to (%rsp) */
	mov		%esi, %ecx
	and		$~0xf, %rsi		# align set pointer
	movdqa		(%rsi), %xmm0
	pxor		%xmm1, %xmm1
	and		$0xf, %ecx		# number of bytes rsi is past alignment
	xor		%edx, %edx
	pcmpeqb		%xmm0, %xmm1		# end of string reached?
	movdqa		%xmm0, 32(%rsp)		# transfer head of set to stack
	pmovmskb	%xmm1, %eax
	shr		%cl, %eax		# clear out junk before string
	test		%eax, %eax		# end of set reached?
	jnz		0f

	movdqa		16(%rsi), %xmm0		# second chunk of the set
	mov		$16, %edx
	sub		%ecx, %edx		# length of set preceding xmm0
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1
	movdqa		%xmm0, 48(%rsp)
	movdqu		32(%rsp, %rcx, 1), %xmm2 # head of set
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		1f

	movdqa		32(%rsi), %xmm0		# third chunk
	add		$16, %edx
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1
	movdqa		%xmm0, 64(%rsp)
	pmovmskb	%xmm1, %eax
	test		%eax, %eax		# still not done?
	jz		.Lgt32v2

0:	movdqu		32(%rsp, %rcx, 1), %xmm2 # head of set
1:	tzcnt		%eax, %eax
	add		%eax, %edx		# length of set (excluding NUL byte)
	cmp		$32, %edx		# above 32 bytes?
	ja		.Lgt32v2

	/*
	 * At this point we know that we want to use pcmpistri.
	 * One last problem remains: the head of the string is not
	 * aligned and may cross a page boundary.  If this is the
	 * case, we take the part before the page boundary and repeat
	 * the last byte to fill up the xmm register.  This is safe:
	 * if the last byte on the page ends the match, the mismatch
	 * is found among the real bytes; if it is in the set, so are
	 * its copies, and the main loop resumes exactly at the page
	 * boundary.
	 */
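	/*
	 * The page-crossing test below amounts to the following
	 * sketch, PAGE_SIZE being a power of two: the first and last
	 * byte of the head lie on different pages iff bit PAGE_SIZE
	 * of their addresses differs, as adding 15 advances the page
	 * number by at most one.
	 *
	 *	int
	 *	head_crosses_page(const char *p)
	 *	{
	 *		uintptr_t a = (uintptr_t)p;
	 *
	 *		return (((a ^ (a + 15)) & PAGE_SIZE) != 0);
	 *	}
	 */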
	mov		%rdi, %rax		# save original string pointer
	lea		15(%rdi), %esi		# last byte of the head
	xor		%edi, %esi
	test		$PAGE_SIZE, %esi	# does the head cross a page?
	jz		0f

	/* head crosses page: copy to stack to fix up */
	and		$~0xf, %rax		# align head pointer temporarily
	movzbl		15(%rax), %esi		# last head byte on the page
	movdqa		(%rax), %xmm0
	movabs		$0x0101010101010101, %r8
	imul		%r8, %rsi		# repeated 8 times
	movdqa		%xmm0, (%rsp)		# head chunk on stack
	mov		%rsi, 16(%rsp)		# followed by filler (last byte x8)
	mov		%rsi, 24(%rsp)
	mov		%edi, %eax
	and		$0xf, %eax		# offset of head from alignment
	add		%rsp, %rax		# pointer to fake head

0:	movdqu		(%rax), %xmm1		# load head (fake or real)
	lea		16(%rdi), %rax
	and		$~0xf, %rax		# second 16 bytes of string (aligned)
1:	cmp		$16, %edx		# set larger than 16 bytes?
	ja		.Lgt16v2

	/* set is 2--16 bytes in size */

	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT|_SIDD_NEGATIVE_POLARITY */
	pcmpistri	$0x10, %xmm1, %xmm2	# mismatch in head?
	jc		.Lheadmismatchv2

	ALIGN_TEXT
0:	pcmpistri	$0x10, (%rax), %xmm2
	jc		1f			# mismatch or end of string?
	pcmpistri	$0x10, 16(%rax), %xmm2
	lea		32(%rax), %rax
	jnc		0b			# mismatch or end of string?

	sub		$16, %rax		# go back to second half
1:	sub		%rdi, %rax		# offset of (%rax) from beginning of string
	add		%rcx, %rax		# prefix length before mismatch/NUL
	leave
	ret

.Lheadmismatchv2:
	mov		%ecx, %eax		# prefix length before mismatch/NUL
	leave
	ret

	/* set is 17--32 bytes in size */
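	/*
	 * Schematically, each 16 byte chunk is matched against both
	 * halves of the set: pcmpistrm yields a bit mask of the
	 * chunk bytes not found in the given half (a NUL also counts
	 * as not found), and a byte lies outside the whole set iff
	 * both masks flag it, so the two masks are ANDed:
	 *
	 *	mask = not_in_half1 & not_in_half2;
	 *	if (mask != 0)
	 *		return (chunk_offset + tzcnt(mask));
	 */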
.Lgt16v2:
	movdqu		48(%rsp, %rcx, 1), %xmm3 # second part of set

	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_BIT_MASK|_SIDD_NEGATIVE_POLARITY */
	pcmpistrm	$0x10, %xmm1, %xmm2	# any mismatch in first half?
	movdqa		%xmm0, %xmm4
	pcmpistrm	$0x10, %xmm1, %xmm3	# any mismatch in the second half?
	ptest		%xmm0, %xmm4		# any entry that doesn't match either?
	jnz		2f

	ALIGN_TEXT
0:	movdqa		(%rax), %xmm1
	pcmpistrm	$0x10, %xmm1, %xmm2
	movdqa		%xmm0, %xmm4
	pcmpistrm	$0x10, %xmm1, %xmm3
	ptest		%xmm0, %xmm4
	jnz		1f
	movdqa		16(%rax), %xmm1
	add		$32, %rax
	pcmpistrm	$0x10, %xmm1, %xmm2
	movdqa		%xmm0, %xmm4
	pcmpistrm	$0x10, %xmm1, %xmm3
	ptest		%xmm0, %xmm4
	jz		0b

	sub		$16, %rax
1:	pand		%xmm4, %xmm0
	movd		%xmm0, %ecx
	sub		%rdi, %rax		# offset of %xmm1 from beginning of string
	tzcnt		%ecx, %ecx
	add		%rcx, %rax		# prefix length before mismatch/NUL
	leave
	ret

	/* mismatch or string end in head */
2:	pand		%xmm4, %xmm0		# bit mask of mismatches (end of string counts)
	movd		%xmm0, %eax
	tzcnt		%eax, %eax		# prefix length before mismatch/NUL
	leave
	ret

	/* set is >=33 bytes in size */
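	/*
	 * Larger sets are handled with the same table driven approach
	 * as the scalar kernel: the table is cleared 64 bytes at a
	 * time with SSE stores, filled from the set four bytes per
	 * iteration, and the string is then scanned as before.
	 */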
.Lgt32v2:
	xorps	%xmm0, %xmm0
	mov	$256-64, %edx

	/* clear out lookup table */
0:	movaps	%xmm0, (%rsp, %rdx, 1)
	movaps	%xmm0, 16(%rsp, %rdx, 1)
	movaps	%xmm0, 32(%rsp, %rdx, 1)
	movaps	%xmm0, 48(%rsp, %rdx, 1)
	sub	$64, %edx
	jnc	0b

	add	%rcx, %rsi		# restore set pointer
	mov	%rdi, %rax		# keep a copy of the string

	/* initialise lookup table */
	movzbl	(%rsi), %ecx		# set is known not to be empty

	ALIGN_TEXT
0:	movb	$1, (%rsp, %rcx, 1)
	movzbl	1(%rsi), %ecx
	test	%ecx, %ecx
	jz	1f

	movb	$1, (%rsp, %rcx, 1)
	movzbl	2(%rsi), %ecx
	test	%ecx, %ecx
	jz	1f

	movb	$1, (%rsp, %rcx, 1)
	movzbl	3(%rsi), %ecx
	add	$4, %rsi
	test	%ecx, %ecx
	jz	1f

	movb	$1, (%rsp, %rcx, 1)
	movzbl	(%rsi), %ecx
	test	%ecx, %ecx
	jnz	0b

	/* find mismatch */
	ALIGN_TEXT
1:	movzbl	(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	2f

	movzbl	1(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	3f

	movzbl	2(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	4f

	movzbl	3(%rax), %ecx
	add	$4, %rax
	cmpb	$0, (%rsp, %rcx, 1)
	jne	1b

	sub	$3, %rax
4:	dec	%rdi
3:	inc	%rax
2:	sub	%rdi, %rax		# number of characters preceding the mismatch
	leave
	ret
ARCHEND(strspn, x86_64_v2)

	.section .note.GNU-stack,"",%progbits