xref: /freebsd/lib/libc/amd64/string/strchrnul.S (revision 4b9d6057)
1/*-
2 * Copyright (c) 2023 The FreeBSD Foundation
3 *
4 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
5 * under sponsorship from the FreeBSD Foundation.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE
27 */
28
29#include <machine/asm.h>
30
31#include "amd64_archlevel.h"
32
/* 16-byte alignment, nop-filled; placed in front of the main loops */
#define ALIGN_TEXT	.p2align 4,0x90

/* strchrnul is exported as a weak alias for __strchrnul */
	.weak	strchrnul
	.set	strchrnul, __strchrnul

/*
 * Variants of __strchrnul implemented in this file.  How one of them
 * is picked is defined by the ARCHFUNCS macros from amd64_archlevel.h,
 * which are not part of this file.
 */
ARCHFUNCS(__strchrnul)
	ARCHFUNC(__strchrnul, scalar)
	ARCHFUNC(__strchrnul, baseline)
ENDARCHFUNCS(__strchrnul)
42
/*
 * strchrnul(str, c)
 * This is implemented like strlen(str), but we check for the
 * presence of both NUL and c in each iteration.
 */
/*
 * __strchrnul, scalar variant.
 *
 * In:	%rdi = str, %sil = c (all other bits of %rsi are ignored)
 * Out:	%rax = address of the first byte at or after str that is
 *	equal to c or is NUL
 * Uses: %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r10, %r11, flags
 *
 * The string is examined one aligned 8-byte word at a time.  For a
 * word x, (x - 0x01..01) & ~x & 0x80..80 is nonzero iff x holds a
 * zero byte, and the lowest set bit of the result is bit 7 of the
 * lowest zero byte.  Bytes above that one may be flagged spuriously,
 * which is harmless as only the lowest set bit is used.  The test is
 * applied to both x (finds NUL) and x ^ cc..cc (finds c).
 *
 * As the first word is loaded from str rounded down to a multiple
 * of 8, it can begin with up to 7 bytes that precede the string.
 */
ARCHENTRY(__strchrnul, scalar)
	mov	%edi, %ecx		# low bits of str give the misalignment
	and	$~7, %rdi		# align to 8 byte
	movzbl	%sil, %esi		# clear stray high bits
	movabs	$0x0101010101010101, %r8
	mov	(%rdi), %rax		# load first word
	imul	%r8, %rsi		# replicate char 8 times

	/*
	 * Unaligned input: align to 8 bytes.  Then proceed the same
	 * way as with aligned input, but prevent matches before the
	 * beginning of the string.  This is achieved by oring 0x01
	 * into each byte of the buffer before the string.  Such a
	 * byte is then nonzero in both str and str ^ c, so it is
	 * never flagged and never borrows from the byte above it.
	 */
	shl	$3, %ecx		# byte offset -> bit offset (shl uses count mod 64)
	mov	%r8, %r10
	add	$8, %rdi		# address of the word after the head
	shl	%cl, %r10		# 0x01 where the string is
	xor	%r8, %r10		# 0x01 where it is not
	neg	%r8			# negate 01..01 so we can use lea
	movabs	$0x8080808080808080, %r9

	/*
	 * From here on:
	 *   %rsi  c replicated into all 8 bytes
	 *   %r8   -0x0101010101010101
	 *   %r9   0x8080808080808080
	 *   %rdi  address of the next word to load
	 */
	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	or	%r10, %rax		# str without NUL bytes before it
	or	%r10, %rcx		# (str ^ c) without matches before it
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# keep bit 7 of each byte, drop junk bits
	jnz	1f			# match in head; %rdi is one word ahead

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	mov	(%rdi), %rax		# str
	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# keep bit 7 of each byte, drop junk bits
	jnz	2f			# match; %rdi still addresses this word

	mov	8(%rdi), %rax		# str
	add	$16, %rdi		# %rdi is now one word past the loaded one
	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# keep bit 7 of each byte, drop junk bits
	jz	0b

	/*
	 * NUL or c found.  %rax is nonzero on every path that gets
	 * here, so tzcnt gives the same result when it is executed
	 * as bsf on a CPU without BMI1.
	 */
1:	sub	$8, %rdi		# undo advance past buffer
2:	tzcnt	%rax, %rax		# first NUL or c byte match
	shr	$3, %eax		# scale from bit to byte index
	add	%rdi, %rax		# pointer to found c or NUL
	ret
ARCHEND(__strchrnul, scalar)
120
/*
 * __strchrnul, baseline (SSE2) variant.
 *
 * In:	%rdi = str, %esi = c (only the low byte is used)
 * Out:	%rax = address of the first byte at or after str that is
 *	equal to c or is NUL
 * Uses: %rcx, %rdx, %rdi, %xmm0, %xmm1, %xmm2, flags
 *
 * The string is examined in aligned 16-byte chunks.  Each chunk is
 * compared bytewise with zero and with c replicated into all 16
 * bytes; pmovmskb turns the combined result into a bit mask in which
 * bit i is set iff byte i of the chunk is NUL or c.
 *
 * As the first chunk is loaded from str rounded down to a multiple
 * of 16, it can begin with up to 15 bytes that precede the string.
 * Their mask bits are cleared before the mask is tested.
 */
ARCHENTRY(__strchrnul, baseline)
	mov		%edi, %ecx
	and		$~0xf, %rdi		# align to 16 byte
	movdqa		(%rdi), %xmm1
	movd		%esi, %xmm0
	and		$0xf, %ecx		# distance from (%rdi) to start of string
	pxor		%xmm2, %xmm2
	mov		$-1, %edx
	punpcklbw	%xmm0, %xmm0		# c -> cc
	shl		%cl, %edx		# bits corresponding to bytes in the string
	punpcklwd	%xmm0, %xmm0		# cc -> cccc
	add		$16, %rdi		# address of the chunk after the head

	/* check for match in head */
	pcmpeqb		%xmm1, %xmm2		# NUL bytes present?
	pshufd		$0, %xmm0, %xmm0	# cccc -> cccccccccccccccc
	pcmpeqb		%xmm0, %xmm1		# c present?
	por		%xmm2, %xmm1		# either present?
	pmovmskb	%xmm1, %eax
	and		%edx, %eax		# match in the string?
	jnz		1f			# yes; %rdi is one chunk ahead

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	movdqa		(%rdi), %xmm1
	pxor		%xmm2, %xmm2
	pcmpeqb		%xmm1, %xmm2		# NUL bytes present?
	pcmpeqb		%xmm0, %xmm1		# c present?
	por		%xmm2, %xmm1		# either present?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax		# match in the string?
	jnz		2f			# yes; %rdi still addresses this chunk

	movdqa		16(%rdi), %xmm1
	add		$32, %rdi		# %rdi is now one chunk past the loaded one
	pxor		%xmm2, %xmm2
	pcmpeqb		%xmm1, %xmm2		# NUL bytes present?
	pcmpeqb		%xmm0, %xmm1		# c present?
	por		%xmm2, %xmm1		# either present?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax		# match in the string?
	jz		0b

	/*
	 * NUL or c found.  %eax is nonzero on every path that gets
	 * here, so tzcnt gives the same result when it is executed
	 * as bsf on a CPU without BMI1.
	 */
1:	sub		$16, %rdi		# undo advance past buffer
2:	tzcnt		%eax, %eax		# where is the match?
	add		%rdi, %rax		# pointer to found c or NUL
	ret
ARCHEND(__strchrnul, baseline)
169
/*
 * Empty .note.GNU-stack section.  By GNU toolchain convention its
 * presence without the "x" flag marks this object as not needing an
 * executable stack.
 */
	.section .note.GNU-stack,"",%progbits
171