xref: /freebsd/lib/libc/amd64/string/strrchr.S (revision 1edb7116)
/*-
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4,0x90	# 16-byte alignment, nop-filled

	.weak	rindex
	.set	rindex, strrchr

ARCHFUNCS(strrchr)
	ARCHFUNC(strrchr, scalar)
	ARCHFUNC(strrchr, baseline)
ENDARCHFUNCS(strrchr)

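/*
 * The two implementations below are selected through the ARCHFUNCS
 * dispatch above: the scalar one runs on any amd64 CPU, the baseline
 * one uses SSE2, which is part of the amd64 baseline ISA.
 *
 * The scalar code scans the string one aligned 8-byte word at a time.
 * For a word x, (x - 0x01..01) & ~x & 0x80..80 sets bit 7 of every
 * byte of x that is zero; spurious flags can only appear above the
 * lowest zero byte.  Applied to str this finds the terminator,
 * applied to str ^ c it finds occurrences of c.  The str ^ c word is
 * byte-swapped first, so the flag for the last occurrence within a
 * word is always exact and any spurious flags fall on lower addresses,
 * which the final bsr ignores.  The address and flag mask of the last
 * word in which c was seen are kept in %r10/%r11 until the terminator
 * is found.
 */
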
ARCHENTRY(strrchr, scalar)
	mov	%edi, %ecx
	and	$~7, %rdi		# align to 8 bytes
	movzbl	%sil, %esi		# clear stray high bits
	movabs	$0x0101010101010101, %r8
	mov	(%rdi), %rax		# load first word
	imul	%r8, %rsi		# replicate char 8 times

	/*
	 * Unaligned input: align to 8 bytes.  Then proceed the same
	 * way as with aligned input, but prevent matches before the
	 * beginning of the string.  This is achieved by ORing 0x01
	 * into each byte of the buffer before the string.
	 */
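	/*
	 * For example, if the string begins at byte 3 of its aligned
	 * word, %cl becomes 24, so %r10 = 0x01..01 << 24 has 0x01 in
	 * the five bytes belonging to the string; XORing with 0x01..01
	 * leaves 0x01 in the three bytes before it.  ORing that value
	 * into str and str ^ c makes those bytes nonzero, so neither
	 * the NUL detector nor the match detector can fire ahead of
	 * the string.
	 */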
	shl	$3, %ecx
	mov	%r8, %r10
	shl	%cl, %r10		# 0x01 where the string is
	xor	%r8, %r10		# 0x01 where it is not
	neg	%r8			# negate 01..01 so we can use lea
	movabs	$0x8080808080808080, %r9

	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	or	%r10, %rax		# ensure str != 0 before string
	or	%r10, %rcx		# ensure str^c != 0 before string
	bswap	%rcx			# in reverse order, to find last match
	mov	%rdi, %r10		# location of initial match (if any)
	xor	%r11, %r11		# initial match mask (none)
	add	$8, %rdi		# advance to next iteration
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	not	%rax			# ~str
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r9, %rax		# not including junk bits
	jnz	1f			# end of string?

	lea	(%rcx, %r8, 1), %rdx	# (str ^ c) - 0x01..01
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	and	%r9, %rcx		# not including junk bits
	mov	%rcx, %r11		# remember match mask from head
	jmp	0f

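	/*
	 * Loop invariants: %rdi points to the next word to examine,
	 * %r10/%r11 hold the address and byte-reversed match mask of
	 * the last word in which c was seen (%r11 == 0 if none), and
	 * %r8/%r9/%rsi keep -0x01..01, 0x80..80, and the replicated c.
	 * Entry at 3 finishes the match check for the word that the
	 * previous iteration already verified to be free of NUL bytes.
	 */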
	/* main loop unrolled twice */
	ALIGN_TEXT
3:	lea	(%rcx, %r8, 1), %rdx	# (str ^ c) - 0x01..01
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	and	%r9, %rcx		# not including junk bits
	lea	-8(%rdi), %rdx
	cmovnz	%rdx, %r10		# remember location of current match
	cmovnz	%rcx, %r11

0:	mov	(%rdi), %rax		# str
	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	bswap	%rcx			# in reverse order, to find last match
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	not	%rax			# ~str
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r9, %rax		# not including junk bits
	jnz	2f			# end of string?

	lea	(%rcx, %r8, 1), %rdx	# (str ^ c) - 0x01..01
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	and	%r9, %rcx		# not including junk bits
	cmovnz	%rdi, %r10		# remember location of current match
	cmovnz	%rcx, %r11

	mov	8(%rdi), %rax		# str
	add	$16, %rdi
	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	bswap	%rcx
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	not	%rax			# ~str
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r9, %rax		# not including junk bits
	jz	3b			# end of string?

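	/*
	 * Once the NUL detector fires, %rax has 0x80 set at the
	 * terminator (its lowest set flag) and possibly spurious flags
	 * above it.  %rax ^ (%rax - 1) covers every byte up to and
	 * including the terminator; byte-swapping it puts it in the
	 * same order as the match flags in %rcx, so occurrences of c
	 * past the end of the string are discarded.  If any flag
	 * survives, this word supersedes the one remembered in
	 * %r10/%r11.  The winning mask is then swapped back and bsr
	 * yields the byte offset of the last occurrence; %r11 == 0
	 * means c never occurred and NULL is returned.
	 */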
	/* NUL found */
1:	sub	$8, %rdi		# undo advance past buffer
2:	lea	(%rcx, %r8, 1), %rdx	# (str ^ c) - 0x01..01
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	and	%r9, %rcx		# not including junk bits
	lea	-1(%rax), %rdx
	xor	%rdx, %rax		# mask of bytes in the string
	bswap	%rax			# in reverse order
	and	%rax, %rcx		# c found in the tail?
	cmovnz	%rdi, %r10
	cmovnz	%rcx, %r11
	bswap	%r11			# unreverse byte order
	bsr	%r11, %rcx		# last location of c in (R10)
	shr	$3, %rcx		# as byte offset
	lea	(%r10, %rcx, 1), %rax	# pointer to match
	test	%r11, %r11		# was there actually a match?
	cmovz	%r11, %rax		# if not, return null pointer
	ret
ARCHEND(strrchr, scalar)

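/*
 * SSE2 (baseline) version: the string is scanned in aligned 16-byte
 * chunks.  pcmpeqb against a zeroed register flags NUL bytes and
 * pcmpeqb against c (broadcast to all 16 lanes by the punpck/pshufd
 * sequence) flags occurrences of c; pmovmskb turns each result into a
 * 16-bit mask with bit i corresponding to byte i.  %r8 and %esi carry
 * the address and mask of the last chunk in which c was seen (%esi
 * starts out as 1 so that, with %r8 == 0, a string without any match
 * ends up returning NULL).  At the end, %eax ^ (%eax - 1) keeps only
 * the bytes up to and including the terminator and bsr gives the byte
 * index of the final match within the chunk at %r8.
 */
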
ARCHENTRY(strrchr, baseline)
	mov		%edi, %ecx
	and		$~0xf, %rdi		# align to 16 bytes
	movdqa		(%rdi), %xmm1
	movd		%esi, %xmm0
	and		$0xf, %ecx		# offset from alignment
	pxor		%xmm2, %xmm2
	mov		$-1, %edx
	punpcklbw	%xmm0, %xmm0		# c -> cc
	shl		%cl, %edx		# bits corresponding to bytes in the string
	punpcklwd	%xmm0, %xmm0		# cc -> cccc
	xor		%r8, %r8		# address of latest match
	mov		$1, %esi		# bit mask of latest match
	mov		%rdi, %r9		# candidate location for next match
	add		$16, %rdi		# advance to next chunk
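	/*
	 * %edx is a bit mask with one bit per byte of the head chunk,
	 * set for bytes at or after the start of the string: e.g. a
	 * string starting at offset 5 within its chunk gives
	 * %edx = 0xffffffe0, so flags for the five bytes before the
	 * string are discarded in the head check below.
	 */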

	/* check for match in head */
	pcmpeqb		%xmm1, %xmm2		# NUL byte present?
	pshufd		$0, %xmm0, %xmm0	# cccc -> cccccccccccccccc
	pcmpeqb		%xmm0, %xmm1		# c present?
	pmovmskb	%xmm2, %eax
	pmovmskb	%xmm1, %ecx
	and		%edx, %ecx		# c present in the string?
	and		%edx, %eax		# NUL present in the string?
	jnz		.Lend2

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	movdqa		(%rdi), %xmm1
	test		%ecx, %ecx		# was there a match in the last iter.?
	cmovnz		%r9, %r8		# remember match if any
	cmovnz		%ecx, %esi
	pxor		%xmm2, %xmm2
	pcmpeqb		%xmm1, %xmm2		# NUL byte present?
	pcmpeqb		%xmm0, %xmm1		# c present?
	pmovmskb	%xmm2, %eax
	pmovmskb	%xmm1, %ecx
	test		%eax, %eax		# end of string in first half?
	jnz		.Lend

	movdqa		16(%rdi), %xmm1
	test		%ecx, %ecx		# was there a match in the last iter.?
	cmovnz		%rdi, %r8		# remember match if any
	cmovnz		%ecx, %esi
	pxor		%xmm2, %xmm2
	pcmpeqb		%xmm1, %xmm2		# NUL byte present?
	pcmpeqb		%xmm0, %xmm1		# c present?
	pmovmskb	%xmm2, %eax
	pmovmskb	%xmm1, %ecx
	lea		16(%rdi), %r9
	add		$32, %rdi
	test		%eax, %eax		# end of string in second half?
	jz		0b

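	/*
	 * .Lend2 is reached from the head check and from the second
	 * loop half, where %rdi has already been advanced 16 bytes
	 * past the chunk holding the NUL; .Lend is reached from the
	 * first loop half with %rdi still pointing at that chunk.
	 */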
	ALIGN_TEXT
.Lend2:	sub		$16, %rdi
.Lend:	lea		-1(%rax), %edx
	xor		%edx, %eax		# mask of bytes in the string
	and		%eax, %ecx		# c found in the tail?
	cmovnz		%rdi, %r8
	cmovnz		%ecx, %esi
	bsr		%esi, %esi		# last location of c in (R8)
	lea		(%r8, %rsi, 1), %rax	# pointer to match
	ret
ARCHEND(strrchr, baseline)
	.section .note.GNU-stack,"",%progbits