1/*
2 * memchr - find a character in a memory zone
3 *
4 * Copyright (c) 2014, ARM Limited
5 * All rights Reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *     * Redistributions of source code must retain the above copyright
10 *       notice, this list of conditions and the following disclaimer.
11 *     * Redistributions in binary form must reproduce the above copyright
12 *       notice, this list of conditions and the following disclaimer in the
13 *       documentation and/or other materials provided with the distribution.
14 *     * Neither the name of the company nor the names of its contributors
15 *       may be used to endorse or promote products derived from this
16 *       software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31/* Assumptions:
32 *
33 * ARMv8-a, AArch64
34 * Neon Available.
35 */
36
37/* Arguments and results.  */
38#define srcin		x0
39#define chrin		w1
40#define cntin		x2
41
42#define result		x0
43
44#define src		x3
45#define	tmp		x4
46#define wtmp2		w5
47#define synd		x6
48#define soff		x9
49#define cntrem		x10
50
51#define vrepchr		v0
52#define vdata1		v1
53#define vdata2		v2
54#define vhas_chr1	v3
55#define vhas_chr2	v4
56#define vrepmask	v5
57#define vend		v6
58
59/*
60 * Core algorithm:
61 *
62 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
63 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
64 * requested character and bit 1 is not used (faster than using a 32bit
65 * syndrome). Since the bits in the syndrome reflect exactly the order in which
66 * things occur in the original string, counting trailing zeros allows to
67 * identify exactly which byte has matched.
68 */
69
70	.macro def_fn f p2align=0
71	.text
72	.p2align \p2align
73	.global \f
74	.type \f, %function
75\f:
76	.endm
77
78def_fn memchr
79	/* Do not dereference srcin if no bytes to compare.  */
80	cbz	cntin, .Lzero_length
81	/*
82	 * Magic constant 0x40100401 allows us to identify which lane matches
83	 * the requested byte.
84	 */
85	mov	wtmp2, #0x0401
86	movk	wtmp2, #0x4010, lsl #16
87	dup	vrepchr.16b, chrin
88	/* Work with aligned 32-byte chunks */
89	bic	src, srcin, #31
90	dup	vrepmask.4s, wtmp2
91	ands	soff, srcin, #31
92	and	cntrem, cntin, #31
93	b.eq	.Lloop
94
95	/*
96	 * Input string is not 32-byte aligned. We calculate the syndrome
97	 * value for the aligned 32 bytes block containing the first bytes
98	 * and mask the irrelevant part.
99	 */
100
101	ld1	{vdata1.16b, vdata2.16b}, [src], #32
102	sub	tmp, soff, #32
103	adds	cntin, cntin, tmp
104	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
105	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
106	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
107	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
108	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
109	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
110	mov	synd, vend.d[0]
111	/* Clear the soff*2 lower bits */
112	lsl	tmp, soff, #1
113	lsr	synd, synd, tmp
114	lsl	synd, synd, tmp
115	/* The first block can also be the last */
116	b.ls	.Lmasklast
117	/* Have we found something already? */
118	cbnz	synd, .Ltail
119
120.Lloop:
121	ld1	{vdata1.16b, vdata2.16b}, [src], #32
122	subs	cntin, cntin, #32
123	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
124	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
125	/* If we're out of data we finish regardless of the result */
126	b.ls	.Lend
127	/* Use a fast check for the termination condition */
128	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
129	addp	vend.2d, vend.2d, vend.2d
130	mov	synd, vend.d[0]
131	/* We're not out of data, loop if we haven't found the character */
132	cbz	synd, .Lloop
133
134.Lend:
135	/* Termination condition found, let's calculate the syndrome value */
136	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
137	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
138	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
139	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
140	mov	synd, vend.d[0]
141	/* Only do the clear for the last possible block */
142	b.hi	.Ltail
143
144.Lmasklast:
145	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
146	add	tmp, cntrem, soff
147	and	tmp, tmp, #31
148	sub	tmp, tmp, #32
149	neg	tmp, tmp, lsl #1
150	lsl	synd, synd, tmp
151	lsr	synd, synd, tmp
152
153.Ltail:
154	/* Count the trailing zeros using bit reversing */
155	rbit	synd, synd
156	/* Compensate the last post-increment */
157	sub	src, src, #32
158	/* Check that we have found a character */
159	cmp	synd, #0
160	/* And count the leading zeros */
161	clz	synd, synd
162	/* Compute the potential result */
163	add	result, src, synd, lsr #1
164	/* Select result or NULL */
165	csel	result, xzr, result, eq
166	ret
167
168.Lzero_length:
169	mov	result, #0
170	ret
171
172	.size	memchr, . - memchr
173