/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */

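/* Implements ISO C memset: dstin (x0) is the destination, valw (w1) the
   fill byte and count (x2) the length; x0 is returned unchanged.  Sizes
   of 0..15 bytes use scalar stores from both ends, 16..96 bytes use
   overlapping SIMD stores, and larger sizes run a 64-byte store loop,
   switching to DC ZVA for zero fills of at least 256 bytes when the
   zero-block size permits.  */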

#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define tmp1	x5
#define tmp1w	w5
#define tmp2	x6
#define tmp2w	w6
#define zva_len x7
#define zva_lenw w7

#define L(l) .L ## l

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm
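/* def_fn declares \f as a global function symbol in .text, aligned to a
   2^p2align byte boundary, so memset below starts on a 64-byte boundary.  */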

def_fn memset p2align=6

	dup	v0.16B, valw
	add	dstend, dstin, count

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	mov	val, v0.D[0]

	/* Set 0..15 bytes.  */
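	/* A store from dstin and an overlapping store back from dstend
	   cover the whole range without a byte loop: bit 3 of count
	   selects 8-byte stores, bit 2 selects 4-byte stores, and the
	   remaining 0..3 bytes use at most a byte store and a halfword
	   store.  */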
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret
	nop
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	/* Set 16..96 bytes.  */
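	/* A 16-byte store from each end covers 16..32 bytes; when bit 5
	   of count is set a second overlapping pair extends this to 63
	   bytes, and bit 6 diverts to L(set96) for 64..96 bytes.  */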
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)
	str	q0, [dstend, -16]
	tbz	count, 5, 1f
	str	q0, [dstin, 16]
	str	q0, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
	nop
L(set_long):
	and	valw, valw, 255
	bic	dst, dstin, 15
	str	q0, [dstin]
	cmp	count, 256
	ccmp	valw, 0, 0, cs
	b.eq	L(try_zva)
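	/* The cmp/ccmp pair folds two tests into one branch: the DC ZVA
	   path is only taken when count is at least 256 and the fill
	   value is zero, since DC ZVA can only write zeros.  Everything
	   else falls through to the plain 64-byte store loop below.  */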
L(no_zva):
	sub	count, dstend, dst	/* Count is 16 too large.  */
	add	dst, dst, 16
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
L(tail64):
	subs	count, count, 64
	b.hi	1b
2:	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
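	/* DCZID_EL0 bit 4 (DZP) set means DC ZVA is prohibited; bits 3:0
	   hold log2 of the block size in 4-byte words, so the size in
	   bytes is 4 << (dczid_el0 & 15).  The common 64- and 128-byte
	   sizes get dedicated paths; other sizes go to L(zva_other).  */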
L(try_zva):
	mrs	tmp1, dczid_el0
	tbnz	tmp1w, 4, L(no_zva)
	and	tmp1w, tmp1w, 15
	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
	b.ne	L(zva_128)

	/* Write the first and last 64 byte aligned block using stp rather
	   than using DC ZVA.  This is faster on some cores.
	 */
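	/* The stores before and after the bic fill everything below the
	   first block cleared by DC ZVA, and the trailing stp pairs fill
	   everything after the last one, so the loop only needs to touch
	   whole 64-byte aligned blocks.  */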
L(zva_64):
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	sub	count, dstend, dst	/* Count is now 128 too large.  */
	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
	nop
1:	dc	zva, dst
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	1b
	stp	q0, q0, [dst, 0]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
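	/* Same idea as L(zva_64), but aligning to and clearing 128-byte
	   blocks; the head stores cover the first partial block and four
	   stp instructions relative to dstend cover the last 128 bytes.  */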
L(zva_128):
	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
	b.ne	L(zva_other)

	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	bic	dst, dst, 127
	sub	count, dstend, dst	/* Count is now 128 too large.  */
	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
1:	dc	zva, dst
	add	dst, dst, 128
	subs	count, count, 128
	b.hi	1b
	stp	q0, q0, [dstend, -128]
	stp	q0, q0, [dstend, -96]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

L(zva_other):
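	/* Generic block sizes: zva_len = 4 << (dczid_el0 & 15).  Fall back
	   to L(no_zva) when count is too small to be worth aligning.
	   Otherwise store 64 bytes at a time from dst + 16 up to the first
	   zva_len-aligned address (writing past it is harmless, DC ZVA
	   rewrites those bytes), clear whole blocks with DC ZVA, and
	   finish the remainder through L(tail64).  */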
	mov	tmp2w, 4
	lsl	zva_lenw, tmp2w, tmp1w
	add	tmp1, zva_len, 64	/* Max alignment bytes written.  */
	cmp	count, tmp1
	blo	L(no_zva)

	sub	tmp2, zva_len, 1
	add	tmp1, dst, zva_len
	add	dst, dst, 16
	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
	beq	2f
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
	subs	count, count, 64
	b.hi	1b
2:	mov	dst, tmp1
	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
	subs	count, count, zva_len
	b.lo	4f
3:	dc	zva, dst
	add	dst, dst, zva_len
	subs	count, count, zva_len
	b.hs	3b
4:	add	count, count, zva_len
	b	L(tail64)
	.size	memset, . - memset