1/**************************************************************
2  Copyright (c) 2019 Huawei Technologies Co., Ltd.
3
4  Redistribution and use in source and binary forms, with or without
5  modification, are permitted provided that the following conditions
6  are met:
7    * Redistributions of source code must retain the above copyright
8      notice, this list of conditions and the following disclaimer.
9    * Redistributions in binary form must reproduce the above copyright
10      notice, this list of conditions and the following disclaimer in
11      the documentation and/or other materials provided with the
12      distribution.
13    * Neither the name of Huawei Corporation nor the names of its
14      contributors may be used to endorse or promote products derived
15      from this software without specific prior written permission.
16
17  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28**********************************************************************/
.text

.global gf_3vect_mad_neon
.type gf_3vect_mad_neon, %function


/* arguments */
x_len		.req	x0	/* length in bytes; must be >= 16 */
x_vec		.req	x1	/* number of source vectors (table stride) */
x_vec_i		.req	x2	/* index of this source vector in the tables */
x_tbl		.req	x3	/* base of GF mul tables, 32 bytes per entry */
x_src		.req	x4	/* source buffer */
x_dest		.req	x5	/* array of 3 destination pointers */

/* returns */
w_ret		.req	w0	/* 0 = pass, 1 = fail (len too short) */

/* local variables */
x_src_end	.req	x6
x_dest1		.req	x7
x_dest2		.req	x8
/* x_dest (x5) is dead once the three dest pointers are loaded,
 * so its register is reused for the third destination pointer. */
x_dest3		.req	x_dest
x_tmp		.req	x10
x_tbl1		.req	x11
x_tbl2		.req	x12
x_tbl3		.req	x13
x_const		.req	x14

/* vectors */
v_mask0f	.req	v0	/* 16 x 0x0f, for splitting bytes into nibbles */
v_tmp_lo	.req	v1
v_tmp_hi	.req	v2
v_tmp		.req	v3
q_tmp		.req	q3

v_gft1_lo	.req	v4
v_gft1_hi	.req	v5
v_gft2_lo	.req	v6
v_gft2_hi	.req	v7
v_gft3_lo	.req	v16
v_gft3_hi	.req	v17
q_gft1_lo	.req	q4
q_gft1_hi	.req	q5
q_gft2_lo	.req	q6
q_gft2_hi	.req	q7
q_gft3_lo	.req	q16
q_gft3_hi	.req	q17

/* NOTE: v8-v15 are callee-saved under AAPCS64 (low 64 bits);
 * the 64-byte loop saves/restores d8-d15 around their use. */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11

v_data_0_lo	.req	v12
v_data_1_lo	.req	v13
v_data_2_lo	.req	v14
v_data_3_lo	.req	v15
/* high nibbles are shifted in place, overwriting the raw data regs */
v_data_0_hi	.req	v_data_0
v_data_1_hi	.req	v_data_1
v_data_2_hi	.req	v_data_2
v_data_3_hi	.req	v_data_3

v_d1_0		.req	v20
v_d1_1		.req	v21
v_d1_2		.req	v22
v_d1_3		.req	v23
v_d2_0		.req	v24
v_d2_1		.req	v25
v_d2_2		.req	v26
v_d2_3		.req	v27
v_d3_0		.req	v28
v_d3_1		.req	v29
v_d3_2		.req	v30
v_d3_3		.req	v31
q_d1_0		.req	q20
q_d1_1		.req	q21
q_d1_2		.req	q22
q_d1_3		.req	q23
q_d2_0		.req	q24
q_d2_1		.req	q25
q_d2_2		.req	q26
q_d2_3		.req	q27
q_d3_0		.req	q28
q_d3_1		.req	q29
q_d3_2		.req	q30
q_d3_3		.req	q31

/* 16-byte-path aliases; they overlap v21-v23 deliberately, since the
 * 64-byte-loop names above are not live in the 16-byte/tail paths. */
v_data		.req	v21
q_data		.req	q21
v_data_lo	.req	v22
v_data_hi	.req	v23
/*
 * gf_3vect_mad_neon: GF(2^8) multiply-and-add of one source region into
 * three destination regions using NEON TBL nibble lookups:
 *   dest_k[i] ^= gf_mul(gf_k, src[i])   for k = 1..3
 *
 * In:   x0 = len    byte count (must be >= 16)
 *       x1 = vec    number of source vectors (table array stride)
 *       x2 = vec_i  index of this source vector within the table array
 *       x3 = tbl    GF mul tables; 32 bytes (lo/hi nibble LUTs) per entry
 *       x4 = src    source buffer
 *       x5 = dest   array of 3 destination pointers
 * Out:  w0 = 0 on success, 1 if len < 16
 * Clobbers: x6-x14, v0-v7, v16-v31, flags.
 *           d8-d15 (callee-saved) are saved/restored around the 64B loop.
 */
gf_3vect_mad_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.return_fail

	movi	v_mask0f.16b, #0x0f
	/* each (vector, dest) table entry is 32 bytes -> shift by 5 */
	lsl	x_vec_i, x_vec_i, #5
	lsl	x_vec, x_vec, #5
	add	x_tbl1, x_tbl, x_vec_i		/* tables for dest1 */
	add	x_tbl2, x_tbl1, x_vec		/* tables for dest2 */
	add	x_tbl3, x_tbl2, x_vec		/* tables for dest3 */
	add	x_src_end, x_src, x_len
	ldr	x_dest1, [x_dest]
	ldr	x_dest2, [x_dest, #8]
	ldr	x_dest3, [x_dest, #16]		/* note: x_dest3 aliases x_dest */
	/* load the six 16-byte nibble lookup tables once, up front */
	ldr	q_gft1_lo, [x_tbl1]
	ldr	q_gft1_hi, [x_tbl1, #16]
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]
	ldr	q_gft3_lo, [x_tbl3]
	ldr	q_gft3_hi, [x_tbl3, #16]

.Lloop64_init:
	/* less than 64 bytes, goto Lloop16_init */
	cmp	x_len, #64
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack (callee-saved under AAPCS64) */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	sub	x_src_end, x_src_end, #64	/* last position a full 64B step fits */

.Lloop64:
	/* main loop: 64 bytes (4 x 16B lanes) per iteration */
	ldr	q_data_0, [x_src, #16*0]
	ldr	q_data_1, [x_src, #16*1]
	ldr	q_data_2, [x_src, #16*2]
	ldr	q_data_3, [x_src, #16*3]
	add	x_src, x_src, #64

	ldr	q_d1_0, [x_dest1, #16*0]
	ldr	q_d1_1, [x_dest1, #16*1]
	ldr	q_d1_2, [x_dest1, #16*2]
	ldr	q_d1_3, [x_dest1, #16*3]

	ldr	q_d2_0, [x_dest2, #16*0]
	ldr	q_d2_1, [x_dest2, #16*1]
	ldr	q_d2_2, [x_dest2, #16*2]
	ldr	q_d2_3, [x_dest2, #16*3]

	ldr	q_d3_0, [x_dest3, #16*0]
	ldr	q_d3_1, [x_dest3, #16*1]
	ldr	q_d3_2, [x_dest3, #16*2]
	ldr	q_d3_3, [x_dest3, #16*3]

	/* split each data byte into low and high nibbles for TBL lookup */
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b

	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4

	/* dest1 ^= gf_mul(gf1, data): mul(lo) XOR mul(hi), per lane */
	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
	eor	v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
	eor	v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
	eor	v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
	eor	v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
	eor	v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
	eor	v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
	eor	v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b

	/* dest2 ^= gf_mul(gf2, data) */
	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
	eor	v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
	eor	v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
	eor	v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
	eor	v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
	eor	v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
	eor	v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
	eor	v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b

	/* dest3 ^= gf_mul(gf3, data) */
	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
	eor	v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
	eor	v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
	eor	v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
	eor	v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
	eor	v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
	eor	v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
	eor	v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b

	str	q_d1_0, [x_dest1, #16*0]
	str	q_d1_1, [x_dest1, #16*1]
	str	q_d1_2, [x_dest1, #16*2]
	str	q_d1_3, [x_dest1, #16*3]
	add	x_dest1, x_dest1, #64

	str	q_d2_0, [x_dest2, #16*0]
	str	q_d2_1, [x_dest2, #16*1]
	str	q_d2_2, [x_dest2, #16*2]
	str	q_d2_3, [x_dest2, #16*3]
	add	x_dest2, x_dest2, #64

	str	q_d3_0, [x_dest3, #16*0]
	str	q_d3_1, [x_dest3, #16*1]
	str	q_d3_2, [x_dest3, #16*2]
	str	q_d3_3, [x_dest3, #16*3]
	add	x_dest3, x_dest3, #64

	cmp	x_src, x_src_end
	bls	.Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 */
	ldp	d8,  d9,  [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64
	add	x_src_end, x_src_end, #64	/* undo the #64 bias */

.Lloop16_init:
	sub	x_src_end, x_src_end, #16	/* last position a full 16B step fits */
	cmp	x_src, x_src_end
	bhi	.lessthan16_init

.Lloop16:
	/* 16 bytes per iteration, same nibble-lookup scheme as above */
	ldr	q_data, [x_src]

	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor	v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	eor	v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	str	q_d3_0, [x_dest3]

	add	x_src, x_src, #16
	add	x_dest1, x_dest1, #16
	add	x_dest2, x_dest2, #16
	add	x_dest3, x_dest3, #16
	cmp	x_src, x_src_end
	bls	.Lloop16

.lessthan16_init:
	/* x_tmp = bytes of the final 16B window already processed (0..16) */
	sub	x_tmp, x_src, x_src_end
	cmp	x_tmp, #16
	beq	.return_pass

.lessthan16:
	/* Tail (< 16 bytes left): re-read the last full 16-byte window
	 * ending at the buffer end, and mask the update so the x_tmp
	 * already-processed (overlapping) dest bytes are not changed. */
	mov	x_src, x_src_end
	sub	x_dest1, x_dest1, x_tmp
	sub	x_dest2, x_dest2, x_tmp
	sub	x_dest3, x_dest3, x_tmp

	/* PIC-safe address of const_tbl (was: ldr x_const, =const_tbl,
	 * which needs an absolute literal-pool relocation) */
	adrp	x_const, const_tbl
	add	x_const, x_const, :lo12:const_tbl
	sub	x_const, x_const, x_tmp
	/* mask: x_tmp leading 0x00 bytes, then (16 - x_tmp) 0xff bytes */
	ldr	q_tmp, [x_const, #16]

	ldr	q_data, [x_src]
	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	/* compute the full 16B update, mask it, then XOR into dest */
	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	str	q_d3_0, [x_dest3]

.return_pass:
	mov	w_ret, #0
	ret

.return_fail:
	mov	w_ret, #1
	ret

.size gf_3vect_mad_neon, . - gf_3vect_mad_neon
376
/* 32-byte mask table: 16 x 0x00 followed by 16 x 0xff.
 * The tail path loads 16 bytes at (const_tbl + 16 - x_tmp) to build a
 * per-byte merge mask.  The table is never written, so it belongs in
 * read-only .rodata rather than writable .data (W^X, page sharing). */
.section .rodata
.balign 8
const_tbl:
	.dword 0x0000000000000000, 0x0000000000000000
	.dword 0xffffffffffffffff, 0xffffffffffffffff
382