1/**************************************************************
2  Copyright (c) 2019 Huawei Technologies Co., Ltd.
3
4  Redistribution and use in source and binary forms, with or without
5  modification, are permitted provided that the following conditions
6  are met:
7    * Redistributions of source code must retain the above copyright
8      notice, this list of conditions and the following disclaimer.
9    * Redistributions in binary form must reproduce the above copyright
10      notice, this list of conditions and the following disclaimer in
11      the documentation and/or other materials provided with the
12      distribution.
13    * Neither the name of Huawei Corporation nor the names of its
14      contributors may be used to endorse or promote products derived
15      from this software without specific prior written permission.
16
17  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28**********************************************************************/
29
.text

.global gf_4vect_mad_neon
.type gf_4vect_mad_neon, %function


/* arguments (AAPCS64: first args in x0-x7) */
x_len		.req	x0	/* length in bytes; must be >= 16 */
x_vec		.req	x1	/* number of source vectors (table stride count) */
x_vec_i		.req	x2	/* index of this source vector into the tables */
x_tbl		.req	x3	/* base of the GF multiply lookup tables (32 B per entry) */
x_src		.req	x4	/* source buffer */
x_dest		.req	x5	/* array of 4 destination buffer pointers */

/* returns */
w_ret		.req	w0

/* local variables */
x_src_end	.req	x6	/* end-of-input sentinel used by the loop compares */
x_dest1		.req	x7
x_dest2		.req	x8
x_dest3		.req	x9
x_dest4		.req	x_dest	/* reuses x5 once the 4 pointers are loaded from it */
x_tmp		.req	x10
x_tbl1		.req	x11
x_tbl2		.req	x12
x_tbl3		.req	x13
x_tbl4		.req	x14
x_const		.req	x15	/* address into const_tbl for the tail mask */

/* vectors */
v_mask0f	.req	v0	/* 16 x 0x0f, splits each byte into nibbles */
v_tmp_lo	.req	v1
v_tmp_hi	.req	v2
v_tmp		.req	v3
q_tmp		.req	q3

/* per-destination GF nibble lookup tables (low/high 16-entry TBLs) */
v_gft1_lo	.req	v4
v_gft1_hi	.req	v5
v_gft2_lo	.req	v6
v_gft2_hi	.req	v7
v_gft3_lo	.req	v16
v_gft3_hi	.req	v17
v_gft4_lo	.req	v18
v_gft4_hi	.req	v19
q_gft1_lo	.req	q4
q_gft1_hi	.req	q5
q_gft2_lo	.req	q6
q_gft2_hi	.req	q7
q_gft3_lo	.req	q16
q_gft3_hi	.req	q17
q_gft4_lo	.req	q18
q_gft4_hi	.req	q19

/* v8-v15 are callee-saved under AAPCS64 (low 64 bits); the 64-byte loop
 * saves/restores d8-d15 on the stack before using these */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11

v_data_0_lo	.req	v12
v_data_1_lo	.req	v13
v_data_2_lo	.req	v14
v_data_3_lo	.req	v15
/* _hi aliases reuse the data registers: the ushr that produces the high
 * nibbles overwrites the original data in place */
v_data_0_hi	.req	v_data_0
v_data_1_hi	.req	v_data_1
v_data_2_hi	.req	v_data_2
v_data_3_hi	.req	v_data_3

v_d1_0		.req	v20
v_d1_1		.req	v21
v_d1_2		.req	v22
v_d1_3		.req	v23
v_d2_0		.req	v24
v_d2_1		.req	v25
v_d2_2		.req	v26
v_d2_3		.req	v27
v_d3_0		.req	v28
v_d3_1		.req	v29
v_d3_2		.req	v30
v_d3_3		.req	v31
q_d1_0		.req	q20
q_d1_1		.req	q21
q_d1_2		.req	q22
q_d1_3		.req	q23
q_d2_0		.req	q24
q_d2_1		.req	q25
q_d2_2		.req	q26
q_d2_3		.req	q27
q_d3_0		.req	q28
q_d3_1		.req	q29
q_d3_2		.req	q30
q_d3_3		.req	q31

/* dest4 accumulators alias dest1's: dest1 results are stored back to
 * memory before dest4's are loaded, so the registers are free by then */
v_d4_0		.req	v_d1_0
v_d4_1		.req	v_d1_1
v_d4_2		.req	v_d1_2
v_d4_3		.req	v_d1_3
q_d4_0		.req	q_d1_0
q_d4_1		.req	q_d1_1
q_d4_2		.req	q_d1_2
q_d4_3		.req	q_d1_3

/* single-block (16 B) loop aliases; only used outside the 64 B loop */
v_data		.req	v21
q_data		.req	q21
v_data_lo	.req	v22
v_data_hi	.req	v23
140
/*---------------------------------------------------------------------
 * int gf_4vect_mad_neon(int len, int vec, int vec_i,
 *                       unsigned char *gftbls, unsigned char *src,
 *                       unsigned char **dest);
 *
 * GF(2^8) multiply-and-add: for one source vector, multiply each byte
 * by four field constants (one per destination) via 4-bit TBL lookups
 * and XOR the products into the four destination buffers.
 *
 * In:     x0 = len (bytes), x1 = vec, x2 = vec_i,
 *         x3 = gftbls (32 B per table entry: 16 B low-nibble LUT,
 *              16 B high-nibble LUT), x4 = src, x5 = dest[4]
 * Out:    w0 = 0 on success, 1 if len < 16
 * Clobb:  x6-x15, v0-v7, v16-v31, flags; d8-d15 saved/restored around
 *         the 64-byte loop (AAPCS64 callee-saved low halves)
 *-------------------------------------------------------------------*/
gf_4vect_mad_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.return_fail

	movi	v_mask0f.16b, #0x0f
	/* table entries are 32 bytes, so scale indices by 32 */
	lsl	x_vec_i, x_vec_i, #5
	lsl	x_vec, x_vec, #5
	/* x_tblN = &gftbls[(vec_i + (N-1)*vec) * 32] */
	add	x_tbl1, x_tbl, x_vec_i
	add	x_tbl2, x_tbl1, x_vec
	add	x_tbl3, x_tbl2, x_vec
	add	x_tbl4, x_tbl3, x_vec
	add	x_src_end, x_src, x_len
	ldr	x_dest1, [x_dest, #8*0]
	ldr	x_dest2, [x_dest, #8*1]
	ldr	x_dest3, [x_dest, #8*2]
	ldr	x_dest4, [x_dest, #8*3]
	/* load the low/high nibble LUTs for all four destinations;
	 * they stay resident in v4-v7/v16-v19 for the whole call */
	ldr	q_gft1_lo, [x_tbl1]
	ldr	q_gft1_hi, [x_tbl1, #16]
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]
	ldr	q_gft3_lo, [x_tbl3]
	ldr	q_gft3_hi, [x_tbl3, #16]
	ldr	q_gft4_lo, [x_tbl4]
	ldr	q_gft4_hi, [x_tbl4, #16]

.Lloop64_init:
	/* less than 64 bytes, goto Lloop16_init */
	cmp	x_len, #64
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack (callee-saved under AAPCS64) */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* bias the end pointer so the loop test covers a full 64 B block */
	sub	x_src_end, x_src_end, #64

/* main loop: 4 x 16 B of source against all 4 destinations per pass */
.Lloop64:
	ldr	q_data_0, [x_src, #16*0]
	ldr	q_data_1, [x_src, #16*1]
	ldr	q_data_2, [x_src, #16*2]
	ldr	q_data_3, [x_src, #16*3]
	add	x_src, x_src, #64

	ldr	q_d1_0, [x_dest1, #16*0]
	ldr	q_d1_1, [x_dest1, #16*1]
	ldr	q_d1_2, [x_dest1, #16*2]
	ldr	q_d1_3, [x_dest1, #16*3]

	ldr	q_d2_0, [x_dest2, #16*0]
	ldr	q_d2_1, [x_dest2, #16*1]
	ldr	q_d2_2, [x_dest2, #16*2]
	ldr	q_d2_3, [x_dest2, #16*3]

	/* split each data byte into low nibble ... */
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b

	/* ... and high nibble (overwrites v_data_N in place, see aliases) */
	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4

	/* dest1: product = LUT_lo[lo nibble] ^ LUT_hi[hi nibble];
	 * accumulate with XOR (GF addition) into the dest bytes */
	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
	eor	v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
	eor	v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
	eor	v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
	eor	v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
	eor	v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
	eor	v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
	eor	v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b

	/* dest2 */
	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
	eor	v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
	eor	v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
	eor	v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
	eor	v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
	eor	v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
	eor	v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
	eor	v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b

	/* store dest1/dest2 now: frees v20-v23 (aliased by v_d4_*) before
	 * dest4's accumulators are loaded below */
	str	q_d1_0, [x_dest1, #16*0]
	str	q_d1_1, [x_dest1, #16*1]
	str	q_d1_2, [x_dest1, #16*2]
	str	q_d1_3, [x_dest1, #16*3]
	add	x_dest1, x_dest1, #64

	str	q_d2_0, [x_dest2, #16*0]
	str	q_d2_1, [x_dest2, #16*1]
	str	q_d2_2, [x_dest2, #16*2]
	str	q_d2_3, [x_dest2, #16*3]
	add	x_dest2, x_dest2, #64

	ldr	q_d3_0, [x_dest3, #16*0]
	ldr	q_d3_1, [x_dest3, #16*1]
	ldr	q_d3_2, [x_dest3, #16*2]
	ldr	q_d3_3, [x_dest3, #16*3]

	ldr	q_d4_0, [x_dest4, #16*0]
	ldr	q_d4_1, [x_dest4, #16*1]
	ldr	q_d4_2, [x_dest4, #16*2]
	ldr	q_d4_3, [x_dest4, #16*3]

	/* dest3 */
	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
	eor	v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
	eor	v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
	eor	v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
	eor	v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
	eor	v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
	eor	v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
	eor	v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b

	/* dest4 */
	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
	eor	v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
	eor	v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
	eor	v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
	eor	v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
	eor	v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
	eor	v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
	eor	v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
	eor	v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b

	str	q_d3_0, [x_dest3, #16*0]
	str	q_d3_1, [x_dest3, #16*1]
	str	q_d3_2, [x_dest3, #16*2]
	str	q_d3_3, [x_dest3, #16*3]
	add	x_dest3, x_dest3, #64

	str	q_d4_0, [x_dest4, #16*0]
	str	q_d4_1, [x_dest4, #16*1]
	str	q_d4_2, [x_dest4, #16*2]
	str	q_d4_3, [x_dest4, #16*3]
	add	x_dest4, x_dest4, #64

	/* loop while another full 64 B block fits (end is biased by -64) */
	cmp	x_src, x_src_end
	bls	.Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 */
	ldp	d8,  d9,  [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64
	/* undo the -64 bias so x_src_end is the true end again */
	add	x_src_end, x_src_end, #64

.Lloop16_init:
	/* bias end by -16 so the loop test covers one full 16 B block */
	sub	x_src_end, x_src_end, #16
	cmp	x_src, x_src_end
	bhi	.lessthan16_init

/* 16-bytes-at-a-time loop for the 16..63 B remainder */
.Lloop16:
	ldr	q_data, [x_src]

	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor	v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]
	ldr	q_d4_0, [x_dest4]

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	eor	v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
	eor	v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
	eor	v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

	str	q_d3_0, [x_dest3]
	str	q_d4_0, [x_dest4]

	add	x_src, x_src, #16
	add	x_dest1, x_dest1, #16
	add	x_dest2, x_dest2, #16
	add	x_dest3, x_dest3, #16
	add	x_dest4, x_dest4, #16
	cmp	x_src, x_src_end
	bls	.Lloop16

.lessthan16_init:
	/* x_tmp = bytes already covered past the last aligned block
	 * (x_src_end is still biased by -16 here); if x_tmp == 16 the
	 * length was an exact multiple of 16 and nothing remains */
	sub	x_tmp, x_src, x_src_end
	cmp	x_tmp, #16
	beq	.return_pass

/* tail of 1..15 bytes: rewind all pointers by x_tmp so a full 16 B
 * block ending exactly at the buffer end can be processed, then mask
 * out the leading x_tmp bytes that were already handled */
.lessthan16:
	mov	x_src, x_src_end
	sub	x_dest1, x_dest1, x_tmp
	sub	x_dest2, x_dest2, x_tmp
	sub	x_dest3, x_dest3, x_tmp
	sub	x_dest4, x_dest4, x_tmp

	/* load mask from const_tbl + 16 - x_tmp: gives x_tmp zero bytes
	 * followed by 0xff bytes, so the EOR below only touches the
	 * not-yet-processed tail bytes */
	ldr	x_const, =const_tbl
	sub	x_const, x_const, x_tmp
	ldr	q_tmp, [x_const, #16]

	ldr	q_data, [x_src]
	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]
	ldr	q_d4_0, [x_dest4]

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

	str	q_d3_0, [x_dest3]
	str	q_d4_0, [x_dest4]

.return_pass:
	mov	w_ret, #0
	ret

.return_fail:
	mov	w_ret, #1
	ret
450
/* 32-byte mask source for the sub-16-byte tail: 16 zero bytes followed
 * by 16 0xff bytes. Loading 16 bytes at const_tbl + 16 - x_tmp yields
 * x_tmp leading zeros (bytes to leave untouched) then 0xff. Read-only,
 * so it belongs in .rodata rather than the writable .data section. */
.section .rodata
.balign 8
const_tbl:
	.dword 0x0000000000000000, 0x0000000000000000
	.dword 0xffffffffffffffff, 0xffffffffffffffff
456